Example #1
1
def show_options(id):
    r = requests.get("https://interaktiv.mx.dk/toolbox/" + votetype + "/get/" + id)
    soup2 = BeautifulSoup(r.text, "lxml")

    clear_console()
    print_logo()
    print "(Interaktiv version. Kør scriptet med -h eller --help for flere indstillinger.)"
    print

    vote_text = soup2.find("div", attrs={"id": "vote_text"}).text
    print vote_text
    print

    if votetype == "advancedvotes":
        for option in soup2.find_all("div", attrs={"class": "vote_button"}):

            number = option.get("data-vote")
            text = option.text

            print "(%s) %s" % (number, text)
        print

    else:

        for option in soup2.find_all("div", attrs={"class": "vote_button"}):
            if option.get("id") == "vote_yes":
                number = "1"

            else:
                number = "0"

            text = option.text
            print "(%s) %s" % (number, text)
        print
Example #2
1
def replace_links_with_text(html):
    """any absolute links will be replaced with the
    url in plain text, same with any img tags
    """
    soup = BeautifulSoup(html, 'html5lib')
    abs_url_re = r'^http(s)?://'

    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        if url == '' or re.match(abs_url_re, url):
            image.replaceWith(format_url_replacement(url, text))

    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''

        if text == '':  # this is due to an issue with url inlining in comments
            link.replaceWith('')
        elif url == '' or re.match(abs_url_re, url):
            link.replaceWith(format_url_replacement(url, text))

    return force_text(soup.find('body').renderContents(), 'utf-8')
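Example #2 above (and Example #6 below) call a format_url_replacement helper that is not included in these excerpts; the sketch below is only a guess at what such a helper might look like, not the project's actual implementation.

def format_url_replacement(url, text):
    """Hypothetical stand-in: render a link or image as plain text."""
    url = url.strip()
    text = text.strip()
    if url and text and url != text:
        return '%s (%s)' % (text, url)
    return url or text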
Example #3
0
def write_into_text_nw(url):
    order_links = get_hyper_links(url, 'sxgzjl/2015')
    for link in order_links:
        link = NWORIGINURL + link
        data = get_page_source(link)
        if data[0] == "200":
            soup = BeautifulSoup(data[1])
            try:
                for page_title in soup.find_all("b"):
                    if page_title.string is not None:
                        pre_title = page_title.string + '.txt'
                        pre_title = pre_title.replace("\r\n", "")
                        print pre_title
                        html_file = codecs.open(pre_title, 'wb', 'utf-8') 
                        html_file.write(unicode(page_title.string))  
                        html_file.write('\n')
                for page_content in soup.find_all("p"):
                    page_content_unicode = unicode(page_content.get_text())
                    html_file.write(page_content_unicode)   
                    html_file.write('\n') 
                html_file.close()  

            except Exception,e:
                print str(e)
                # return None     
                continue      
Example #4
0
def from_pmml(self, pmml):
    """Returns a model with the intercept and coefficients represented in PMML file."""

    model = self()
    
    # Reads the input PMML file with BeautifulSoup.
    with open(pmml, "r") as f:
        lm_soup = BeautifulSoup(f, "xml")

    if not lm_soup.RegressionTable:
        raise ValueError("RegressionTable not found in the input PMML file.")

    else:
    ##### DO I WANT TO PULL THIS OUT AS ITS OWN FUNCTION? #####
        # Pulls out intercept from the PMML file and assigns it to the
        # model. If the intercept does not exist, assign it to zero.
        intercept = 0
        if "intercept" in lm_soup.RegressionTable.attrs:
            intercept = lm_soup.RegressionTable['intercept']
        model.intercept_ = float(intercept)

        # Pulls out coefficients from the PMML file, and assigns them
        # to the model.
        if not lm_soup.find_all('NumericPredictor'):
            raise ValueError("NumericPredictor not found in the input PMML file.")
        else:
            coefs = []
            numeric_predictors = lm_soup.find_all('NumericPredictor')
            for i in numeric_predictors:
                i_coef = float(i['coefficient'])
                coefs.append(i_coef)
            model.coef_ = numpy.array(coefs)
            
    return model
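The parser above expects a particular PMML shape; the fragment below is a minimal, made-up example of that shape (it assumes lxml is installed, since BeautifulSoup's "xml" parser relies on it), together with the two lookups the method performs.

from bs4 import BeautifulSoup

# minimal, hypothetical PMML fragment of the shape from_pmml() reads
sample_pmml = """
<PMML>
  <RegressionModel>
    <RegressionTable intercept="1.5">
      <NumericPredictor name="x1" coefficient="0.25"/>
      <NumericPredictor name="x2" coefficient="-0.75"/>
    </RegressionTable>
  </RegressionModel>
</PMML>
"""

soup = BeautifulSoup(sample_pmml, "xml")
print(float(soup.RegressionTable["intercept"]))  # 1.5
print([float(p["coefficient"]) for p in soup.find_all("NumericPredictor")])
# [0.25, -0.75]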
Example #5
0
def search(engine, searchurl, Length):
	print u'正在分析搜索结果页面'
	if engine=="Bing":
		only_a_tags = SoupStrainer("li", class_="b_algo")
		a = urllib.urlopen('http://cn.bing.com/search?q="'+searchurl+'"').read()
	elif engine == "Baidu":
		only_a_tags = SoupStrainer("h3")
		a = urllib.urlopen('http://www.baidu.com/s?wd="'+searchurl+'"&rn='+Length).read()
	elif engine == "Google":
		only_a_tags = SoupStrainer("h3")
		a = urllib.urlopen('http://www.google.com/search?num='+Length+'&q="'+searchurl+'"').read()
	html = cleaner.clean_html(a)
	soup = BeautifulSoup(html, "lxml", parse_only=only_a_tags)
	if engine=="Bing":
		tag = soup.find_all("li", class_="b_algo")
	elif engine == "Baidu" or engine == "Google":
		tag = soup.find_all("h3")
	group = []
	for item in tag:
		group.append(item.a)
	if len(group)==0:
		return [['/'],[u'您搜索的关键字没有结果。点击此链接返回首页。']]
	else:
		url=['' for col in range(len(group))]
		title=['' for col in range(len(group))]
		for k in range(0,len(group)):
			if engine=="Bing":
				url[k]=group[k]['href']
			elif engine == "Baidu":
				url[k]='http:'+group[k]['href']
			elif engine == "Google":
				url[k]='https://www.google.com'+group[k]['href']
			title[k]=group[k].get_text().strip()
		print u'分析页面已完成。共有', len(group),u'个页面需要提取'
		return [url, title]
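Example #5 narrows parsing to the tags of interest with a SoupStrainer before calling find_all; a small standalone sketch of that pattern, independent of the search engines above:

from bs4 import BeautifulSoup, SoupStrainer

# standalone illustration of parse_only, not taken from the original code
html = '<div><h3><a href="/a">first</a></h3><p>ignored</p><h3><a href="/b">second</a></h3></div>'
only_h3 = SoupStrainer("h3")
soup = BeautifulSoup(html, "html.parser", parse_only=only_h3)
print(soup.find_all("h3"))
# [<h3><a href="/a">first</a></h3>, <h3><a href="/b">second</a></h3>]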
Example #6
0
def get_text_from_html(html_text):
    """Returns the content part from an HTML document
    retains links and references to images and line breaks.
    """
    soup = BeautifulSoup(html_text, 'html5lib')

    # replace <a> links with plain text
    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''
        link.replaceWith(format_url_replacement(url, text))

    # replace <img> tags with plain text
    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        image.replaceWith(format_url_replacement(url, text))

    # extract and join phrases
    body_element = soup.find('body')
    filter_func = lambda s: bool(s.strip())
    phrases = map(
        lambda s: s.strip(),
        filter(filter_func, body_element.get_text().split('\n'))
    )
    return '\n\n'.join(phrases)
Example #7
0
def moderate_tags(html):
    """replaces instances of <a> and <img>
    with "item in moderation" alerts
    """
    from askbot.conf import settings
    soup = BeautifulSoup(html, 'html5lib')
    replaced = False
    if settings.MODERATE_LINKS:
        links = soup.find_all('a')
        if links:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            map(lambda v: v.replaceWith(aviso), links)
            replaced = True

    if settings.MODERATE_IMAGES:
        images = soup.find_all('img')
        if images:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            map(lambda v: v.replaceWith(aviso), images)
            replaced = True

    if replaced:
        return force_text(soup.find('body').renderContents(), 'utf-8')

    return html
Example #8
0
def spider_search(url):
    print url
    out_name = url.split('|')[1]
    url = url.split('|')[0]

    r = requests.get(url)
    if r.status_code == 200:
        pass
    else:
        print '访问页面错误: ' + r.status_code
        return
    source = BeautifulSoup(r.content, 'html.parser')

    # save the source page
    with codecs.open(out_name+'.html', 'w', 'utf-8') as f:
        f.write(source.prettify())

    content = source.find_all('a', target='_blank')
    links = source.find_all('a')

    for item in links:
        if 'title' in item.attrs and 'href' in item.attrs:
            name = item['title']
            link = url + item['href'] + '/articles/' + '|' + name
            global red
            red.lpush(settings.DBname, link)

    spider_content_search(content, out_name+'.baike')
Example #9
0
    def parse(self, response):
        url = response.url
        _type = self.get_type_from_url(url)
        items = []
        try:
            response = response.body
            soup = BeautifulSoup(response)
            links = soup.find_all(class_=re.compile('post-area'))
        except:
            items.append(self.make_requests_from_url(url))
            log.msg("Page " + url + " parse ERROR, try again !", level=log.ERROR)
            return items
        need_parse_next_page = True
        if len(links) > 0:
            for i in range(0, len(links)):
                url_news = 'http://www.nanzao.com' + links[i].h2.a['href']
                title = links[i].h2.a.text.strip()
                day = links[i].time['datetime'].replace('-', '')
                need_parse_next_page = self.is_news_not_saved(title, url_news)
                if not need_parse_next_page:
                    break
                items.append(self.make_requests_from_url(url_news).replace(callback=self.parse_news, meta={'_type': _type, 'day': day, 'title': title}))
            if u'下一頁>' in soup.find(class_='paging').text:
                page_next = 'http://www.nanzao.com' + soup.find_all("a", text=u"下一頁>")[0]['href']
                if need_parse_next_page:
                    items.append(self.make_requests_from_url(page_next))
            return items
Example #10
0
class PageAnalytic(object):
    headers={"User-Agent": "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
             "Accept-Encoding": "gzip"}
    tagsClass = TagProcessor.__subclasses__()

    def __init__(self, url):
        self.base_url = url
        response = requests.get(url, headers=self.headers)
        self.html_size = int(response.headers["content-length"])
        self.page_object = BeautifulSoup(response.content)
        self._tags = []

    def _get_tag_size(self, tag):
        for subCls in self.tagsClass:
            if subCls.is_resource(tag):
                self._tags.append(subCls(tag, base_url=self.base_url))
                return True
        else:
            return False

    def get_size_tag(self):
        size = 0
        self.page_object.find_all(self._get_tag_size)
        for tag in self._tags:
            #threading?
            resp = requests.head(tag.get_resource_url(), headers=self.headers)
            while resp.status_code == 301 or resp.status_code == 302:
                resp = requests.head(resp.headers["location"])
            if resp.status_code == 200:
                size += int(resp.headers["content-length"])
        self._tags = []
        return size

    def get_page_size(self):
        return self.html_size + self.get_size_tag()
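PageAnalytic passes one of its own methods as the filter argument to find_all; a tiny standalone sketch of that technique (illustrative only, not part of the original class):

from bs4 import BeautifulSoup

html = '<p><img src="a.png"/><a href="/x">x</a><script src="s.js"></script></p>'
soup = BeautifulSoup(html, 'html.parser')

def is_resource(tag):
    # keep only tags that reference an external resource via a src attribute
    return tag.has_attr('src')

print(soup.find_all(is_resource))
# [<img src="a.png"/>, <script src="s.js"></script>]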
Example #11
0
	def __init__(self, username):

		# go out with name and check if it's real, populate the variable
		self.name = username
		self.u_url = "http://www.pinterest.com/" + username

		#go out and parse html to find board names
		self.boards = []
		r = requests.get(self.u_url)
		data = r.text
		soup = BeautifulSoup(data)
		titles = []
		urls = []
		pics = []

		for link in soup.find_all('img', 'boardCover'):
			if(link.get('alt')):
				title_s = link.get('alt')
				titles.append(filter(lambda x: x in string.printable, (title_s.split(' / '))[0]))  #error?
				pics.append(link.get('src')) # the url isn't in this tag, where is it?
				

		for link in soup.find_all('a', 'boardLinkWrapper'):
			urls.append(link.get('href'))

		for i in range(0, len(urls) - 1):
			self.boards.append(board(titles[i], urls[i], pics[i]))

		return
Example #12
0
def get_details(html):
	soup=BeautifulSoup(html)
	# get the author, the author's link, and the weibo post text
	div_content=soup.find_all(attrs={'class': 'content clearfix'})
	# get the time each post was published
	div_time=soup.find_all(attrs={'class':'feed_from W_textb'})
	# initialize the user name, profile URL, post text, and post time lists
	nick_name=[]
	nickname_href=[]
	content_text=[]
	time=[]
	#print get_content[0]
	for i in range(len(div_content)):
		# find the a tag
		a_tag=div_content[i].find('a')
		nick_name.append(a_tag.get('nick-name'))
		nickname_href.append(a_tag.get('href'))
		# find the p tag
		p_tag=div_content[i].find('p')
		content_text.append(p_tag.get_text())
	# get the post times
	for j in range(len(div_time)):
		a_time=div_time[j].find('a')
		time.append(a_time.get('title'))
	return (nick_name,nickname_href,content_text,time)
Example #13
0
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text,'lxml')
    print(url)  # log the url to help trace errors
    if soup.find_all(src=" http://sta.ganjistatic1.com/src/image/v5/expire.png"):  # item sold, listing expired, e.g. http://bj.ganji.com/shouji/1303939091x.htm
        print('信息已过期')
    else:
        if soup.select('div.error'):  # error page
            pass
        else:
            if soup.select('h1.title-name'):  # pages like this can appear; check for them and skip otherwise: http://bj.ganji.com/ershoubijibendiannao/386647282x.htm
                title = soup.select('h1.title-name')[0].text
                #title = soup.title.text
                time = soup.select('i.pr-5')[0].text.strip().split('发布')[0]
                type = soup.select('ul.det-infor > li > span > a')[0].text
                price = soup.select('i.f22.fc-orange.f-type')[-1].text if soup.find_all('i','f22 fc-orange f-type') else None
                area_list = soup.select('div.leftBox > div > div > ul > li')[9]
                area = ''.join(list(area_list.stripped_strings))
                seller = soup.select('span.fc-orange')[0].text
                if seller == '':
                    seller = '个人'
                else:
                    seller = '商家'
                item_info.insert_one({'title':title,'time':time,'type':type,'price':price,'area':area,'seller':seller})
                print({'title':title,'time':time,'type':type,'price':price,'area':area,'seller':seller})
            else:
                print('特殊页面')
Example #14
0
def crawling(link, level):
	print "LEVEL: ",level 
	html = urllib.urlopen(link)
	soup = BeautifulSoup(html)

	for email in soup.find_all('a'):
		email=email.get('href')
		if email is not None:
			check=email.encode('ascii', 'ignore')
			if ( (re.search( r'mailto:\w+',check) ) or (re.search( r'\w+@\w+\.\w{2,6}\.?\w*',check) ) ):
				emailList.append(check)

	for url in soup.find_all('a'):
		#print url.get('href')
		url = url.get('href')
		if url is not None:
			url=url.encode('ascii', 'ignore')
			if ( not(re.search( r'http[s]?:\/\/[w.-]*', url)) and (re.search( r'\/[w.-/]*', url)) ):
				if (link[-1:]== '/'):
					link = link[:-1]
				url = link+url
			print url
			if ( (re.search( r'http:\/\/[\w.-]*', url ))):
				if (not(re.search(r'http:\/\/(google|\w+\.google|www\.youtube|goo\.gl|www\.facebook\.com|www\.twitter\.com|t\.co)[w.-]*', url))):
					if (level<1):
						if url not in urlChecked:
							urlChecked.append(url)
							crawling(url, level+1)
Example #15
0
def get_long_url(url):
	num_1 = 0
	num_2 = 0
	try:
		r = requests.get('https:'+url,headers=HeaderData.get_header(),timeout=60)				
		s = BeautifulSoup(r.content,"lxml")
		ptag = s.find_all("p",attrs={"class":"view mb20"})			
		atag = s.find_all("a",attrs={"class":"careful"})
		
		for a in atag:
			uri = a.get('href')
			if 'qzbd' in uri:
				num_1 += 1
				if num_1 > 3:
					break
				else:
					get_long_url(uri)		
			else:
				f.write(uri+'\n')
				print uri
				
		for p in ptag:
			uri =  p.find('a').get('href')
			if 'qzbd' in uri:
				num_2 += 1
				if num_2 > 3:
					break
				else:
					get_long_url(uri)
			else:
				f.write(uri+'\n')	
				print uri
				
	except Exception,e:
		print e
Example #16
0
    def extract(self, html):
        "Extract data from meta, link and title tags within the head tag."
        extracted = {}
        soup = BeautifulSoup(html, 'html.parser')
        # extract data from title tag
        title_tag = soup.find('title')
        if title_tag:
            extracted['titles'] = [title_tag.string]

        # extract data from meta tags
        for meta_tag in soup.find_all('meta'):
            if 'name' in meta_tag.attrs and 'content' in meta_tag.attrs:
                name = meta_tag['name']
                if name in self.meta_name_map:
                    name_dest = self.meta_name_map[name]
                    if name_dest not in extracted:
                        extracted[name_dest] = []
                    extracted[name_dest].append(meta_tag.attrs['content'])

        # extract data from link tags
        for link_tag in soup.find_all('link'):
            if 'rel' in link_tag.attrs:
                if ('canonical' in link_tag['rel'] or link_tag['rel'] == 'canonical') and 'href' in link_tag.attrs:
                    if 'urls' not in extracted:
                        extracted['urls'] = []
                    extracted['urls'].append(link_tag['href'])
                elif ('alternate' in link_tag['rel'] or link_tag['rel'] == 'alternate') and 'type' in link_tag.attrs and link_tag['type'] == "application/rss+xml" and 'href' in link_tag.attrs:
                    if 'feeds' not in extracted:
                        extracted['feeds'] = []
                    extracted['feeds'].append(link_tag['href'])
        return extracted
Example #17
def parse_update_data(html):
    soup_outer = BeautifulSoup(html, "lxml", from_encoding="utf-8")
    for tr in soup_outer.find_all(name="tr")[1:]:
        td = tr.find_all(name="td")
        film = td[0].get_text().strip()
        developer = td[1].get_text().strip()
        dilution = td[2].get_text().strip()
        asa_iso = td[3].get_text().strip()
        a35mm = td[4].get_text().strip()
        a120 = td[5].get_text().strip()
        sheet = td[6].get_text().strip()
        temp = td[7].get_text().strip()
        try:
            notes_link = td[8].find(name="a").get("href")
            notes_link = "http://www.digitaltruth.com/"+notes_link
            notes_html = urllib2.urlopen(notes_link).read()
            soup_inner = BeautifulSoup(notes_html, "lxml", from_encoding="utf8")
            notes = soup_inner.find_all(name="tr")[1].find_all(name="td")[-1].get_text().strip()
        except AttributeError:
            notes = ""
        from myrobot.models import FilmSearch
        try:
            obj = FilmSearch.objects.get(Film=film, Developer=developer, Dilution=dilution,
                                         ASA_ISO=asa_iso, Temp=temp)
            obj.a35mm = a35mm
            obj.a120 = a120
            obj.sheet = sheet
            obj.Notes = notes
            obj.save()
        except FilmSearch.DoesNotExist:
            obj = FilmSearch(Film=film, Developer=developer, Dilution=dilution, ASA_ISO=asa_iso,
                             a35mm=a35mm, a120=a120, sheet=sheet, Notes=notes)
            obj.save()
Example #18
def processticker(ticker, file_name, date_int, listview):
    base_url = "http://finance.yahoo.com/q/op"
    num_of_tries = 0
    payload = {"s": ticker, "date": date_int}
    r = requests.get(base_url, params=payload)
    data = r.text
    soup = BeautifulSoup(data, "lxml")
    option_list = []
    expiration_dictionary = {}

    while num_of_tries < 20:
        try:

            for pair in soup.find_all("option"):
                expiration_dictionary[pair.get_text()] = yahoo_url + pair["data-selectbox-link"]
            for n in soup.find_all("script"):
                option_list.append(n)
            raw_options_chain = str(option_list.pop(16))
            start_call_options = [a.start() for a in list(re.finditer("calls", raw_options_chain))]
            endoptions = [a.start() for a in list(re.finditer("_options", raw_options_chain))]
            raw_options_chain = raw_options_chain[start_call_options[0] - 2 : endoptions[0] - 2]
            options_json = json.loads(raw_options_chain)
            # Extract puts/calls as JSON objects.
            put_list = options_json["puts"]
            call_list = options_json["calls"]
            print(call_list)
            create_csv(call_list, put_list, file_name, listview)

        except IndexError:
            num_of_tries += 1
            continue
        break
Example #19
0
def write_txt_category(category, list_category, in_path="../elife-articles/", out_path="../articles/"):
    "Go through the list and use BS to get the text content."
    for article in list_category:
        print in_path+article
        article_text = ""
        soup = BeautifulSoup(open(in_path+article), ["lxml", "xml"])
        abstract_tag = soup.find_all("abstract")
        body_tag = soup.find_all("body")
        for b in body_tag[0]:
            try:
                article_text += " " + b.p.text
            except:
                for i in b:
                    try:
                        article_text += " " + i.p.text
                    except:
                        article_text += " " + i.string
        if abstract_tag:
            for abstract in abstract_tag:
                abstract_text = abstract.p.text
                article_text += "\n" + abstract_text

        for a in article_text:
            if a in spe_char.keys():
                final_article = article_text.replace(a, unicode(spe_char.get(a)))

        if not category in listdir(out_path):
            mkdir(out_path+category)
            print category, " folder created!"
        file_path = out_path + category + "/" + article.replace("xml", "txt")
        # Write to a text file:
        with codecs.open(file_path, mode='w', encoding='utf-8') as f:
            f.write(final_article)
    print "Wrote %d xml articles to text format." % len(list_category[:1])
Example #20
0
def htmltitleParse():
    DATA={}
    count=0
    with open("./filestore/titleIndex_file_input.txt",'w') as f_write:
        for i in range(1,72140):
            try:
                with open("./filestore2/%d.txt" %i,'r') as f:
                    valueList=f.readlines()
                    DATA['url']=valueList[0].strip()
                    DATA['html']=valueList[2].strip()
                    title=''
                    #http://ubuntuforums.org/showthread.php?t=1215158
                    try:
                        html=BeautifulSoup(DATA['html'].strip())
                        if html.find_all('title'):
                            for sub in html.find_all('title'):
                                title=title+sub.get_text().encode('utf-8')+' '
                            title=tokenizeString(title)
                            #print title
                            f_write.write(DATA['url']+' '+title+'\n')
                    except IndexError:
                        pass
            except IOError:
                pass

    return
Example #21
0
def trans_extra_data_fee(members): # need to judge there is extra usage
    owner_phone = "310-600-0358";
    data_plan = 20.0;
    usage_quota = data_plan * .1;
    soup_data = BeautifulSoup(open("billusage.htm"), "html.parser")
    extra_datausage = float(soup_data.find_all("div", {"class": "additionalColCenter"})[1].text.split('M')[0]) / 1024 # in GB
    total_datausage = data_plan + extra_datausage

    ths = soup_data.find_all("th", {"class": "PadTop0 BotSolidBorder borderRightSolid borderLeftSolid left", "headers": "header1"})
    extra_data_usage_dict = {};

    #print(ths[0].text[4:-1])
    real_extra_usage = total_datausage * float(ths[0].text[4:-1]) / 100 - usage_quota
    extra_data_usage_dict[owner_phone] = real_extra_usage if real_extra_usage >= 0 else 0

    total_extra_percent = extra_data_usage_dict[owner_phone]
    for i in range(1, 10):
        phone = ths[i].text[0:12]
        phone = trans_phone_format2(phone)
        #print(phone)
        real_extra_usage = total_datausage * float(ths[i].text[13:-1]) / 100 - usage_quota
        extra_data_usage_dict[phone] = real_extra_usage if real_extra_usage >= 0 else 0
        total_extra_percent += extra_data_usage_dict[phone]

    extra_fee_holder = soup_data.find_all("div", {"class": "additionalColRight"})
    extra_fee = float(extra_fee_holder[1].text[1:])

    #print('===========')
    for user, info in members.iteritems():
        members[user]['data'] = members[user]['data'] - extra_fee / 10 + extra_fee * extra_data_usage_dict[user] / total_extra_percent
        #print(user)
        #print(extra_fee * extra_data_usage_dict[user] / total_extra_percent)
        #print(members[user]['data'])
    return members;
Example #22
0
def listar_lancamentos(url):
	codigo_fonte = abrir_url(url).result
	soup = BeautifulSoup(codigo_fonte)
	miniaturas = str(soup.find_all('div', class_='lancamentoBoxNome'))
	match = re.compile(r'<a href="(.+?)" style=".*?" title="(.+?)">').findall(miniaturas)
	img = re.compile(r'<img height=".*?" src="(.+?)" style=".*?" width=".*?">').findall(miniaturas)

	# Get the number of pages
	paginacao = str(soup.find_all('div', class_='paginacao'))
	match_pag = re.compile(r'<a href="(.+?)" title="(.+?)">').findall(paginacao)

	a = []
	for x in range(0, len(match)):
		temp = [match[x][0], match[x][1], img[x]]
		a.append(temp)

	total = len(a)
	for url2, titulo, img in a:
		addDir(titulo, url2, 4, img, True, total)

	try:
		n = re.search(r'http://www.superanimes.com/.+?\?&pagina=(.?)', url).group(1)
	except:
		url = url + '?&pagina=1'
		n = 1

	n = int(n)
	if n <= len(match_pag):
		m = n+1
		prox_pag = url.replace(str(n), str(m))
		addDir('Proxima Pagina >>>', prox_pag, 6, artfolder + 'destaques.png')
Example #23
def page_loop(n):
    for i in range(1,n):
        print 'Now printing page',str(i)
        url = 'http://wanimal1983.tumblr.com/page/'+str(i)
        content = urllib2.urlopen(url)
        soup = BeautifulSoup(content,'lxml')

        girls_sets = soup.find_all('div',class_='photoset-grid')
        girls = soup.find_all('div',class_='media')
        # single pic
        for girl in girls:
            pic = girl.find('img')
            name=pic.get('alt')
            name=name.strip()
            link = pic.get('src')
            flink = link
            print name
            # print girl
            content2 = urllib2.urlopen(flink).read()
            with open('wanimal'+'/'+name+flink[-28:]+'.jpg','wb') as code:
                code.write(content2)
        # pic sets
        for girl_set in girls_sets:
            # print girl_set
            pic1 = girl_set.find_all('img')
            for pic2 in pic1:
                link1 = pic2.get('src')
                flink1 = link1
                print flink1
                content3 = urllib2.urlopen(flink1).read()
                with open('wanimal'+'/'+flink1[-28:],'wb') as code:
                    code.write(content3)
Example #24
0
def pre_parse(filename, overwrite=False):
    with open(filename) as infile:
        soup = BeautifulSoup(infile, 'xml')

    for li in soup.find_all('li'):
        assert li.parent.name == 'ul'
        li.unwrap()
    for bold in soup.find_all('bold'):
        bold.name = u'b'
    for italic in soup.find_all('italic'):
        italic.name = u'i'

    for chapter in soup.find_all('chapter'):
        footnotes = chapter.find_all('ftnote')

        for footnote in footnotes:
            next_sibling = footnote.next_sibling
            if next_sibling is None:
                break

            while next_sibling.name != 'ftnote':
                footnote.append(next_sibling)
                if isinstance(next_sibling, Tag):
                    next_sibling.unwrap()
                next_sibling = footnote.next_sibling
                if next_sibling is None:
                    break

    if not overwrite:
        filename = filename.replace('.xml', '_copy.xml')
    with codecs.open(filename, 'w', 'utf-8') as outfile:
        outfile.write(unicode(soup))
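Example #24 renames and unwraps tags in place before writing the tree back out; a tiny standalone illustration of those two operations (not from the original script):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<ul><li><bold>hi</bold></li></ul>', 'html.parser')
for li in soup.find_all('li'):
    li.unwrap()        # drop the <li> wrapper, keep its children
for bold in soup.find_all('bold'):
    bold.name = 'b'    # rename <bold> to <b>
print(soup)            # <ul><b>hi</b></ul>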
Example #25
def getAnswerer(question_id):
    # get html of the question page first
    # r = s.get('http://www.zhihu.com/question/22968659')
    r = s.get('http://www.zhihu.com/question/' + str(question_id))
    bs = BeautifulSoup(r.text)
    
    # question title
    a = bs.find_all('h2',{'class':'zm-item-title'})
    print a[0].text
    
    # data-aid, used to fetch the full voter info
    data_aid = re.findall('data-aid="(.*)"',r.text)
    
    # user id and user name
    a = bs.find_all('h3',{'class':'zm-item-answer-author-wrap'})
    answerer_id = []
    answerer_name = []
    for i in range(len(a)):
        answerer_name.append(a[i].text.strip().split(u',')[0])
        if(answerer_name[i] != u'匿名用户'):
            answerer_id.append(re.findall('href="/people/(.*)"',str(a[i]))[0])
        else:
            answerer_id.append('anonymous')
            
    for i in range(len(answerer_name)):
        print "正在抓取ta的赞同者..\n"
        print answerer_name[i],data_aid[i]
        print '\n'
        Answer_Full_Vote_Info(question_id, answerer_name[i], data_aid[i])
Example #26
0
def http(url):
	html = requests.get(url).text

	soup_main = BeautifulSoup(html)
	# the text of the "一个" (ONE) entry
	div = soup_main.find_all("div", {"class": "fp-one-cita"})
	text = div[0].a.text
	# print(text)

	# the image URL of the "一个" entry
	img_list = soup_main.find_all("img", {"class": "fp-one-imagen"})
	imgUrl = img_list[0].get('src')
	# print(imgUrl)

	# "一个"的标题
	title_list = soup_main.find_all("p", {"class": "titulo"})
	title = str(title_list[0].text)
	print(title)



	# the "一个" article page, e.g. vol.1132#articulo
	url_stroy = 'http://wufazhuce.com/one/' + title + '#articulo'


	soup_stroy = BeautifulSoup(requests.get(url_stroy).text)
	stroy_content = str(soup_stroy.find("div", {"class": "articulo-contenido"}))

	stroy_title = str(soup_stroy.find("h2", {"class": "articulo-titulo"}))

	stroy = stroy_title + stroy_content

	for addr in to_addr:
		sendEmail(text, imgUrl, title, stroy, addr)
Example #27
0
def structure_comments(verse):
    def clean_string(value):
        value = value.replace(u'\n', u' ')
        value = value.replace(u'\xb6', u'')
        value = value.replace(u'\u2283', u'')
        value = value.replace(u'\u2282', u'')
        value = value.replace(u'\u0259', u'e')
        value = value.replace(u'[ ', u'[')
        value = value.replace(u' ]', u']')
        value = re.sub(u'G ?OD', u'God', value)
        value = re.sub(u' +', u' ', value)
        value = value.rstrip()
        value = value.lstrip()
        return value

    verse = re.sub(u'\[.*?\]', u'', verse, 1)
    soup = BeautifulSoup(u'<root>{}</root>'.format(verse), 'xml')
    # destroy empty b tags
    for element in soup.find_all(lambda x: x.name == 'b' and (x.is_empty_element or x.text.isspace())):
        element.decompose()
    for element in soup.find_all('xref'):
        element.decompose()
    for element in soup.find_all('small'):
        element.unwrap()
    for element in soup.find_all('sup'):
        element.unwrap()

    verse = clean_string(u' '.join(unicode(child) for child in soup.root.children))
    comments, start_index = [], 0
    for match in re.finditer(ur'\. <b>', verse):
        comments.append(verse[start_index:match.start()+1])
        start_index = match.start() + 2
Example #28
0
def listar_animes2(url):
	codigo_fonte = abrir_url(url).result
	soup = BeautifulSoup(codigo_fonte)
	miniaturas = str(soup.find_all('div', class_='epsBoxImg'))
	match = re.compile(r'<a href="(.+?)" title="(.+?)">').findall(miniaturas)
	img = re.compile(r'<img alt=".+?" src="(.+?)" title=".+?"/>').findall(miniaturas)
	
	# Get the number of pages
	paginacao = str(soup.find_all('div', class_='paginacao'))
	match_pag = re.compile(r'<a href="(.+?)" title="(.+?)">').findall(paginacao)

	a = []
	for x in range(0, len(match)):
		temp = [match[x][0], match[x][1], img[x]]
		a.append(temp)

	total = len(a)
	for url2, titulo, img in a:
		if titulo.endswith("Online"):
		    addDir(titulo[0:len(titulo) - 6], url2, 5, img, False, total)
		else:
		    addDir(titulo, url2, 5, img, False, total)

	try:
		n = re.search(r'http://www.superanimes.com/.+?\?&pagina=(.?)', url).group(1)
	except:
		url = url + '?&pagina=1'
		n = 1

	n = int(n)
	if n <= len(match_pag):
		m = n+1
		prox_pag = url.replace(str(n), str(m))
		addDir('Proxima Pagina >>>', prox_pag, 4, artfolder + 'destaques.png')
Example #29
0
def get_weekly_menu():
    r = requests.post(URL, data=payload)
    soup = BeautifulSoup(r.text)
    titles = soup.find_all("div", class_="title")
    menu_table = soup.find_all("table", class_="menu_table")
    children = list(titles[0].children)

    def clean(t):
        return t.strip(" -")

    structure = odict([("restaurant", clean(children[0].text)), ("period", clean(children[1])), ("menu", odict())])

    menu = structure.get("menu")
    for tr in menu_table[0].find_all("tr"):
        for td in tr.find_all("td"):
            classes = td.attrs.get("class")
            if classes is None:
                continue
            if "day" in classes:
                day_name = td.h4.text
                day = []
                menu[day_name] = day
            if "meal" in classes:
                meal = {"name": clean(td.text)}
                day.append(meal)
            if "price" in classes:
                price = td.text
                if price and len(price):
                    meal["price"] = price + "€"
                else:
                    meal["price"] = None
    return structure
Example #30
0
def headliner(url):
	soup = BeautifulSoup((open(url)), "lxml")
	head1 = soup.find_all(['h1','h2','h3'])
	body = soup.find_all('p')
	
	

	head1_fixed = str(head1)
	soup1 = BeautifulSoup(head1_fixed, 'lxml')
	gold = soup1.text.decode("unicode-escape").encode("utf-8")

	body_fixed = str(body)
	soup_gold = BeautifulSoup(body_fixed, 'lxml')
	gold_body = soup_gold.text.decode("unicode-escape").encode("utf-8")

	print gold
	print ""
	print gold_body
	
	#print gold[0].get_text()
	
	#print head1[1].get_text()
	#print head2[2].get_text()
	
	
	#print head2
	
	#print head3
	
	print ""
Example #31
0
        soupBATTING = BeautifulSoup(pageBATTING.text, 'html.parser')

        #Arguments that will be passed into our .find and .find_all
        attrs = {'attribute1Name': 'attribute1Value', 'attribute2Name': 'attribute2Value'}

        #Takes the five column headers. Only need to find it once
        #since it appears five times on the ESPN page.
        headers = soupBATTING.find('tr', attrs={'class': 'colhead'})

        columns = [col.get_text() for col in headers.find_all('td')]

        #Using pandas to create an empty data frame to store player stats
        final_df = pd.DataFrame(columns=columns)

        #Use re's compile function to scrape all player data
        players = soupBATTING.find_all('tr', attrs={'class':re.compile('row player-10-')})

        for player in players:
            #Gets player's stats
            stats = [stat.get_text() for stat in player.find_all('td')]

            #Temporary dataframe for a single player's stats
            temp_df = pd.DataFrame(stats).transpose()
            temp_df.columns = columns

            #Put player stats in the final dataframe
            final_df = pd.concat([final_df, temp_df], ignore_index=True)
        print(final_df)

        # Export to csv file displaying all 331 batters and their stats
        final_df.to_csv(r"C:\Users\Zach Patrignani\Desktop\mlb_stats.csv", index = False,
Example #32
0
    res = ss.post(itesturl, headers=headers, data=data)
    resJ = json.loads(res.text)

if resJ['code'] == 9 or resJ['code'] == 10:
    print('用户名或密码或验证码错误. 程序退出.')
    os.system("pause")
    os._exit(0)

# fetch the current list of tests
res_class = ss.get(classurl)

# parse the page with BeautifulSoup
bs = BeautifulSoup(res_class.text, 'lxml')

# extract the h2 tags
lst = bs.find_all('h2')

print('当前任务: \n')

# this list records every assignment and its availability
# each row holds three elements: id, whether the answers can be viewed, whether the exam is in progress, and the subject name
# that's right, several exams can run at the same time
tasklist = []

# index
no = 0

# print the information for each exam
for each in lst:
    par = each.parent
    dd = par.find_all('dd')
Example #33
0
# beautiful soup wikipedia
# following a Wikipedia example step by step
# Code example copy&paste

#!/usr/bin/env python3
# Anchor extraction from HTML document

# import library
from bs4 import BeautifulSoup
from urllib.request import urlopen

# urlopen : opens the page and stores the result in response
with urlopen('https://en.wikipedia.org/wiki/Main_Page') as response:
    soup = BeautifulSoup(response, 'html.parser')
    for anchor in soup.find_all('a'):
        print(anchor.get('href', '/'))

# with ~ as ~ : Python syntax
# the with-as block can also be written more explicitly, like this:
# response = urlopen('https://en.wikipedia.org/wiki/Main_Page')
# soup = BeautifulSoup(response, 'html.parser')
# for anchor in soup.find_all('a'):
#     print(anchor.get('href', '/'))

# the BeautifulSoup() function is given response and parses it with html.parser,
# and the parsed result is stored in the variable soup

# the for loop finds every 'a' tag inside soup
# and puts each one into the variable anchor
# then the 'href' reference of each anchor fetched by the loop is printed
Example #34
0
# fetching several tags at once with tag selectors
html = """
<html><body>
    <ul>
        <li><a href="http://www.naver.com">naver</a></li>
        <li><a href="http://www.daum.net">daum</a></li>
        <li><a href="http://www.daum.com">daum</a></li>
        <li><a href="http://www.google.com">google</a></li>
        <li><a href="http://www.tistory.com">tistory</a></li>
    </ul>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')

links = soup.find_all("a")
print('links', type(links))
# links <class 'bs4.element.ResultSet'>
a = soup.find_all("a", string="daum")
print('a', a)
# a [<a href="http://www.daum.net">daum</a>, <a href="http://www.daum.com">daum</a>]
b = soup.find("a")
print('b', b)
# b <a href="http://www.naver.com">naver</a>  only the first (topmost) match is returned.
c = soup.find_all("a", limit=3)
print('c', c)
# c [<a href="http://www.naver.com">naver</a>, <a href="http://www.daum.net">daum</a>, <a href="http://www.daum.com">daum</a>]
d = soup.find_all(string=["naver", "google"])
print('d', d)
# d ['naver', 'google']  finds those exact strings, but this form is rarely used in practice.
print('d', type(d))
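Not part of the original example: find_all can filter on attribute values the same way, for instance with a regular expression on href. A minimal sketch reusing the kind of markup shown above:

import re
from bs4 import BeautifulSoup

# illustration only; separate soup so the sketch stays self-contained
html = '<ul><li><a href="http://www.naver.com">naver</a></li>' \
       '<li><a href="http://www.daum.net">daum</a></li>' \
       '<li><a href="http://www.daum.com">daum</a></li></ul>'
soup = BeautifulSoup(html, 'html.parser')

e = soup.find_all("a", href=re.compile("daum"))
print('e', e)
# e [<a href="http://www.daum.net">daum</a>, <a href="http://www.daum.com">daum</a>]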
Example #35
0
def daily_task():
    global DATE
    DATE = str(datetime.date.today())
    browser = webdriver.Chrome(executable_path=CHROME_DRIVER,
                               chrome_options=OPTIONS)
    # browser = webdriver.Chrome()
    browser.set_window_position(400, 40)
    browser.set_window_size(1300, 1024)
    browser.get(BASE_URL)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    urls = []
    main_category_list = soup.find('ul', id='main-smart-menu').find_all(
        'li', class_='menu-item')
    write_html(browser.page_source, "All_cat_")
    k = 1
    for main_item in main_category_list:
        if k >= 12:
            break
        href = BASE_URL + main_item.find('a').get('href')
        browser.get(href)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        category_list = soup.find('ul', class_='cat-tree-nav').find_all(
            'li', class_='cat-tree-item')
        for item in category_list:
            url = BASE_URL + item.find('a').get('href')
            urls.append(url)
        k += 1


# cat-tree-nav
    j = 0
    while j < len(urls):
        print('Scraping', urls[j])
        browser.get(urls[j])
        soup = BeautifulSoup(browser.page_source, 'lxml')

        category_titles = soup.find('ol', class_='breadcrum')
        if category_titles is None:
            category = None
            sub_category = None
        else:
            category_titles = category_titles.find_all('li',
                                                       class_='breadcrum-item')
            if len(category_titles) == 2:
                category = category_titles[1].find('a').text.strip()
                sub_category = None
            if len(category_titles) == 3:
                category = category_titles[1].find('a').text.strip()
                sub_category = category_titles[2].find('a').text.strip()
            if len(category_titles) == 4:
                category = category_titles[1].find('a').text.strip()
                sub_category = category_titles[2].find('a').text.strip()

        # print(page_count)

        i = 0
        local_title = 5
        while i < int(local_title):
            soup = BeautifulSoup(browser.page_source, 'lxml')
            if i != 0:
                browser.get(urls[j] + "?p=" + str(i + 1))
                soup = BeautifulSoup(browser.page_source, 'lxml')
                list = soup.find_all('div', class_='product-catalog-item')
            if i == 0:
                soup = BeautifulSoup(browser.page_source, 'lxml')
                list = soup.find_all('div', class_='product-catalog-item')
            # print(len(list))
            # print(i+1)
            for item in list:
                item_id = item.get('data-pid').strip()
                # item_id = item_id.split('id=')[1]
                # Vietnamese
                # English
                if item.find('div', class_='name-brand') != None:
                    brand = item.find('div', class_='name-brand').text.strip()
                else:
                    brand = None
                if item.find('div', class_='home-product-title') != None:
                    title_Vietnamese = item.find(
                        'div', class_='home-product-title').text.strip()
                else:
                    title_Vietnamese = None
                # if item.find('div', class_='english_name') != None:
                #     title_English = item.find('div', class_='english_name').text.strip()
                # else:
                #     title_English = None
                # print("Title: " + title)
                if item.find('span', class_='list-product-meta-price') != None:
                    price = item.find(
                        'span', class_='list-product-meta-price').text.strip()
                    price = price.split('đ')[0]
                    price = price.strip()
                else:
                    price = None
                # print("Price: " + str(price))
                if item.find('span', class_='list-product-old-price') != None:
                    old_price = item.find(
                        'span', class_='list-product-old-price').text.strip()
                    old_price = old_price.split('đ')[0]
                    old_price = old_price.strip()
                else:
                    old_price = None

                date = DATE

                data = {
                    'category': category,
                    'sub_category': sub_category,
                    'id': item_id,
                    'good_name': title_Vietnamese,
                    'brand': brand,
                    'price': price,
                    'old_price': old_price,
                    'date': date
                }
                write_csv(data)
            file_name = str(j + 1) + "_" + str(i + 1) + "_"
            write_html(browser.page_source, file_name)
            soup = BeautifulSoup(browser.page_source, 'lxml')
            if soup.find('div', class_='list-pagination') == None:
                break
            page_count = soup.find('div',
                                   class_='list-pagination').find_all('a')
            local_title = browser.find_element_by_xpath(
                '//*[@id="_products"]/div/div[2]/div/a[' +
                str(len(page_count)) + ']')
            local_title = local_title.get_attribute('title')
            local_title = local_title.split('Xem trang')[1]
            local_title = local_title.strip()
            i += 1
        j += 1
    # Close browser
    browser.close()
    browser.service.process.send_signal(signal.SIGTERM)
    browser.quit()
    compress_data()
Example #36
0
def queryListRequester():
#First few conditionals check or make the proper files needed for operation.
    searchSetResults = []
    if queryListCheck() == False:
        print("\n    No queries.pickle file found in the CWD.\n    Please create one with the query list editor.\n")
        #mainMenu(False)
    else:
        with open("queries.pickle", "rb") as fp:
            queries = pickle.load(fp)
#If there is no rotating search terms list set up yet, this will make and save the empty list of lists needed to start it.
    #A list of lists is needed so each list can correspond to a query.
    if not rotatingPageImagesCheck():
        x = []
        for i in range(len(queries)):
            x.append([])
        with open("query-results-rotating-list.pickle", "wb") as fp:
            pickle.dump(x, fp)
    with open("query-results-rotating-list.pickle", "rb") as fp:
        savedResultsRotation = pickle.load(fp)
#This is very important, will delete rotating search result images if there is a difference in length between the rotating
    #list and the query length, the only implemented way to tell if the query list has changed.
    if len(savedResultsRotation) != len(queries):
        os.remove("query-results-rotating-list.pickle")
        queryListRequester()
    print("\n    Connecting, parsing and pickling...\n")
    for i in range(len(queries)):
#Based on whether this program was called by cron or not, each search will be staggered by between 0 seconds and 7 seconds
    #as to make it more difficult to detect a regular pattern of search, preventing detection and blocking of the bot.
#The google search is created with each query being placed inside.
        google_url = "https://www.google.com/search?q={" + queries[i] + "}&num=lnms"
#Try block used in case of connection failure, depending on whether the program is being called by cron or not, it will
    #fail out to the menu or not.
        tryCount = 0
        response = None
        while response is None:
            try:
#The response to the search query from Google is taken. The custom user agents are used so the website doesn't realize
    #it's being scraped.
                response = requests.get(google_url, headers={"User-Agent": userAgents[random.randint(0,(len(userAgents)-1))]})
            except:
                if tryCount == 5:
                    print("\nConnection failed.\n")
                    #else:
                        #mainMenu(False)
                #print("\n    Connection issues...")
                #tryCount += 1
                pass
#Beautiful soup gets the html from the response.
        html = BeautifulSoup(response.text, "html.parser")
        result_div = html.find_all('div', attrs={'class': 'ZINbbc'})
        links = []
        titles = []
        descriptions = []
#Parses the html of google search page into lists of the links, titles, and descriptions of the query search results.
        for r in result_div:
#Checks if each element is present, else, raise exception.
            try:
                link = r.find('a', href=True)
                title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
                description = r.find('div', attrs={'class': 's3v9rd'}).get_text()
#Makes sure everything is present before appending.
                if link != '' and title != '' and description != '':
                    links.append(link['href'])
                    titles.append(title)
                    descriptions.append(description)
#Goes to next loop if one element is not present.
            except:
                continue
        savedResultsRotation[i].append(titles)
        searchResults = [links, titles, descriptions]
        searchSetResults.append(searchResults)
    with open("query-results.pickle", "wb") as fp:
        pickle.dump(searchSetResults, fp)
#This shows the rotating lifo search list's rotation schedule. Here we have the line "if len(savedResultsRotation[0]) == 8:"
    #Each time this function is called a new set of titles from the searches is added to this list. Ideally, when comparing
    #our new search against our old ones, we will be comparing against the search results of the previous 8 searches.
    #It is important to rotate our search results like this, because many search results may go in and out of the search
    #pages remaining a popular topic, we want to know about results that are new, so we compare against lots of old.
    if len(savedResultsRotation[0]) == 8:
        for i in range(len(savedResultsRotation)):
#Deletes oldest list of search results once 8 search results is met.
            savedResultsRotation[i].pop(0)
    with open("query-results-rotating-list.pickle", "wb") as fp:
        pickle.dump(savedResultsRotation, fp)
    #if mainMenuAfter == True:
        #mainMenu(False)

    print("done")
    return searchSetResults
Example #37
# There are 42 pages of press releases, and the first page is page 0. So I created a list of numbers going from 0 to 41, which is why it ends at 42.

pages = list(range(0,42))

# Creating my first loop. Here I build a link for each page to be visited.

for page in pages:

    urlpage = url + str(page) 
    # print(urlpage)

    # Creating the requests to fetch the 42 pages.

    sites = requests.get(urlpage, headers=entetes)
    pages2 = BeautifulSoup(sites.text, "html5lib")
    articles = pages2.find_all("li", class_="search-result")
    
    for article in articles:
        # Grab the date so it can be included in my CSV file.
        date = article.find("span", class_="search-result-date").text.strip()
        # Also include the title of the press release.
        titrecommunique = article.find("a", class_="search-result-title").text.strip()
        listesujets = []
        listesujets.append(date)
        listesujets.append(titrecommunique)
        # Because the URLs found in the source code are only the end of the required URL, I took the first part of the URL and then appended the part found in the code.
        urldebut = "https://lautorite.qc.ca"
        urlfin = article.find("a", class_="search-result-title")["href"]
        urlfinal = urldebut + urlfin
        # Add the complete URL to my list.
        listesujets.append(urlfinal)
Example #38
0
def get_github_info(url="", title="", ts="", tag="",
                    max_redirects=30, proxy=None, root_dir="data/githubcom", isnew=False,
                    retry=3, timeout=10):
    """
    Parse a GitHub page (organization or personal profile).
    :param url:
    :param max_redirects:
    :param proxy:
    :param root_dir:
    :return:
    """
    file_404 = path("data/github_404")
    urls_404 = set()
    if os.path.exists(file_404):

        with codecs.open(file_404, mode='rb') as fr:
            for line in fr:
                line = line.strip()
                if line:
                    urls_404.add(line)

    pattern = "(https://github.com/([^/]+))"
    match = re.search(pattern, url)
    overview = {}
    overview['title'] = strip_n(title)
    overview["url"] = url
    overview['ts'] = ts
    overview['tag'] = tag
    if match:
        url_root, github_id = match.groups()

        overview["github_id"] = github_id

        if url_root in urls_404:
            return
    else:
        return

    root_dir = path(root_dir)
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)

    hl = hashlib.md5()
    hl.update(url.encode(encoding='utf-8'))

    fname = path(root_dir, "%s.html" % hl.hexdigest())

    if isnew or not os.path.exists(fname):
        get_request(url_root, proxy=proxy, fname=fname, fname_404=file_404, retry=retry,
                    timeout=timeout, max_redirects=max_redirects)

    if os.path.exists(fname):
        with codecs.open(fname, mode='rb') as fr:
            try:
                soup = BeautifulSoup(fr, 'lxml')
            except Exception as e:
                logging.error("GET title of %s failed : %s" % (url, repr(e)))
                return

            # 1. find org-description
            org_sub = soup.find("p", class_='org-description')
            if org_sub:
                org_sub = org_sub.next_sibling
                if org_sub:
                    org_sub = org_sub.get_text()
                    org_sub = strip_n(org_sub)

            overview["org_profile"] = org_sub

            # 2. find geo and url

            org_meta = soup.find("ul", class_=re.compile("org-header-meta"))
            org_url = None
            org_geo = None
            if org_meta:
                org_url = org_meta.find("a")
                if org_url:
                    org_url = strip_n(org_url.get("href"))

                org_geo = org_meta.find("li", class_=re.compile('meta-item v-align-middle'))
                if org_geo:
                    org_geo = strip_n(org_geo.get_text())

            overview["org_url"] = org_url
            overview["org_geo"] = org_geo

            # 3.  repos#people#project
            for aa in soup.find_all("a", class_=re.compile(r'pagehead-tabs-item')):
                aa = aa.get_text()
                aa = strip_n(aa)
                if aa:
                    parts = re.split("\s+", aa)

                    if len(parts) == 2:
                        t = re.sub(',', '.', parts[1])

                        p = re.match('(\d+\.*\d*)([km]*)', t)
                        if p:
                            n, d = p.groups()

                            if d == 'k':
                                t = float(n) * 1000
                            elif d == 'm':
                                t = float(n) * 1000000
                            else:
                                pos = n.find('.')
                                if pos != -1:
                                    t = int(float(n) * pow(10, len(n[n.find('.') + 1:])))

                        overview["org_%s" % parts[0].lower()] = t

            # 4. star forks
            overview["repo_star"] = 0
            overview["repo_forks"] = 0
            repo_language = set()

            # repo
            for aa in soup.find_all('span', class_=re.compile('repo-language-color')):
                aa_p = aa.parent
                if aa_p:
                    aa_p = strip_n(aa_p.get_text())
                    if aa_p:
                        p = re.split(r'\s+', aa_p)
                        if len(p) > 0:
                            repo_language.add(p[0])
            if repo_language:
                repo_language = ",".join(repo_language)
            else:
                repo_language = ""

            overview["repo_lang"] = repo_language

            for aa in soup.find_all("a", class_="pinned-item-meta muted-link"):

                t = strip_n(aa.get_text())

                if re.match('^\d+$', t):
                    t = int(t)
                else:
                    t = re.sub(',', '.', t)
                    p = re.match('(\d+\.*\d*)([km])', t)
                    if p:
                        n, d = p.groups()

                        if d == 'k':
                            t = float(n) * 1000
                        elif d == 'm':
                            t = float(n) * 1000000
                        else:
                            t = int(n)
                    else:
                        continue

                star = aa.find("svg", class_="octicon octicon-star")
                if star:
                    if t > overview["repo_star"]:
                        overview["repo_star"] = t
                else:
                    forks = aa.find("svg", class_="octicon octicon-repo-forked")
                    if forks:
                        if t > overview["repo_forks"]:
                            overview["repo_forks"] = t

            # 5. languages
            overview["org_lang"] = []
            for aa in soup.find_all("a", class_="no-wrap text-gray d-inline-block muted-link mt-2"):
                t = strip_n(aa.get_text())
                overview["org_lang"].append(t)

            overview["org_lang"] = ",".join(overview["org_lang"])

            overview["github_type"] = 1  # 1: org 0:private
            if not (org_sub or org_geo or org_url):
                overview["github_type"] = 0

                # repos #stars #followers#following
                for aa in soup.find_all("a", class_=re.compile('UnderlineNav-item')):
                    aa = aa.get_text()
                    aa = strip_n(aa)
                    if aa:
                        parts = re.split("\s+", aa)

                        if len(parts) == 2:
                            t = re.sub(',', '.', parts[1])
                            p = re.match('(\d+\.*\d*)([km]*)', t)
                            if p:
                                n, d = p.groups()

                                if d == 'k':
                                    t = int(float(n) * 1000)
                                elif d == 'm':
                                    t = int(float(n) * 1000000)
                                else:
                                    t = int(n)
                            overview["p_%s" % parts[0].lower()] = t

                # personal bio
                p_profile = soup.find("div", class_=re.compile("p-note user-profile-bio"))
                if p_profile:
                    p_profile = strip_n(p_profile.get_text())

                overview["p_profile"] = p_profile

                # company
                p_company = soup.find("span", class_=re.compile("p-org"))
                if p_company:
                    p_company = strip_n(p_company.get_text())

                overview["p_company"] = p_company

                # location
                p_loc = soup.find("span", class_=re.compile("p-label"))
                if p_loc:
                    p_loc = strip_n(p_loc.get_text())

                overview["p_loc"] = p_loc

                # url
                p_url = soup.find("div", class_=re.compile('js-profile-editable-area'))

                if p_url:
                    p_url = p_url.find_all("a")

                    if p_url:
                        for p_url_i in p_url:
                            p_url_i = p_url_i.get("href")
                            if p_url_i.startswith("http"):
                                p_url = p_url_i
                                break
                            else:
                                p_url = ""

                if not p_url:
                    p_url = None

                overview["p_url"] = strip_n(p_url)

                # organizations
                github_org = soup.find("a", class_='avatar-group-item')
                if github_org:
                    github_org = github_org.get("href")

                overview["p_github_org"] = strip_n(github_org)

    return overview
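# The "1.2k" / "3m" parsing above appears twice in this function; a minimal standalone
# sketch of the same idea as a helper (parse_count is a hypothetical name, not part of
# the scraper above), treating ',' as a decimal separator the way the code above does:
import re

def parse_count(text):
    """Turn GitHub-style abbreviated counts ('815', '1.2k', '3m') into integers."""
    text = text.strip().lower().replace(',', '.')
    match = re.match(r'^(\d+(?:\.\d+)?)([km]?)$', text)
    if not match:
        raise ValueError("unrecognised count: %r" % text)
    number, suffix = match.groups()
    factor = {'': 1, 'k': 1000, 'm': 1000000}[suffix]
    return int(float(number) * factor)

print(parse_count("1.2k"))   # 1200
print(parse_count("815"))    # 815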
Пример #39
0
def scrape():
    browser = init_browser()
    listings = {}

    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    result = soup.find('li', class_='slide')
    # news_title = result.find(class_='content_title').text

    listings["news_title"] = result.find(class_='content_title').get_text()
    listings["news_description"] = result.find(class_= 'rollover_description_inner').get_text()
    

    image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/'
    browser.visit(image_url)
    time.sleep(3)
    image_html = browser.html
    soup = BeautifulSoup(image_html, 'html.parser')
    carousel = soup.find(class_='carousel_items')
    listings["featured_image_url"] = carousel.find(class_= 'button fancybox')['data-fancybox-href']
    
    weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(weather_url)
    time.sleep(3)
    weather_html = browser.html
    soup = BeautifulSoup(weather_html, 'html.parser')
    content = soup.find(class_='content')
    listings["mars_weather"] = content.find('p', class_ = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text
    
    
  
    mars_facts_url = 'https://space-facts.com/mars/'
    mars_facts = {}
    mars_facts = pd.read_html(mars_facts_url)[1]
    mars_facts_html = mars_facts.to_html()
    listings["mars_facts"] = mars_facts_html
  
    # # Mars Hemispheres
    hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemisphere_url)
    time.sleep(3)
    hemisphere_html = browser.html
    soup = BeautifulSoup(hemisphere_html, 'html.parser')
    image_links = soup.find_all('div', class_='description')
    
    full_urls = []
    full_links = []
    for image in image_links:
        image_url = image.find('a')['href']
        full_url = f'https://astrogeology.usgs.gov{image_url}'
        full_urls.append(full_url)
        browser.visit(full_url)
        time.sleep(3)
        image_html = browser.html
        soup = BeautifulSoup(image_html, 'html.parser')
        full_link = soup.find('img', class_='wide-image')['src']
        full_links.append(f'https://astrogeology.usgs.gov{full_link}')
        
    listings["full_url"] = full_urls
    listings["full_image_link"] = full_links
    print(listings)

    return listings
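# pd.read_html returns one DataFrame per <table> found on the page, which is why the
# code above indexes the result with [1] to pick the second table. A minimal sketch on
# a literal HTML string (the table content here is made up for illustration):
import io
import pandas as pd

html = ("<table><tr><th>property</th><th>value</th></tr>"
        "<tr><td>Diameter</td><td>6,779 km</td></tr></table>")
tables = pd.read_html(io.StringIO(html))   # list of DataFrames
print(tables[0])
print(tables[0].to_html(index=False))      # back to an HTML fragment, like mars_facts_html above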
Пример #40
0
import requests
from bs4 import BeautifulSoup
from csv import writer

source = requests.get("https://www.hackerearth.com/companies/")

soup = BeautifulSoup(source.text, "html.parser")

el = soup.find_all(class_='company-card-container')

#csv writer

with open(
        '/home/chiraghs/my_codes/Web_scraping/Company_names with links/names.csv',
        'w') as f:
    elem = writer(f)
    headers = ['Title', 'Link']
    elem.writerow(headers)

    #get items or data
    for item in el:
        # print(item)
        print("\n")
        title = item.find(class_='name ellipsis').get_text().replace('\n', "")
        link = item['link']
        print(title + " : " + link)
        elem.writerow([title, link])
Пример #41
0
def get_text(url):
    page = urlopen(url)
    soup = BeautifulSoup(page)
    fetched_text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    return fetched_text
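# A self-contained variant of get_text above, since the original relies on urlopen and
# BeautifulSoup being imported elsewhere; the Python 3 urllib import and the explicit
# parser choice are assumptions of this sketch:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_text(url):
    page = urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    return ' '.join(p.get_text() for p in soup.find_all('p'))

# print(get_text("https://example.com/"))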
Пример #42
0
    def run(self):
        if self.UI.username.text() == '':
            self.UI.log.append('لطفا ایمیل خود را وارد کنید')
            return
        if self.UI.password.text() == '':
            self.UI.log.append('لطفا پسورد خود را وارد کنید')
            return

        self.UI.log.append('شروع')

        def dkprice_to_numbers(dkprice):
            '''gets something like ۱۱۷،۰۰۰ تومان and returns 117000'''
            convert_dict = {u'۱': '1', u'۲': '2', u'۳': '3', u'۴': '4', u'۵': '5',
                            u'۶': '6', u'۷': '7', u'۸': '8', u'۹': '9', u'۰': '0', }
            price = u'۰' + dkprice
            for k in convert_dict.keys():
                price = re.sub(k, convert_dict[k], price)

            price = re.sub('[^0-9]', '', price)
            return int(price)

        def extract_data(one_page, all_orders, all_post_prices):
            soup = BeautifulSoup(one_page.text, 'html.parser')
            # there might be more than one table
            for this_table in soup.find_all('div', class_='c-table-order__body'):
                for this_item in this_table.find_all('div', class_='c-table-order__row'):
                    name = this_item.find('span').get_text()
                    dknum = this_item.find(
                        'div', class_='c-table-order__cell--value').get_text()
                    num = dkprice_to_numbers(dknum)
                    dkprice = this_item.find(
                        'div', class_='c-table-order__cell--price-value').get_text()
                    price = dkprice_to_numbers(dkprice)
                    dkdiscount = this_item.find(
                        'div', class_='c-table-order__cell c-table-order__cell--discount').get_text()
                    discount = dkprice_to_numbers(dkdiscount)
                    date = soup.find('h4').span.get_text()
                    date = re.sub(u'ثبت شده در تاریخ ', '', date)
                    all_orders.append((date, name, num, price, discount))

            dkpost_price = soup.find_all('div', class_='c-table-draught__col')[3].get_text()
            post_price = dkprice_to_numbers(dkpost_price)
            all_post_prices.append(post_price)

        self.UI.log.append('تلاش برای ورود')
        url = 'https://www.digikala.com/users/login/'
        payload = {'login[email_phone]': self.UI.username.text(),
                   'login[password]': self.UI.password.text(), 'remember': 1}
        session = requests.session()
        r = session.post(url, data=payload)
        if r.status_code != 200:
            self.UI.log.append('مشکل در اتصال. کد خطا: %s' % r.status_code)
            return 

        successful_login_text = 'سفارش‌های من'
        if re.search(successful_login_text, r.text):
            self.UI.log.append('لاگین موفق')
        else:
            self.UI.log.append('کلمه عبور یا نام کاربری اشتباه است')
            return 

        page_number = 1
        orders = session.get(
            'https://www.digikala.com/profile/orders/?page=%i' % page_number)
        soup = BeautifulSoup(orders.text, 'html.parser')

        all_orders = []  # (list of (date, name, number, item_price))
        all_post_prices = []  # list of post prices

        while not soup.find('div', class_='c-profile-empty'):
            for this_order in soup.find_all('a', class_='btn-order-more'):
                this_order_link = this_order.get('href')
                print('going to fetch: http://digikala.com' + this_order_link)
                one_page = session.get('http://digikala.com' + this_order_link)
                extract_data(one_page, all_orders, all_post_prices)
            self.UI.log.append('بررسی صفحه %i' % page_number)
            page_number += 1
            orders = session.get(
                'https://www.digikala.com/profile/orders/?page=%i' % page_number)
            soup = BeautifulSoup(orders.text, 'html.parser')

        self.UI.log.append('پایان')

        total_price = 0
        total_purchase = 0
        full_purchase_list = ''
        n = 0
        total_post_price = 0
        total_discount = 0
        self.UI.output_general.setRowCount(len(all_orders))

        for date, name, num, price, discount in all_orders:
            this_purchase_str = "تاریخ %s:‌ %s عدد %s, به قیمت هر واحد %s\n" % (
                date, num, name, price)
            full_purchase_list = this_purchase_str + full_purchase_list
            this_product_total_price = (price * num) - discount
            total_price += this_product_total_price
            total_purchase += 1
            total_discount += discount

            self.UI.output_general.setItem(n, 0, QTableWidgetItem(str(date)))
            self.UI.output_general.setItem(n, 1, QTableWidgetItem(str(num)))
            self.UI.output_general.setItem(n, 2, QTableWidgetItem(str(this_product_total_price)))
            self.UI.output_general.setItem(n, 3, QTableWidgetItem(str(discount)))
            self.UI.output_general.setItem(n, 4, QTableWidgetItem(str(name)))
            n = n + 1
        purchase_count = len(all_post_prices)
        for post_price in all_post_prices:
            total_post_price += post_price

        self.UI.output_result.clear()
        price_item = ['کل خرید شما از دیجی کالا:    {} تومان'.format(total_price)]
        total_post_price_item = ['مجموع هزینه ی پست:          {} تومان'.format(total_post_price)]
        total_discount_item = ['مجموع تخفیفات دریافتی:     {} تومان'.format(total_discount)]
        purchase_item = ['تعداد خرید:    {} قطعه'.format(total_purchase)]
        purchase_count_item = ['دفعات خرید:    {} بار'.format(purchase_count)]

        self.UI.output_result.addItems(price_item)
        self.UI.output_result.addItems(total_post_price_item)
        self.UI.output_result.addItems(total_discount_item)
        self.UI.output_result.addItems(purchase_item)
        self.UI.output_result.addItems(purchase_count_item)
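# dkprice_to_numbers above maps Persian digits to ASCII with one re.sub call per digit;
# str.translate can apply the whole mapping in a single pass. A sketch of the same idea,
# separate from the application code above (persian_price_to_int is a hypothetical name):
import re

PERSIAN_DIGITS = str.maketrans('۱۲۳۴۵۶۷۸۹۰', '1234567890')

def persian_price_to_int(text):
    digits = re.sub(r'[^0-9]', '', text.translate(PERSIAN_DIGITS))
    return int(digits) if digits else 0

print(persian_price_to_int('۱۱۷،۰۰۰ تومان'))   # 117000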
Пример #43
0
    #     doc_to_docx(file_path);

    if not os.path.isdir(file_path) and os.path.splitext(file_path)[1] == '.docx' \
       and file[0:2] != '~$' and file[0:2] != '.~':
        filepaths.append(file_path)
    # print(oldnames)

if not os.path.exists(path + 'result/'):
    os.mkdir(path + 'result/')
for filepath in filepaths:

    document = ZipFile(filepath)
    xml = document.read("word/document.xml")
    wordObj = BeautifulSoup(xml.decode("utf-8"))
    # tracked-change markup: insert w:ins, delete w:del, move w:moveFrom / w:moveTo, formatting w:rFonts, comment w:commentRangeEnd
    paras = wordObj.find_all("w:p")

    oldfilename = filepath.split('/')[-1]
    first = oldfilename.split('_')[0]
    firstA = oldfilename.split('_')[0] + '_修改前后对照表.docx'

    # document that the result is written into
    doc = docx.Document()
    # title
    p = doc.add_paragraph()
    p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run('《' + first + '》' + '\n' + '修订前后对照表')
    run.font.name = '宋体'
    run.font._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    run.font.size = docx.shared.Pt(14)
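# A .docx file is a ZIP archive whose main content lives in word/document.xml: paragraphs
# are <w:p> elements and text runs are <w:t> elements. A minimal sketch of pulling the
# plain text out of one ("example.docx" is a hypothetical path), independent of the
# comparison-table logic above:
from zipfile import ZipFile
from bs4 import BeautifulSoup

with ZipFile("example.docx") as document:
    xml = document.read("word/document.xml")

# html.parser keeps the "w:" prefixes as literal tag names, like the parsing above
word_obj = BeautifulSoup(xml.decode("utf-8"), "html.parser")
for para in word_obj.find_all("w:p"):
    text = "".join(t.get_text() for t in para.find_all("w:t"))
    if text:
        print(text)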
Пример #44
0
    minor_list = []

    data = soup.find('div', attrs={'class': 'bodyContent'})
    for a in data.find_all('a', href=True):
        link = (r'http://www.cushmanwakefield.us' + a['href'])
        minor_list.append(link)
    print(minor_list) #list of links to scrape by city

    for item in minor_list:
        browser2 = webdriver.Chrome()
        browser2.get(item)
        HTML2 = browser2.execute_script("return document.body.innerHTML")
        soup2 = BeautifulSoup(HTML2, 'html.parser')

        data = soup2.find('div', attrs={'class': 'm-box highlight lightGrey'})
        for link in soup2.find_all('a', href=True):
            href = link['href']
            if any(href.endswith(x) for x in ['.pdf']):
                print(href)
                file_name = href.split('/')[-1]
                print(file_name)
                remote_file = requests.get(href)
                os.makedirs(os.path.join(baseDir, query), exist_ok=True)
                with open(os.path.join(baseDir,query,file_name), 'wb') as f:
                    for chunk in remote_file.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                print('saved: ' + href)
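# requests.get(href) above buffers the whole response in memory before iter_content runs;
# passing stream=True downloads the file chunk by chunk instead. A sketch with a
# hypothetical helper name, URL and output path:
import requests

def download_file(url, destination, chunk_size=1024):
    with requests.get(url, stream=True) as remote_file:
        remote_file.raise_for_status()
        with open(destination, 'wb') as f:
            for chunk in remote_file.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)

# download_file('https://example.com/report.pdf', 'report.pdf')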

Пример #45
0
__author__ = 'wilsonincs'
"""Display the price and date at closing for the apple stock"""

import urllib2
from bs4 import BeautifulSoup
import pprint as p

url = "http://finance.yahoo.com/q/hp?s=AAPL+Historical+Prices"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read())

stock = soup.find_all('tr')

closing_data = []
for i in stock:
    if len(i.find_all('td', {
            'class': 'yfnc_tabledata1',
            'align': 'right'
    })) == 7:
        date = i.contents[0].get_text()
        close = i.contents[6].get_text()
        closing_data.append((date, close))

p.pprint(closing_data)
Пример #46
0
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.onlinedown.net/hits/windows/2/")
soup = BeautifulSoup(r.text, 'lxml')
# print(soup, type(soup))

print (len(soup.find_all("a", "title")))
for a in soup.find_all("a", "title"):
    print (a.text)

print (len(soup.find_all("span", "size")))
for a in soup.find_all("span", "size"):
    print(a.text)

print (len(soup.find_all("span", "lan")))
for a in soup.find_all("span", "lan"):
    print(a.text)

print (len(soup.find_all("span", "pop")))
for a in soup.find_all("span", "pop"):
    print(a.text)

print (len(soup.find_all("span", "dro")))
for a in soup.find_all("span", "dro"):
    print(a.text)

print (len(soup.find_all("span", "time")))
for a in soup.find_all("span", "time"):
    print(a.text)
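# The repeated find_all/print blocks above can be collapsed into one loop over
# (tag, class) pairs; a self-contained sketch of the same idea:
import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.onlinedown.net/hits/windows/2/")
soup = BeautifulSoup(r.text, 'lxml')

for tag, css_class in [("a", "title"), ("span", "size"), ("span", "lan"),
                       ("span", "pop"), ("span", "dro"), ("span", "time")]:
    matches = soup.find_all(tag, css_class)
    print(len(matches))
    for match in matches:
        print(match.text)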
Пример #47
0
 def run(self) :
     display = Display(visible=0 , size=(1024,768))
     display.start()
     browser = webdriver.Firefox()
     requests.adapters.DEFAULT_RETRIES = 5
     headers = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"}
     is_continue = True
     retries = 0
     page_html = ""
     while is_continue :
         response = None
         try :
             print(self.home_page)
             response = requests.get(self.home_page , timeout=10 , headers=headers)
             page_html = response.text
             if "" != page_html :
                 break
         except Exception as e :
             logger.error(str(e))
             retries += 1
             logger.error("Retry: " + str(retries))
             if 5 < retries :
                 break
             else :
                 time.sleep(5)
                 continue
     soup = BeautifulSoup(str(page_html) , "lxml")
     table_match = soup.find_all(name="tbody")
     table_soup = BeautifulSoup(str(table_match) , "lxml")
     items_match = table_soup.find_all(name="tr")
     proxy_info = []
     print(page_html)
     for item in items_match :
         """
         ip_rex = ">\s*(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*</span>"
         port_rex = ">\s*(\d{1,5})\s*</span>"
         used_time_rex = ">\s*(\d{1,3})天\s*</span>"
         ip_match = re.findall(re.compile(ip_rex) , str(item))
         port_match = re.findall(re.compile(port_rex) , str(item))
         used_time_match = re.findall(re.compile(used_time_rex) , str(item))
         if 1==len(ip_match) and 1==len(port_match) and 1==len(used_time_match) :
             proxy_info.append([ip_match[0] , port_match[0] , used_time_match[0]])
         else :
             logger.error("ip_match: " + str(ip_match))
             logger.error("port_match: " + str(port_match))
             logger.error("used_time_match: " + str(used_time_match))
         """
         ip_rex = ">\s*(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*</td>"
         port_rex = ">\s*(\d{1,5})\s*</td>"
         net_type_rex  = ">\s*([A-Z]{1,10})\s*</td>"
         response_time_rex = ">([0-9]{0,1}\.{0,1}[0-9]{0,1})秒</td>"
         ip_match = re.findall(re.compile(ip_rex) , str(item))
         port_match = re.findall(re.compile(port_rex) , str(item))
         net_type_match = re.findall(re.compile(net_type_rex) , str(item))
         response_time_match = re.findall(re.compile(response_time_rex) , str(item))
         print(ip_match)
         print(port_match)
         print(net_type_match)
         print(response_time_match)
     sys.exit(1)
     proxy_info = sorted(proxy_info , key=lambda info:int(info[2]) , reverse=True)
     check_target_url = [
             "https://www.baidu.com/" , 
             "http://www.jd.com/" , 
             "https://www.zhihu.com/" , 
             "http://www.bilibili.com/" , 
             "https://www.taobao.com/" , 
             ]
     urls_match_proxies = {}
     for url in check_target_url :
         url = "http://www.jd.com/"
         url_matches_proxies = []
         for pinfo in proxy_info :
             net_types = self.checkProxy(pinfo[0] , pinfo[1] , url)
             if 1 == len(net_types) :
                 url_matches_proxies.append([net_types[0] , pinfo[0] , pinfo[1]])
             elif 2 == len(net_types) :
                 url_matches_proxies.append([net_types[0] , net_types[1] , pinfo[0] , pinfo[1]])
             else :
                 logger.error("No proxies matched.")
                 continue
         urls_match_proxies[url] = url_matches_proxies
     for url,proxies in urls_match_proxies.items() :
         print(url)
         for p in proxies :
             print("\t\t" + str(p))
Пример #48
0
# https://www.sample.net/business/finance/invoice/
# https://www.freshbooks.com/invoice-templates/pdf

# Common URL for reaching the manual PDFs ("manual/xxx"); the xxx part determines which PDF is fetched
download_urls = []
BASE_URL = "https://sega.jp/mdmini/manual/"

# Create "tmp_folder" if it does not already exist
if os.path.exists("tmp_folder") == False:
    os.mkdir("tmp_folder")

# Fetch the page (HTML/XML) from the server
html = requests.get("https://sega.jp/mdmini/manual/index.html")

soup = BeautifulSoup(html.text, "lxml")  # collect every "a" tag into the links list
links = soup.find_all("a")

for link in links:
    h_ref = link.get("href")

    if h_ref and ".pdf" in h_ref:
        download_urls.append(h_ref)

for download_url in download_urls:

    file_name = download_url.split("/")[-1]
    r = requests.get(BASE_URL + download_url)
    time.sleep(1)

    if r.status_code == 200:
        with open(os.path.join("tmp_folder", file_name), "wb") as f:
            f.write(r.content)
Пример #49
0
class Scrawler(object):
    """docstring for  Scrawler"""
    def __init__(
        self,
        cookies=""
    ):
        self.session = requests.session()
        # self.session.keep_alive = False
        self.headers = {
            'Host': 'www.tianyancha.com',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'https://www.tianyancha.com/',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'cookie': cookies,
        }
        self.soup = None
        self.url_id = None
        self.proxies = None
        self.set_proxy()


    scrawler_num = 0


    @classmethod
    def build_scrawler(cls, cookies):
        s = cls(cookies[cls.scrawler_num])
        cls.scrawler_num += 1
        return s


    def set_proxy(self):
        url = "http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=3873239366fb4548a227fcbf310862ba&count=1&expiryDate=0&format=1&newLine=2"
        resp = self.session.get(url)
        if resp.status_code == 200:
            resp_json = json.loads(resp.text)
            proxy_ip = resp_json['msg'][0]['ip']
            proxy_port = resp_json['msg'][0]['port']
            proxy_meta = "%(ip)s:%(port)s" % {
                "ip" : proxy_ip,
                "port" : proxy_port,
            }
            proxies = {
                "http"  : "http://" + proxy_meta,
                "https" : "https://" + proxy_meta,
            }
            self.proxies = proxies
            print("Connect: Set Proxy %s" % proxy_meta)
            print()
        else:
            raise Exception


    # def set_cookie(self):
    #     self.headers['cookie'] = self.cookies[Scrawler.scrawler_num]
    #     Scrawler.scrawler_num += 1
    #     if Scrawler.scrawler_num >= len(self.cookies):
    #         Scrawler.scrawler_num = 0

    def set_cookie(self, cookie):
        self.headers['cookie'] = cookie


    def get_current_ip(self):
        url = "https://httpbin.org/ip"
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        }
        resp = requests.get(url, headers=headers, proxies=self.proxies, timeout=10)
        print("Connect: Current IP %s" % resp.text.strip())
        return resp


    def req_get(self, url):
        resp = requests.get(url, headers=self.headers, proxies=self.proxies, timeout=10)
        return resp


    def parse_urls(self, page_no):
        if page_no == 1:
            url = "https://www.tianyancha.com/search?key=%E7%A7%91%E6%8A%80%E9%87%91%E8%9E%8D"
        else:
            url = "https://www.tianyancha.com/search/p" + str(page_no) + "?key=%E7%A7%91%E6%8A%80%E9%87%91%E8%9E%8D"

        resp = self.session.get(url)

        urls_soup = BeautifulSoup(resp.content, "html5lib")
        urls = []
        url_list = urls_soup.find_all('div', attrs={'class': 'search_result_single'})

        for li in url_list:
            url = li.find('a')['href']
            urls.append(url)

        return urls


    def parse_url_content(self, url, url_id):
        print('Connect: GET %s' % url)
        resp = self.req_get(url)

        print('Parsing: Company Info')
        print()
        self.soup = BeautifulSoup(resp.content, "html5lib")
        self.url_id = url_id


    def parse_company_info(self):
        data = [['url_id', 'company_name', 'company_address', 'company_intro', 'company_status']]
        header = self.soup.find('div', attrs={'class': 'company_header_width'})
        company_name = header.h1.get_text()
        company_info = header.find_all('div', attrs={'class': ['f14', 'sec-c2']})
        company_address = company_info[1].contents[1].contents[1].get_text()
        company_intro = header.find('script', attrs={'id': 'company_base_info_detail'})
        if company_intro:
            company_intro = company_intro.get_text().strip()
        else:
            company_intro = '暂无信息'

        company_info2 = self.soup.find_all('div', attrs={'class': 'baseinfo-module-content-value'})
        if company_info2:
            company_status = company_info2[2].get_text()
        else:
            company_status = '暂无信息'

        data.append([self.url_id, company_name, company_address, company_intro, company_status])

        return data


    def parse_corporate_info(self):
        data = [['url_id', 'corporate_name', 'company_role', 'company_name', 'company_province', 'company_date', 'company_capital', 'company_status']]
        corporate_info = self.soup.find('div', attrs={'class': 'human-top'})

        if corporate_info and ('human' in corporate_info.a['href']):

            corporate_info = corporate_info.a
            corporate_name = corporate_info.get_text()
            corporate_link = corporate_info['href']
        
            print('Connect: GET %s' % corporate_link)
            resp = self.req_get(corporate_link)

            print('Parsing: Corporate Info')
            print()
            corporate_soup = BeautifulSoup(resp.content, "html5lib")
            companies = corporate_soup.find('div', attrs={'id': '_container_syjs'}).table.tbody.find_all('tr')

            for i in range(0,len(companies)):
                if companies[i].contents[0].contents[0].get_text():
                    company_role = companies[i].contents[0].contents[0].get_text()
                    company_name = companies[i].contents[1].contents[1].get_text()
                    company_province = companies[i].contents[2].get_text()
                    company_date = companies[i].contents[3].get_text()
                    company_capital = companies[i].contents[4].get_text()
                    company_status = companies[i].contents[5].get_text()
                else:
                    company_name = companies[i].contents[0].contents[1].get_text()
                    company_province = companies[i].contents[1].get_text()
                    company_date = companies[i].contents[2].get_text()
                    company_capital = companies[i].contents[3].get_text()
                    company_status = companies[i].contents[4].get_text()

                data.append([self.url_id, corporate_name, company_role, company_name, company_province, company_date, company_capital, company_status])
        
        else:
            data.append([self.url_id, '-', '-', '-', '-', '-', '-', '-'])

        return data


    def parse_finacing_info(self):   
        data = [['url_id', 'company_name', 'finacing_time', 'turn', 'appraisement', 'capital', 'propertion', 'invenstors']]
        header = self.soup.find('div', attrs={'class': 'company_header_width'})
        company_name = header.h1.get_text()
        finacing_link = header.contents[2]

        if finacing_link.contents:
            finacing_link = finacing_link.a['href']

            print('Connect: GET %s' % finacing_link)
            resp = self.req_get(finacing_link)

            print('Parsing: Finacing Info')
            print()
            finacing_soup = BeautifulSoup(resp.content, "html5lib")
            finacing_info = finacing_soup.find('div', attrs={'id': '_container_rongzi'})
            if finacing_info:
                finacing_table = finacing_info.tbody.contents

                for tr in finacing_table:
                    finacing_time = tr.contents[1].get_text()
                    turn = tr.contents[2].get_text()
                    appraisement = tr.contents[3].get_text()
                    capital = tr.contents[4].get_text()
                    propertion = tr.contents[5].get_text()
                    invenstors = tr.contents[6].get_text()
                    data.append([self.url_id, company_name, finacing_time, turn, appraisement, capital, propertion, invenstors])

            else:
                data.append([self.url_id, company_name, '-', '-', '-', '-', '-', '-'])   
        else:
            data.append([self.url_id, company_name, '-', '-', '-', '-', '-', '-'])

        return data
Пример #50
0
from prettytable import PrettyTable
def get_one_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None

#def main():
list_file = open("movielist",'w')
table=PrettyTable(["排行","名称","主演","上映时间","评分"])
for num in range(0,5):
	url = 'http://maoyan.com/board/4'+'?offset='+str(num*10)
	html = get_one_page(url)
#print(html)
	soup =BeautifulSoup(html,'lxml')
	items=soup.find_all('dd')
	i=0
	for item in items:
		html_page=soup.find_all('dd')[i]
		index=html_page.find('i',class_='board-index').get_text()
		name=html_page.find('p',class_='name').a['title']
		star=html_page.find('p',class_='star').get_text().strip().strip('主演:')
		releasetime=html_page.find('p',class_='releasetime').get_text().strip().strip('上映时间:')
		score=html_page.find('i',class_='integer').get_text()+html_page.find('i',class_='fraction').get_text()
		list_=str(index)+str(name)+str(star)+str(releasetime)+str(score)+'\n'
		table.add_row([index,name,star,releasetime,score])
		#print(index,name,star,releasetime,score)
		i=i+1	
print(table)
list_file.write(str(table))
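# items = soup.find_all('dd') above already yields each <dd> block, so the extra
# soup.find_all('dd')[i] lookup inside the loop is redundant; enumerate can carry the
# counter instead. A self-contained sketch of the pattern on a made-up snippet of HTML:
from bs4 import BeautifulSoup

html = ("<dl><dd><p class='name'><a title='Movie A'>Movie A</a></p></dd>"
        "<dd><p class='name'><a title='Movie B'>Movie B</a></p></dd></dl>")
soup = BeautifulSoup(html, "html.parser")

for position, item in enumerate(soup.find_all("dd"), start=1):
    print(position, item.find("p", class_="name").a["title"])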
Пример #51
0
 datalist.clear()
 listidx = 1
 while IsWhileGo:
     btns = driver.find_elements_by_class_name('summaryBtn')
     for btn in btns:
         while True:
             try:
                 time.sleep(0.5)
                 btn.click()
                 break
             except:
                 time.sleep(0.5)
     time.sleep(2)
     bs4 = BeautifulSoup(driver.page_source, 'lxml')
     lis = bs4.find('ul', id='jobNormal').find_all('li')
     smbs = bs4.find_all('div', class_='summaryView')
     summaryidx = 0
     alist = []
     alist.clear()
     titlelist = []
     titlelist.clear()
     for li in lis:
         title = li.find('span', class_='company').get_text().strip()
         href = baseurl + li.find('a')['href']
         dateday = li.find(
             'span', class_='regDate').get_text().split(':')[1].strip()
         if title not in titlelist:
             if dateday == '':
                 dateday = nowdate
             if endDate >= dateday and dateday >= standDate:
                 try:
Пример #52
0
def forum_topic_fetch(user_agent):
    """
      This function will fetch the categories on the Naturally Curly forum page

      Keyword arguments:
      user_agent -- Using a different user agent than the default python one keeps the user from being kicked out by the website
    """
    #Make a get request to retrieve the page
    html_page = requests.get('https://curltalk.naturallycurly.com/', headers = {'User-Agent': user_agent} )
    soup = BeautifulSoup(html_page.content, 'html.parser')
    link_list_forum_top = []
    for link in soup.find_all('a'):
        link_list_forum_top.append(link.get('href'))
    categories = [s for s in link_list_forum_top if (("categories" in s) and ('https' in s))]
    return categories


def signature_fetch(categories, index_of_category_list, user_agent, start_range=0, finish_range=100):
    """
      This function will fetch the signatures from a forum topic

      Keyword arguments:
      categories -- the list of category urls with no page number on them
      index_of_category_list -- which url in the list is desired to be scraped
      user_agent -- using a different user agent than the default python one keeps the user from being kicked out by the website
      start_range -- defaults to start at page zero, but can be set to a different page number as it might need to be run a few times in approximately 100 page increments
      finish_range -- default number of pages to stop scraping at, but this might need to be adjusted for a very large or small number of pages on each topic url
    """
    # Get a list of specific discussion urls
    link_listdiscussion = []
    for i in range(start_range, finish_range):
        url = f'{categories[index_of_category_list]}/p{i}'
        html_page = requests.get(url, headers = {'User-Agent': user_agent} )

        # Check status code
        status = html_page.status_code
        if status != 200:
            print(f'Error improper response code. Code is {status}')

        # Pass the page contents to beautiful soup for parsing
        soup = BeautifulSoup(html_page.content, 'html.parser')
        # Create a list of discussions on each forum topic page
        for link in soup.find_all('a'):
            link_listdiscussion.append(link.get('href'))
            topics = [s for s in link_listdiscussion if (("/discussion" in s) and ('https' in s))]

    # Lets the user see the function is working; it should take a few minutes to get a result here depending on the range of pages looped through
    print(len(topics))

    # Loop through all the topics found for each category
    list_for_mongo = []
    count = 1
    for topic in topics:
        url2 = topic
        html_page2 = requests.get(url2, headers = {'User-Agent': user_agent} )

        # Check status code
        status = html_page2.status_code
        if status != 200:
            print(f'Error improper response code. Code is {status}')

        soup2 = BeautifulSoup(html_page2.content, 'html.parser')
        signatures = soup2.find_all('div', class_="Signature UserSignature userContent")

        for i in range(0,len(signatures)):
            sig = {}
            sig['signature'] = soup2.find_all('div', class_="Signature UserSignature userContent")[i].get_text()
            list_for_mongo.append(sig)
        count += 1

    return list_for_mongo
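# Both functions above print a warning on a non-200 status code but keep parsing the
# page anyway; response.raise_for_status() is the usual way to stop early instead.
# A sketch with a hypothetical helper name (fetch_soup):
import requests
from bs4 import BeautifulSoup

def fetch_soup(url, user_agent="Mozilla/5.0"):
    response = requests.get(url, headers={"User-Agent": user_agent})
    response.raise_for_status()          # raises requests.HTTPError on 4xx/5xx
    return BeautifulSoup(response.content, "html.parser")

# soup = fetch_soup("https://curltalk.naturallycurly.com/")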
Пример #53
0
def ScrapTweets(user):
    print ('[+]' + G + ' Fetching Data {} From Twitter...'.format(user) + '\n'+W)
    link = "https://twitter.com/" + user
    r=requests.get(link)
    if r.status_code == 200:
        the_client = uReq(link)
        page_html = the_client.read()
        the_client.close()
        f=open('wordlist.txt','r')
        wordlist=[]
        predator=[]
        for i in f:
            for j in i.split():
                wordlist.append(j)
        f.close()
        soup = BeautifulSoup(page_html, 'html.parser')
        try:
            os.mkdir(user)
        except:
            pass
        f1=open("./{0}/{1}.txt".format(user,user),"w+")

        try:
            full_name = soup.find('a', attrs={"class": "ProfileHeaderCard-nameLink u-textInheritColor js-nav"})
            f1.write("\nUser Name : " + str(full_name.text))
        except:
            f1.write("\nUser Name -->"+ R +" Not Found")

        try:
            user_id = soup.find('b', attrs={"class": "u-linkComplex-target"})
            f1.write("\nUser Id : "+str(user_id.text))
        except:
            f1.write("\nUser Id : "+"Not Found")

        try:
            description = soup.find('p', attrs={"class": "ProfileHeaderCard-bio u-dir"})
            f1.write("\nDescription : "+str(description.text))
        except:
            f1.write("\nDescription not provided by the user")

        try:
            user_location = soup.find('span', attrs={"class": "ProfileHeaderCard-locationText u-dir"})
            f1.write("\nLocation :  " +  str(user_location.text.strip()))
        except:
            f1.write("\nLocation not provided by the user")

        try:
            connectivity = soup.find('span', attrs={"class": "ProfileHeaderCard-urlText u-dir"})
            title = connectivity.a["title"]
            f1.write("\nLink provided by the user : " + str(title))
        except:
            f1.write("\nNo contact link is provided by the user")

        try:
            join_date = soup.find('span', attrs={"class": "ProfileHeaderCard-joinDateText js-tooltip u-dir"})
            f1.write("\nThe user joined twitter on : " + str(join_date.text))
        except:
            f1.write("\nThe joined date is not provided by the user")

        try:
            birth = soup.find('span', attrs={"class": "ProfileHeaderCard-birthdateText u-dir"})
            birth_date = birth.span.text
            f1.write("\nDate of Birth:"+str(birth_date.strip()))
        except:
            f1.write("\nBirth Date not provided by the user")

        try:
            span_box = soup.findAll('span', attrs={"class": "ProfileNav-value"})
            f1.write("\nTotal tweets : " + span_box[0].text)
        except:
            f1.write("\nTotal Tweets : Zero")

        try:
            f1.write("\nFollowing : " +span_box[1].text)
        except:
            f1.write("\nFollowing : Zero")

        try:
            f1.write("\nFollowers : " + span_box[2].text)
        except:
            f1.write("\nFollowers : Zero")

        try:
            f1.write("\nLikes sent by him : " + span_box[3].text)
        except:
            f1.write("\nLikes sent by him : Zero")

        try:
            if span_box[4].text != "More ":
                f1.write("\nNo. of parties he is Subscribed to : " + span_box[4].text)
            else:
                f1.write("\nNo. of parties he is Subscribed to : Zero")
        except:
            f1.write("\nNo. of parties he is Subscribed to : Zero")
        f1.write(W)

        spana = soup.findAll('span', attrs={"class": "ProfileNav-value"})

        f1.write("\nTweets by "+ str(user) + " are : ")

        for tweets in soup.findAll('p', attrs={"class": "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"}):
            tweets=bs2json().convert(tweets)
            try:
                f1.write(tweets['p']['text'])
                f1.write("\n")
            except:
                pass
            '''
            for i in range(len(tweets['p']['a'])-1,len(tweets['p']['a'])):
                try:
                    #print("http://"+str(tweets['p']['a'][i]['text']))
                    response = requests.get("http://"+tweets['p']['a'][i]['text'])
                    soup = BeautifulSoup(response.text, 'html.parser')
                    img_tags = soup.find_all('img')
                    s=str(img_tags).split()
                    print(s)
                    quit()
                except:
                    pass
            continue
            '''
            for i in range(0,len(tweets['p']['a'])):
                try:
                    f1.write(str(tweets['p']['a'][i]['s']['text'])+str(tweets['p']['a'][i]['b']['text']))
                    f1.write("\n")
                    if str(tweets['p']['a'][i]['b']['text']) in wordlist:
                        predator.append(user)
                        print(R+"{} May be a Predator".format(user)+W)
                except KeyError as e:
                    try:
                        if str(tweets['p']['a'][i]['text']).split()!=[]:
                            f1.write(tweets['p']['a'][i]['text'])
                            f1.write("\n")
                            response = requests.get("http://"+tweets['p']['a'][i]['text'])
                            soup = BeautifulSoup(response.text, 'html.parser')
                            img_tags = soup.find_all('img')
                            s=str(img_tags).split()
                            media=[]
                            for i in s:
                                if "/media/" in i:
                                    media.append(i)
                            regex=r'https?:\/\/.*\.(?:png|jpg)'
                            for i in media:
                                matches=re.findall(regex,i)[0]
                                urllib.request.urlretrieve(matches,str(user)+"/"+str(matches[-19:]))
                        else:
                            pass
                    except KeyError as e:
                        pass
                else:
                    pass
            f1.write("\n")
        f1.close()
        print("Fetched Details are Saved at "+"./{0}/{1}.txt".format(user,user))
        if len(predator)>0:
            print(R+"\nPredator Identity Details:\n")
            for i in predator:
                ScrapTweets(i)
                arr=os.listdir("./{}".format(str(i)))
                for j in arr:
                    if re.match(r".+\.jpg",j):
                        if imageai("./{}".format(str(i))+"/"+j) == True:
                            print(R+"{} Is a Predator".format(str(i))+W)
            print("Fetched Details are Saved at "+"./{0}/{1}.txt".format(str(i),str(i)))
            for i in predator:
                print("./{0}/{1}.txt".format(i,i))
                f=open("./{0}/{1}.txt".format(i,i),'r')
                message=f.read()
                f.close()

                #AutoMail Generated
                mail(message)
        else:
            print(R+"\nUser Profile Details:\n"+W)
            print("Fetched Details are Saved at "+"./{0}/{1}.txt".format(user,user))
            f=open("./{0}/{1}.txt".format(user,user),'r')
            f.seek(0)
            message=f.read()
            f.close()

            #AutoMail Generated
            mail(message)
    elif r.status_code == 404:
        print(R+"Error: Profile Not Found")
        exit()
    else:
        print(R+"Error: Something Went Wrong")
        exit()
Пример #54
0
import bs4, requests
from bs4 import BeautifulSoup

r = requests.get(
    "http://pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/")
c = r.content
soup = BeautifulSoup(c, "html.parser")
#print(soup.prettify())
#cached version URL -> http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s=10.html
all = soup.find_all("div", {"class": "propertyRow"})
x = all[0].find("h4", {"class", "propPrice"}).text.replace("\n", "")
page_nr = soup.find_all("a", {"class": "Page"})[-1].text
print(page_nr)
l = []
for page in range(0, int(page_nr) * 10, 10):
    base_url = "http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
    print(base_url)
    r = requests.get(base_url + str(page) + ".html?v=0")
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    all = soup.find_all("div", {"class": "propertyRow"})

    for item in all:
        d = {}
        d["Address"] = item.find_all("span",
                                     {"class", "propAddressCollapse"})[0].text
        d["Price"] = item.find("h4", {
            "class": "propPrice"
        }).text.replace("\n", "")  #oterates thru all the prices
        d["Locality"] = item.find_all("span",
                                      {"class", "propAddressCollapse"})[1].text
Пример #55
0
from bs4 import BeautifulSoup
import requests 

response = requests.get("https://www.finda.co.kr/savings/p2p-investments")
soup = BeautifulSoup(response.text, "html.parser")

rates = soup.find_all('li', 'h3', 'span')

for rate in rates:
	print(rate.get_text())
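# In find_all('li', 'h3', 'span') above only 'li' is treated as a tag name: the second
# positional argument is the attribute/class filter and the third is the recursive flag.
# To match several tag names, pass them as a list; a self-contained sketch:
from bs4 import BeautifulSoup

html = "<ul><li>2.4%</li></ul><h3>P2P</h3><span>12 months</span>"
soup = BeautifulSoup(html, "html.parser")

for element in soup.find_all(["li", "h3", "span"]):
    print(element.get_text())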
Пример #56
0
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('alice.html'), 'html.parser')

print soup.title
print soup.title.string
print soup.p
print soup.p['class']
print soup.find_all('a')
print soup.find(id='link3')
Пример #57
0
answerLocation = os.path.join(fileDir, '../dataset/answers.txt')

temp = {}

with open(questionLocation, 'r') as input, open('questions-cleaned.txt', 'w') as output:
    lines = input.readlines()
    numberOfQuestions = len(lines)
    counter = 0

    for line in lines:
        counter += 1
        temp = eval(line)
        soup = BeautifulSoup(temp['question'], 'html.parser')

        # Remove any code sections
        codeSents = soup.find_all('pre')
        for codeSent in codeSents:
            codeSent.extract()

        # soup.text returns clean text, free from html tags
        # Cleanup trailing spaces and newline characters
        cleanText = soup.text.encode('ascii', 'ignore')
        cleanText = re.sub(r'(\n)+', ' ', cleanText).rstrip()

        temp.update({'question': cleanText.encode('ascii', 'ignore')})
        output.write(str(temp))

        if counter != numberOfQuestions:
            output.write('\n')

answerDicts = []
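# The cleanup above strips <pre> code sections with extract() before reading soup.text;
# decompose() does the same and also discards the removed subtree. A self-contained
# sketch of the idea on a made-up question body:
from bs4 import BeautifulSoup

html = "<p>How do I parse HTML?</p><pre>soup = BeautifulSoup(html)</pre><p>Thanks!</p>"
soup = BeautifulSoup(html, "html.parser")

for code_block in soup.find_all("pre"):
    code_block.decompose()

print(soup.get_text(" ", strip=True))   # How do I parse HTML? Thanks!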
Пример #58
0
def main(blog_id, dl_path, fast=False):
	global REFERER
	REFERER = REFERER.format(ISRABLOG_HOSTNAME, blog_id)

	logging.info("WORKING_DIR is %s", WORKING_DIR)
	print("Starting download of blog {} to destination {} (fast={}).".format(blog_id, dl_path, fast))
	if not os.path.exists(dl_path):
		os.makedirs(dl_path)

	logging.info("Copying %s to %s", INJECT_DIR, dl_path)
	copy_tree(INJECT_DIR, dl_path)

	post_ids = []

	# Main Page
	print("Downloading main page...")
	raw = dl_file(get_url(blog_id=blog_id), get_local_path(dl_path=dl_path))
	main_soup = BeautifulSoup(raw, 'html.parser')
	main_soup = dl_and_replace_external_resources(main_soup, dl_path, fast=fast)

	# sidebar
	print("Downloading sidebar page(s)...")
	for tag in main_soup.find_all('iframe', src=blog_readlist_regex):
		m = re.match(blog_readlist_regex, tag['src'])

		raw = dl_file(get_url(blog_id=blog_id, intent='sidebar', ListColumns=m.group(1), SideGroup=m.group(2)), get_local_path(intent='sidebar', dl_path=dl_path, sidebar_cols=m.group(1), sidebar_group=m.group(2)))
		raw = raw.replace('</body>', '<script type="text/javascript" src="iframeResizer.contentWindow.min.js"></script></body>')  # adding iframeResizer
		soup = BeautifulSoup(raw, 'html.parser')
		dl_and_replace_external_resources(soup, dl_path, fast=fast)
		replace_internal_resources(soup, saveTo=get_local_path(intent='sidebar', dl_path=dl_path, sidebar_cols=m.group(1), sidebar_group=m.group(2)))

	# Board List
	print("Downloading board list...")
	pagenum = 1
	while True:
		raw = dl_file(get_url(blog_id=blog_id, intent='board_list', page=pagenum), get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
		soup = BeautifulSoup(raw, 'html.parser')
		if soup.find('a', href='?blog={}&page={}'.format(blog_id, pagenum + 1)):
			replace_internal_resources(soup, saveTo=get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
			pagenum += 1
		else:
			replace_internal_resources(soup, saveTo=get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
			pagenum += 1
			raw = dl_file(get_url(blog_id=blog_id, intent='board_list', page=pagenum), get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
			soup = BeautifulSoup(raw, 'html.parser')
			replace_internal_resources(soup, saveTo=get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
			break

	# Archive Dates
	archive_dates = [x.get('value') for x in main_soup.find('select', id="PeriodsForUser").find_all('option')]
	archive_dates.sort(key=lambda x: x.split('/')[1] + "{:02d}".format(int(x.split('/')[0])), reverse=True)
	print("Downloading archive pages...")
	if ENABLE_PROGRESSBAR:
		bar = progressbar.ProgressBar(max_value=len(archive_dates)).start()
	for i, date in enumerate(archive_dates):
		pagenum = 1
		pages_count = 1
		next_month = archive_dates[i - 1] if i > 0 else None
		previous_month = archive_dates[i + 1] if i < len(archive_dates) - 1 else None

		while pagenum <= pages_count:
			month, year = date.split('/')
			raw = dl_file(get_url(blog_id=blog_id, month=month, year=year, pagenum=pagenum), get_local_path(year=year, month=month, pagenum=pagenum, dl_path=dl_path))

			soup = BeautifulSoup(raw, 'html.parser')
			for tag in soup.find_all('a', href=re.compile('javascript:showCommentsHere')):
				post_ids.append(tag['href'].split('(')[1].split(',')[0])

			t = soup.find('script', text=re.compile('navigateCount'))
			pages_count = int(t.text.strip().split('=')[1].strip(';')) if t else 1
			logging.info("Pages count for {}/{}: {}".format(year, month, pages_count))

			dl_and_replace_external_resources(soup, dl_path, fast=fast)
			replace_internal_resources(soup, next_month=next_month, previous_month=previous_month, saveTo=get_local_path(year=year, month=month, pagenum=pagenum, dl_path=dl_path))

			pagenum += 1

		if ENABLE_PROGRESSBAR:
			bar.update(i)
	if ENABLE_PROGRESSBAR:
		bar.finish()

	# Save Main Page
	replace_internal_resources(main_soup, previous_month=archive_dates[1] if len(archive_dates) > 1 else None, saveTo=get_local_path(dl_path=dl_path))

	# Posts
	print("Downloading posts...")
	if ENABLE_PROGRESSBAR:
		bar = progressbar.ProgressBar(max_value=len(post_ids)).start()
	for i, postid in enumerate(post_ids):
		raw = dl_file(get_url(blog_id=blog_id, intent='posts', postid=postid), get_local_path(intent='posts', postid=postid, dl_path=dl_path))
		soup = BeautifulSoup(raw, 'html.parser')
		dl_and_replace_external_resources(soup, dl_path, fast=fast)
		replace_internal_resources(soup, saveTo=get_local_path(intent='posts', postid=postid, dl_path=dl_path))

		if ENABLE_PROGRESSBAR:
			bar.update(i)
	if ENABLE_PROGRESSBAR:
		bar.finish()

	# Comments
	print("Downloading comment pages...")
	if ENABLE_PROGRESSBAR:
		bar = progressbar.ProgressBar(max_value=len(post_ids)).start()
	for i, postid in enumerate(post_ids):
		pagenum = 1

		raw = dl_file(get_url(blog_id=blog_id, intent='comments', postid=postid), get_local_path(intent='comments', postid=postid, dl_path=dl_path))
		soup = BeautifulSoup(raw, 'html.parser')
		t = soup.find('table', id="Table3")
		pages_count = int(t.td.text.strip().split(' ')[-2]) if t else 1
		logging.info("Comment Pages count for {}: {}".format(postid, pages_count))
		dl_and_replace_external_resources(soup, dl_path, fast=fast)
		replace_internal_resources(soup, saveTo=get_local_path(intent='comments', postid=postid, dl_path=dl_path))

		pagenum += 1
		while pagenum <= pages_count:
			raw = dl_file(get_url(blog_id=blog_id, intent='comments', postid=postid, posnew=pagenum), get_local_path(intent='comments', postid=postid, pagenum=pagenum, dl_path=dl_path))
			soup = BeautifulSoup(raw, 'html.parser')
			dl_and_replace_external_resources(soup, dl_path, fast=fast)
			replace_internal_resources(soup, saveTo=get_local_path(intent='comments', postid=postid, pagenum=pagenum, dl_path=dl_path))
			pagenum += 1

		if ENABLE_PROGRESSBAR:
			bar.update(i)
	if ENABLE_PROGRESSBAR:
		bar.finish()

	print(colorama.Fore.GREEN + "Done!" + colorama.Style.RESET_ALL)
Пример #59
0
#
# urllib2.urlopen(url, "", 100000)
#
# re.findall("")

html_doc = """ <div class="J-next-auto hide next-auto"><em>3</em> 秒后播放下一节</div>
                            <div class="J-next-btn hide next-auto btn btn-green">下一节</div>
                            <a href="/video/10687/0" class="review-course">重新观看</a>
                            
                            <div id="js-ques-box"></div>                        </div>

                                    </div>
"""
soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
print  '获取链接'
links = soup.find_all('a')
for link in links:
    print link.name, link['href'], link.get_text()

print '获取指定url'
link_node = soup.find('a', href='/video/10687/0')
print link_node.name, link_node['href'], link_node.get_text()

print '正则匹配'
link_node = soup.find('a', href=re.compile(r'video'))
print link_node.name, link_node['href'], link_node.get_text()

print '获取div'
link_node = soup.find('div', class_='J-next-auto hide next-auto')
print link_node.name, link_node.get_text()
Пример #60
0
def download(id_all):
    if not os.path.exists('aicai'):
        os.mkdir('aicai')
    it=iter(range(1,15))
    for id  in id_all:
        result = requests.get(url_download+id, headers=header,stream=True)
        #print(result.headers)
        filename=str(next(it))+'.xls'
        with open('./aicai/'+filename,'wb+') as f:
            f.write(result.content)
url_collect= 'https://live.aicai.com/pages/bf/sfc.shtml'
url_download='https://live.aicai.com/bf/bfindex!export.htm?matchBFId='

result=requests.get(url_collect,headers=header)
soup=BeautifulSoup(result.text,'html.parser')
all_url=soup.find_all('div',{'class':'bf_ta_tit'})
all_url2=[]
for ii in all_url:
    all_url2.append(ii.find('a')['value'])
all_url=all_url2
id_all=[]
#print(all_url)
for x in all_url:
    x=str(x)
    # a,b=x.split('=')[2:4]
    # a=a.split('&')[0]
    # b=b.split('&')[0]
    id_all.append(x[39:47])

download(id_all)
print('ok!')