def show_options(id):
    # Fetch the poll identified by `id` from the interaktiv.mx.dk toolbox and
    # print its question text plus the selectable answer options.
    # Relies on module globals: requests, BeautifulSoup, votetype,
    # clear_console, print_logo.
    r = requests.get("https://interaktiv.mx.dk/toolbox/" + votetype + "/get/" + id)
    soup2 = BeautifulSoup(r.text, "lxml")
    clear_console()
    print_logo()
    print "(Interaktiv version. Kør scriptet med -h eller --help for flere indstillinger.)"
    print
    vote_text = soup2.find("div", attrs={"id": "vote_text"}).text
    print vote_text
    print
    if votetype == "advancedvotes":
        # advanced polls carry their option number in the data-vote attribute
        for option in soup2.find_all("div", attrs={"class": "vote_button"}):
            number = option.get("data-vote")
            text = option.text
            print "(%s) %s" % (number, text)
        print
    else:
        # simple yes/no polls: the "vote_yes" button maps to 1, any other to 0
        for option in soup2.find_all("div", attrs={"class": "vote_button"}):
            if option.get("id") == "vote_yes":
                number = "1"
            else:
                number = "0"
            text = option.text
            print "(%s) %s" % (number, text)
        print
def replace_links_with_text(html):
    """Replace absolute <a> and <img> tags with plain-text renderings.

    Tags whose URL is absolute (or empty) are substituted with the text
    produced by ``format_url_replacement``; anchors with no visible text
    are removed entirely. Returns the body contents as text.
    """
    soup = BeautifulSoup(html, 'html5lib')
    absolute = r'^http(s)?://'

    for image in soup.find_all('img'):
        src = image.get('src', '')
        alt = image.get('alt', '')
        if src == '' or re.match(absolute, src):
            image.replaceWith(format_url_replacement(src, alt))

    for anchor in soup.find_all('a'):
        href = anchor.get('href', '')
        label = ''.join(anchor.text) or ''
        if label == '':
            # empty labels arise from url inlining in comments; drop the tag
            anchor.replaceWith('')
        elif href == '' or re.match(absolute, href):
            anchor.replaceWith(format_url_replacement(href, label))

    return force_text(soup.find('body').renderContents(), 'utf-8')
def write_into_text_nw(url):
    # Follow the 'sxgzjl/2015' hyperlinks found on `url`, and for each page
    # write its bold title plus all paragraph text into a <title>.txt file.
    # Relies on module globals: get_hyper_links, NWORIGINURL, get_page_source,
    # BeautifulSoup, codecs.
    order_links = get_hyper_links(url, 'sxgzjl/2015')
    for link in order_links:
        link = NWORIGINURL + link
        data = get_page_source(link)
        # data is (status, body); only process successful fetches
        if data[0] == "200":
            soup = BeautifulSoup(data[1])
            try:
                for page_title in soup.find_all("b"):
                    if page_title.string is not None:
                        # file name comes from the bold title, CRLFs stripped
                        pre_title = page_title.string + '.txt'
                        pre_title = pre_title.replace("\r\n", "")
                        print pre_title
                        html_file = codecs.open(pre_title, 'wb', 'utf-8')
                        html_file.write(unicode(page_title.string))
                        html_file.write('\n')
                        for page_content in soup.find_all("p"):
                            page_content_unicode = unicode(page_content.get_text())
                            html_file.write(page_content_unicode)
                            html_file.write('\n')
                        html_file.close()
                        # NOTE(review): if an exception fires mid-write the
                        # file handle is never closed — confirm acceptable
            except Exception,e:
                print str(e)
                # return None
                continue
def from_pmml(self, pmml):
    """Returns a model with the intercept and coefficients represented in
    PMML file."""
    model = self()

    # Parse the PMML document with BeautifulSoup's XML mode.
    with open(pmml, "r") as f:
        lm_soup = BeautifulSoup(f, "xml")

    if not lm_soup.RegressionTable:
        raise ValueError("RegressionTable not found in the input PMML file.")

    # Intercept is optional in PMML; default it to zero when absent.
    table = lm_soup.RegressionTable
    intercept = table['intercept'] if 'intercept' in table.attrs else 0
    model.intercept_ = float(intercept)

    # Every NumericPredictor contributes one coefficient, in document order.
    predictors = lm_soup.find_all('NumericPredictor')
    if not predictors:
        raise ValueError("NumericPredictor not found in the input PMML file.")
    model.coef_ = numpy.array([float(p['coefficient']) for p in predictors])

    return model
def search(engine, searchurl, Length):
    # Query Bing/Baidu/Google for the exact phrase `searchurl` and return
    # [urls, titles] of the result entries. `Length` (a string) caps the
    # result count for Baidu/Google. Relies on module globals: SoupStrainer,
    # urllib, cleaner, BeautifulSoup.
    print u'正在分析搜索结果页面'
    if engine=="Bing":
        # Bing results live under <li class="b_algo">
        only_a_tags = SoupStrainer("li", class_="b_algo")
        a = urllib.urlopen('http://cn.bing.com/search?q="'+searchurl+'"').read()
    elif engine == "Baidu":
        only_a_tags = SoupStrainer("h3")
        a = urllib.urlopen('http://www.baidu.com/s?wd="'+searchurl+'"&rn='+Length).read()
    elif engine == "Google":
        only_a_tags = SoupStrainer("h3")
        a = urllib.urlopen('http://www.google.com/search?num='+Length+'&q="'+searchurl+'"').read()
    html = cleaner.clean_html(a)
    # parse only the result containers for speed
    soup = BeautifulSoup(html, "lxml", parse_only=only_a_tags)
    if engine=="Bing":
        tag = soup.find_all("li", class_="b_algo")
    elif engine == "Baidu" or engine == "Google":
        tag = soup.find_all("h3")
    group = []
    for item in tag:
        group.append(item.a)
    if len(group)==0:
        # no hits: return a link back to the home page
        return [['/'],[u'您搜索的关键字没有结果。点击此链接返回首页。']]
    else:
        url=['' for col in range(len(group))]
        title=['' for col in range(len(group))]
        for k in range(0,len(group)):
            if engine=="Bing":
                url[k]=group[k]['href']
            elif engine == "Baidu":
                # Baidu hrefs are scheme-relative
                url[k]='http:'+group[k]['href']
            elif engine == "Google":
                # Google hrefs are site-relative
                url[k]='https://www.google.com'+group[k]['href']
            title[k]=group[k].get_text().strip()
        print u'分析页面已完成。共有', len(group),u'个页面需要提取'
        return [url, title]
def get_text_from_html(html_text):
    """Returns the content part from an HTML document
    retains links and references to images and line breaks.
    """
    soup = BeautifulSoup(html_text, 'html5lib')

    # inline every <a> tag via format_url_replacement
    for anchor in soup.find_all('a'):
        href = anchor.get('href', '')
        label = ''.join(anchor.text) or ''
        anchor.replaceWith(format_url_replacement(href, label))

    # inline every <img> tag the same way, using its alt text
    for image in soup.find_all('img'):
        src = image.get('src', '')
        alt = image.get('alt', '')
        image.replaceWith(format_url_replacement(src, alt))

    # split the body text into non-blank lines and join them as paragraphs
    body = soup.find('body')
    phrases = [line.strip() for line in body.get_text().split('\n') if line.strip()]
    return '\n\n'.join(phrases)
def moderate_tags(html):
    """Replaces instances of <a> and <img> with "item in moderation" alerts.

    Returns the rewritten body markup when anything was replaced, otherwise
    the input `html` unchanged.
    """
    from askbot.conf import settings
    soup = BeautifulSoup(html, 'html5lib')
    replaced = False
    if settings.MODERATE_LINKS:
        links = soup.find_all('a')
        if links:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            # BUG FIX: map() is lazy under Python 3, so the side-effecting
            # replaceWith calls would never execute; use an explicit loop.
            for link in links:
                link.replaceWith(aviso)
            replaced = True
    if settings.MODERATE_IMAGES:
        images = soup.find_all('img')
        if images:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            # NOTE(review): reusing the single `aviso` node moves it between
            # targets, so only the last occurrence keeps it — confirm whether
            # a fresh copy per tag is intended.
            for image in images:
                image.replaceWith(aviso)
            replaced = True
    if replaced:
        return force_text(soup.find('body').renderContents(), 'utf-8')
    return html
def spider_search(url):
    # `url` is "page_url|output_name": fetch the page, dump its prettified
    # HTML to <name>.html, push follow-up article links into redis and pass
    # the target=_blank links to spider_content_search.
    print url
    out_name = url.split('|')[1]
    url = url.split('|')[0]
    r = requests.get(url)
    if r.status_code == 200:
        pass
    else:
        # NOTE(review): r.status_code is an int — this concatenation raises
        # TypeError; confirm str() was intended.
        print '访问页面错误: ' + r.status_code
        return
    source = BeautifulSoup(r.content, 'html.parser')
    # save the source page
    with codecs.open(out_name+'.html', 'w', 'utf-8') as f:
        f.write(source.prettify())
    content = source.find_all('a', target='_blank')
    links = source.find_all('a')
    for item in links:
        if 'title' in item.attrs and 'href' in item.attrs:
            name = item['title']
            # queue "<url><href>/articles/|<title>" for later crawling
            link = url + item['href'] + '/articles/' + '|' + name
            global red
            red.lpush(settings.DBname, link)
    spider_content_search(content, out_name+'.baike')
def parse(self, response):
    # Scrapy callback: parse a nanzao.com listing page, emit requests for
    # every unseen article and, if all listed articles were new, a request
    # for the next listing page.
    url = response.url
    _type = self.get_type_from_url(url)
    items = []
    try:
        response = response.body
        soup = BeautifulSoup(response)
        links = soup.find_all(class_=re.compile('post-area'))
    except:
        # any parse failure: re-queue the same page and log
        items.append(self.make_requests_from_url(url))
        log.msg("Page " + url + " parse ERROR, try again !", level=log.ERROR)
        return items
    need_parse_next_page = True
    if len(links) > 0:
        for i in range(0, len(links)):
            url_news = 'http://www.nanzao.com' + links[i].h2.a['href']
            title = links[i].h2.a.text.strip()
            # datetime "YYYY-MM-DD" -> "YYYYMMDD"
            day = links[i].time['datetime'].replace('-', '')
            # stop paginating as soon as a previously-saved article appears
            need_parse_next_page = self.is_news_not_saved(title, url_news)
            if not need_parse_next_page:
                break
            items.append(self.make_requests_from_url(url_news).replace(callback=self.parse_news, meta={'_type': _type, 'day': day, 'title': title}))
        # follow the "next page" link only if everything on this page was new
        if u'下一頁>' in soup.find(class_='paging').text:
            page_next = 'http://www.nanzao.com' + soup.find_all("a", text=u"下一頁>")[0]['href']
            if need_parse_next_page:
                items.append(self.make_requests_from_url(page_next))
    return items
class PageAnalytic(object):
    # Computes the total download weight of a page: the HTML itself plus
    # every resource recognised by a TagProcessor subclass.
    headers={"User-Agent": "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11",
             "Accept-Encoding": "gzip"}
    # all registered TagProcessor subclasses, used to recognise resource tags
    tagsClass = TagProcessor.__subclasses__()

    def __init__(self, url):
        # Fetch `url` once; keep its reported size and the parsed DOM.
        # NOTE(review): assumes the response carries a content-length header —
        # raises KeyError otherwise; confirm.
        self.base_url = url
        response = requests.get(url, headers=self.headers)
        self.html_size = int(response.headers["content-length"])
        self.page_object = BeautifulSoup(response.content)
        self._tags = []

    def _get_tag_size(self, tag):
        # find_all predicate used for its side effect: collect a processor
        # instance for every tag that any TagProcessor subclass recognises.
        for subCls in self.tagsClass:
            if subCls.is_resource(tag):
                self._tags.append(subCls(tag, base_url=self.base_url))
                return True
        else:
            return False

    def get_size_tag(self):
        # Sum the content-length of every collected resource, following
        # 301/302 redirects via HEAD requests.
        size = 0
        self.page_object.find_all(self._get_tag_size)
        for tag in self._tags:
            #threading?
            resp = requests.head(tag.get_resource_url(), headers=self.headers)
            while resp.status_code == 301 or resp.status_code == 302:
                resp = requests.head(resp.headers["location"])
            if resp.status_code == 200:
                size += int(resp.headers["content-length"])
        self._tags = []
        return size

    def get_page_size(self):
        # Total = HTML size + size of all referenced resources.
        return self.html_size + self.get_size_tag()
def __init__(self, username):
    # Build a Pinterest user object: fetch the profile page and scrape the
    # user's boards (title, url, cover picture) into self.boards.
    # go out with name and check if it's real, populate the variable
    self.name = username
    self.u_url = "http://www.pinterest.com/" + username
    #go out and parse html to find board names
    self.boards = []
    r = requests.get(self.u_url)
    data = r.text
    soup = BeautifulSoup(data)
    titles = []
    urls = []
    pics = []
    for link in soup.find_all('img', 'boardCover'):
        if(link.get('alt')):
            # board title is the alt text up to the first " / " separator,
            # stripped to printable ASCII
            title_s = link.get('alt')
            titles.append(filter(lambda x: x in string.printable, (title_s.split(' / '))[0]))  #error?
            pics.append(link.get('src'))
    # the url isn't in this tag, where is it?
    for link in soup.find_all('a', 'boardLinkWrapper'):
        urls.append(link.get('href'))
    # NOTE(review): range(0, len(urls) - 1) skips the final board — confirm
    # whether the off-by-one is intentional.
    for i in range(0, len(urls) - 1):
        self.boards.append(board(titles[i], urls[i], pics[i]))
    return
def get_details(html): soup=BeautifulSoup(html) #得到作者、作者链接、微博正文 div_content=soup.find_all(attrs={'class': 'content clearfix'}) #得到发微博时间 div_time=soup.find_all(attrs={'class':'feed_from W_textb'}) #将用户名称,用户主页地址、微博正文、发微博时间初始化 nick_name=[] nickname_href=[] content_text=[] time=[] #print get_content[0] for i in range(len(div_content)): #查找a标签 a_tag=div_content[i].find('a') nick_name.append(a_tag.get('nick-name')) nickname_href.append(a_tag.get('href')) #查找p标签 p_tag=div_content[i].find('p') content_text.append(p_tag.get_text()) #得到发微博时间 for j in range(len(div_time)): a_time=div_time[j].find('a') time.append(a_time.get('title')) return (nick_name,nickname_href,content_text,time)
def get_item_info(url):
    # Scrape a Ganji second-hand item page and insert
    # title/time/type/price/area/seller into the global `item_info`
    # collection. Skips expired listings, error pages and special layouts.
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text,'lxml')
    print(url)  # trace the url being processed, helps locating failures
    # expired listing (item sold), e.g. http://bj.ganji.com/shouji/1303939091x.htm
    if soup.find_all(src=" http://sta.ganjistatic1.com/src/image/v5/expire.png"):
        print('信息已过期')
    else:
        if soup.select('div.error'):
            # error page, nothing to extract
            pass
        else:
            # some pages lack the usual layout, e.g.
            # http://bj.ganji.com/ershoubijibendiannao/386647282x.htm
            if soup.select('h1.title-name'):
                title = soup.select('h1.title-name')[0].text
                #title = soup.title.text
                # posting time is the text before "发布"
                time = soup.select('i.pr-5')[0].text.strip().split('发布')[0]
                type = soup.select('ul.det-infor > li > span > a')[0].text
                price = soup.select('i.f22.fc-orange.f-type')[-1].text if soup.find_all('i','f22 fc-orange f-type') else None
                area_list = soup.select('div.leftBox > div > div > ul > li')[9]
                area = ''.join(list(area_list.stripped_strings))
                # empty span means a private seller, otherwise a merchant
                seller = soup.select('span.fc-orange')[0].text
                if seller == '':
                    seller = '个人'
                else:
                    seller = '商家'
                item_info.insert_one({'title':title,'time':time,'type':type,'price':price,'area':area,'seller':seller})
                print({'title':title,'time':time,'type':type,'price':price,'area':area,'seller':seller})
            else:
                print('特殊页面')
def crawling(link, level): print "LEVEL: ",level html = urllib.urlopen(link) soup = BeautifulSoup(html) for email in soup.find_all('a'): email=email.get('href') if email is not None: check=email.encode('ascii', 'ignore') if ( (re.search( r'mailto:\w+',check) ) or (re.search( r'\w+@\w+\.\w{2,6}\.?\w*',check) ) ): emailList.append(check) for url in soup.find_all('a'): #print url.get('href') url = url.get('href') if url is not None: url=url.encode('ascii', 'ignore') if ( not(re.search( r'http[s]?:\/\/[w.-]*', url)) and (re.search( r'\/[w.-/]*', url)) ): if (link[-1:]== '/'): link = link[:1] url = link+url print url if ( (re.search( r'http:\/\/[\w.-]*', url ))): if (not(re.search(r'http:\/\/(google|\w+\.google|www\.youtube|goo\.gl|www\.facebook\.com|www\.twitter\.com|t\.co)[w.-]*', url))): if (level<1): if url not in urlChecked: urlChecked.append(url) crawling(url, level+1)
def get_long_url(url):
    # Walk a scheme-relative listing page: recurse into 'qzbd' links (at
    # most 3 per tag kind per page) and write every other href to the
    # global file handle `f`.
    num_1 = 0   # 'qzbd' links seen among <a class="careful">
    num_2 = 0   # 'qzbd' links seen among <p class="view mb20">
    try:
        r = requests.get('https:'+url,headers=HeaderData.get_header(),timeout=60)
        s = BeautifulSoup(r.content,"lxml")
        ptag = s.find_all("p",attrs={"class":"view mb20"})
        atag = s.find_all("a",attrs={"class":"careful"})
        for a in atag:
            uri = a.get('href')
            if 'qzbd' in uri:
                num_1 += 1
                if num_1 > 3:
                    break
                else:
                    get_long_url(uri)
            else:
                f.write(uri+'\n')
                print uri
        for p in ptag:
            uri = p.find('a').get('href')
            if 'qzbd' in uri:
                num_2 += 1
                if num_2 > 3:
                    break
                else:
                    get_long_url(uri)
            else:
                f.write(uri+'\n')
                print uri
    except Exception,e:
        # best-effort crawl: report and give up on this branch
        print e
def extract(self, html):
    "Extract data from meta, link and title tags within the head tag."
    extracted = {}
    soup = BeautifulSoup(html, 'html.parser')

    # title tag -> extracted['titles']
    title_tag = soup.find('title')
    if title_tag:
        extracted['titles'] = [title_tag.string]

    # meta tags: map recognised names through self.meta_name_map
    for meta_tag in soup.find_all('meta'):
        if 'name' not in meta_tag.attrs or 'content' not in meta_tag.attrs:
            continue
        name = meta_tag['name']
        if name in self.meta_name_map:
            dest = self.meta_name_map[name]
            extracted.setdefault(dest, []).append(meta_tag.attrs['content'])

    # link tags: canonical urls and RSS feed alternates
    for link_tag in soup.find_all('link'):
        if 'rel' not in link_tag.attrs:
            continue
        rel = link_tag['rel']
        if ('canonical' in rel or rel == 'canonical') and 'href' in link_tag.attrs:
            extracted.setdefault('urls', []).append(link_tag['href'])
        elif (('alternate' in rel or rel == 'alternate')
                and 'type' in link_tag.attrs
                and link_tag['type'] == "application/rss+xml"
                and 'href' in link_tag.attrs):
            extracted.setdefault('feeds', []).append(link_tag['href'])

    return extracted
def parse_update_data(html):
    # Parse the digitaltruth development-chart table and upsert each row
    # into the FilmSearch model (update existing film/developer/dilution/
    # iso/temp combinations, otherwise create a new record).
    soup_outer = BeautifulSoup(html, "lxml", from_encoding="utf-8")
    # skip the header row
    for tr in soup_outer.find_all(name="tr")[1:]:
        td = tr.find_all(name="td")
        film = td[0].get_text().strip()
        developer = td[1].get_text().strip()
        dilution = td[2].get_text().strip()
        asa_iso = td[3].get_text().strip()
        a35mm = td[4].get_text().strip()
        a120 = td[5].get_text().strip()
        sheet = td[6].get_text().strip()
        temp = td[7].get_text().strip()
        try:
            # notes live on a linked detail page; fetch and extract them
            notes_link = td[8].find(name="a").get("href")
            notes_link = "http://www.digitaltruth.com/"+notes_link
            notes_html = urllib2.urlopen(notes_link).read()
            soup_inner = BeautifulSoup(notes_html, "lxml", from_encoding="utf8")
            notes = soup_inner.find_all(name="tr")[1].find_all(name="td")[-1].get_text().strip()
        except AttributeError:
            # no notes link for this row
            notes = ""
        from myrobot.models import FilmSearch
        try:
            obj = FilmSearch.objects.get(Film=film, Developer=developer, Dilution=dilution, ASA_ISO=asa_iso, Temp=temp)
            obj.a35mm = a35mm
            obj.a120 = a120
            obj.sheet = sheet
            obj.Notes = notes
            obj.save()
        except FilmSearch.DoesNotExist:
            obj = FilmSearch(Film=film, Developer=developer, Dilution=dilution, ASA_ISO=asa_iso, a35mm=a35mm, a120=a120, sheet=sheet, Notes=notes)
            obj.save()
def processticker(ticker, file_name, date_int, listview):
    # Scrape the Yahoo Finance options chain for `ticker`/`date_int`,
    # extract the embedded JSON from the page's <script> tags and hand the
    # calls/puts lists to create_csv. Retries up to 20 times on IndexError.
    base_url = "http://finance.yahoo.com/q/op"
    num_of_tries = 0
    payload = {"s": ticker, "date": date_int}
    r = requests.get(base_url, params=payload)
    data = r.text
    soup = BeautifulSoup(data, "lxml")
    option_list = []
    expiration_dictionary = {}
    while num_of_tries < 20:
        try:
            # expiration-date dropdown -> label to URL map
            for pair in soup.find_all("option"):
                expiration_dictionary[pair.get_text()] = yahoo_url + pair["data-selectbox-link"]
            for n in soup.find_all("script"):
                option_list.append(n)
            # the 17th <script> carries the raw options JSON
            # NOTE(review): option_list keeps growing across retries, so the
            # pop(16) index drifts after the first failure — confirm.
            raw_options_chain = str(option_list.pop(16))
            start_call_options = [a.start() for a in list(re.finditer("calls", raw_options_chain))]
            endoptions = [a.start() for a in list(re.finditer("_options", raw_options_chain))]
            # slice out the JSON object between "calls" and "_options"
            raw_options_chain = raw_options_chain[start_call_options[0] - 2 : endoptions[0] - 2]
            options_json = json.loads(raw_options_chain)
            # Extract puts/calls as JSON objects.
            put_list = options_json["puts"]
            call_list = options_json["calls"]
            print(call_list)
            create_csv(call_list, put_list, file_name, listview)
        except IndexError:
            num_of_tries += 1
            continue
        break
def write_txt_category(category, list_category, in_path="../elife-articles/", out_path="../articles/"): "Go throught the list and use BS to get text content." for article in list_category: print in_path+article article_text = "" soup = BeautifulSoup(open(in_path+article), ["lxml", "xml"]) abstract_tag = soup.find_all("abstract") body_tag = soup.find_all("body") for b in body_tag[0]: try: article_text += " " + b.p.text except: for i in b: try: article_text += " " + i.p.text except: article_text += " " + i.string if abstract_tag: for abstract in abstract_tag: abstract_text = abstract.p.text article_text += "\n" + abstract_text for a in article_text: if a in spe_char.keys(): final_article = article_text.replace(a, unicode(spe_char.get(a))) if not category in listdir(out_path): mkdir(out_path+category) print category, " folder created!" file_path = out_path + category + "/" + article.replace("xml", "txt") # Write to a text file: with codecs.open(file_path, mode='w', encoding='utf-8') as f: f.write(final_article) print "Wrote %d xml articles to text format." % len(list_category[:1])
def htmltitleParse():
    # Build a title index: for every cached page ./filestore2/<i>.txt
    # (line 0 = url, line 2 = html) extract the <title> text, tokenize it
    # and append "url title" to the index file. Missing files are skipped.
    DATA={}
    count=0  # NOTE(review): unused
    with open("./filestore/titleIndex_file_input.txt",'w') as f_write:
        for i in range(1,72140):
            try:
                with open("./filestore2/%d.txt" %i,'r') as f:
                    valueList=f.readlines()
                    DATA['url']=valueList[0].strip()
                    DATA['html']=valueList[2].strip()
                    title=''
                    # e.g. http://ubuntuforums.org/showthread.php?t=1215158
                    try:
                        html=BeautifulSoup(DATA['html'].strip())
                        if html.find_all('title'):
                            # concatenate all title texts, then tokenize
                            for sub in html.find_all('title'):
                                title=title+sub.get_text().encode('utf-8')+' '
                            title=tokenizeString(title)
                            #print title
                            f_write.write(DATA['url']+' '+title+'\n')
                    except IndexError:
                        pass
            except IOError:
                # cached page missing: skip it
                pass
    return
def trans_extra_data_fee(members):
    # Redistribute the data-overage fee on a shared 20 GB / 10-line plan:
    # each member's share is proportional to their usage beyond a per-line
    # quota (10% of the plan), parsed from the local billusage.htm bill.
    # need to judge there is extra usage
    owner_phone = "310-600-0358";
    data_plan = 20.0;
    usage_quota = data_plan * .1;  # per-line quota: 10% of the plan
    soup_data = BeautifulSoup(open("billusage.htm"), "html.parser")
    # overage is reported in MB; convert to GB
    extra_datausage = float(soup_data.find_all("div", {"class": "additionalColCenter"})[1].text.split('M')[0]) / 1024 # in GB
    total_datausage = data_plan + extra_datausage
    # one <th> per line carrying its usage percentage
    ths = soup_data.find_all("th", {"class": "PadTop0 BotSolidBorder borderRightSolid borderLeftSolid left", "headers": "header1"})
    extra_data_usage_dict = {};
    # owner line: percentage starts at offset 4 in its cell text
    real_extra_usage = total_datausage * float(ths[0].text[4:-1]) / 100 - usage_quota
    extra_data_usage_dict[owner_phone] = real_extra_usage if real_extra_usage >= 0 else 0
    total_extra_percent = extra_data_usage_dict[owner_phone]
    for i in range(1, 10):
        # other lines: first 12 chars are the phone number, percentage follows
        phone = ths[i].text[0:12]
        phone = trans_phone_format2(phone)
        real_extra_usage = total_datausage * float(ths[i].text[13:-1]) / 100 - usage_quota
        extra_data_usage_dict[phone] = real_extra_usage if real_extra_usage >= 0 else 0
        total_extra_percent += extra_data_usage_dict[phone]
    extra_fee_holder = soup_data.find_all("div", {"class": "additionalColRight"})
    extra_fee = float(extra_fee_holder[1].text[1:])  # strip the currency sign
    # replace the flat 1/10 split with the proportional share
    for user, info in members.iteritems():
        members[user]['data'] = members[user]['data'] - extra_fee / 10 + extra_fee * extra_data_usage_dict[user] / total_extra_percent
    return members;
def listar_lancamentos(url):
    # Kodi add-on: list the "lancamentos" (new releases) grid from
    # superanimes.com and append a "next page" directory entry.
    codigo_fonte = abrir_url(url).result
    soup = BeautifulSoup(codigo_fonte)
    miniaturas = str(soup.find_all('div', class_='lancamentoBoxNome'))
    # (href, title) pairs and thumbnail urls scraped via regex
    match = re.compile(r'<a href="(.+?)" style=".*?" title="(.+?)">').findall(miniaturas)
    img = re.compile(r'<img height=".*?" src="(.+?)" style=".*?" width=".*?">').findall(miniaturas)
    # how many pages exist (pagination links)
    paginacao = str(soup.find_all('div', class_='paginacao'))
    match_pag = re.compile(r'<a href="(.+?)" title="(.+?)">').findall(paginacao)
    a = []
    for x in range(0, len(match)):
        temp = [match[x][0], match[x][1], img[x]]
        a.append(temp)
    total = len(a)
    for url2, titulo, img in a:
        addDir(titulo, url2, 4, img, True, total)
    # determine the current page number; default to page 1
    try:
        n = re.search(r'http://www.superanimes.com/.+?\?&pagina=(.?)', url).group(1)
    except:
        url = url + '?&pagina=1'
        n = 1
    n = int(n)
    if n <= len(match_pag):
        m = n+1
        # NOTE(review): replaces the first occurrence of the digit anywhere
        # in the url, not only the pagina parameter — confirm.
        prox_pag = url.replace(str(n), str(m))
        addDir('Proxima Pagina >>>', prox_pag, 6, artfolder + 'destaques.png')
def page_loop(n):
    # Download single photos and photo sets from pages 1..n-1 of the tumblr
    # blog into the local ./wanimal/ directory.
    for i in range(1,n):
        print 'Now printing page',str(i)
        url = 'http://wanimal1983.tumblr.com/page/'+str(i)
        content = urllib2.urlopen(url)
        soup = BeautifulSoup(content,'lxml')
        girls_sets = soup.find_all('div',class_='photoset-grid')
        girls = soup.find_all('div',class_='media')
        # single pic
        for girl in girls:
            pic = girl.find('img')
            name=pic.get('alt')
            name=name.strip()
            link = pic.get('src')
            flink = link
            print name
            # print girl
            content2 = urllib2.urlopen(flink).read()
            # file name: alt text + last 28 chars of the image url
            with open('wanimal'+'/'+name+flink[-28:]+'.jpg','wb') as code:
                code.write(content2)
        # pic sets
        for girl_set in girls_sets:
            # print girl_set
            pic1 = girl_set.find_all('img')
            for pic2 in pic1:
                link1 = pic2.get('src')
                flink1 = link1
                print flink1
                content3 = urllib2.urlopen(flink1).read()
                with open('wanimal'+'/'+flink1[-28:],'wb') as code:
                    code.write(content3)
def pre_parse(filename, overwrite=False):
    # Normalise a book XML file: unwrap <li> elements, rename bold/italic
    # to b/i, and fold the content that follows each <ftnote> into that
    # footnote (up to the next <ftnote>). Writes the result back to
    # `filename`, or to a "_copy" file unless overwrite is set.
    with open(filename) as infile:
        soup = BeautifulSoup(infile, 'xml')
    for li in soup.find_all('li'):
        # every <li> is expected to sit directly inside a <ul>
        assert li.parent.name == 'ul'
        li.unwrap()
    for bold in soup.find_all('bold'):
        bold.name = u'b'
    for italic in soup.find_all('italic'):
        italic.name = u'i'
    for chapter in soup.find_all('chapter'):
        footnotes = chapter.find_all('ftnote')
        for footnote in footnotes:
            next_sibling = footnote.next_sibling
            if next_sibling is None:
                break
            # pull siblings into the footnote until the next <ftnote>;
            # appending re-parents the node, so next_sibling must be
            # re-read from the footnote each iteration
            while next_sibling.name != 'ftnote':
                footnote.append(next_sibling)
                if isinstance(next_sibling, Tag):
                    next_sibling.unwrap()
                next_sibling = footnote.next_sibling
                if next_sibling is None:
                    break
    if not overwrite:
        filename = filename.replace('.xml', '_copy.xml')
    with codecs.open(filename, 'w', 'utf-8') as outfile:
        outfile.write(unicode(soup))
def getAnswerer(question_id):
    # Fetch a Zhihu question page, print its title, then crawl the full
    # voter info of every answer via Answer_Full_Vote_Info.
    # get html of the question page first
    # r = s.get('http://www.zhihu.com/question/22968659')
    r = s.get('http://www.zhihu.com/question/' + str(question_id))
    bs = BeautifulSoup(r.text)
    # question title
    a = bs.find_all('h2',{'class':'zm-item-title'})
    print a[0].text
    # data-aid, used later to fetch the full voter info of each answer
    data_aid = re.findall('data-aid="(.*)"',r.text)
    # answerer ids and display names
    a = bs.find_all('h3',{'class':'zm-item-answer-author-wrap'})
    answerer_id = []
    answerer_name = []
    for i in range(len(a)):
        answerer_name.append(a[i].text.strip().split(u',')[0])
        # anonymous users have no profile link
        if(answerer_name[i] != u'匿名用户'):
            answerer_id.append(re.findall('href="/people/(.*)"',str(a[i]))[0])
        else:
            answerer_id.append('anonymous')
    for i in range(len(answerer_name)):
        print "正在抓取ta的赞同者..\n"
        print answerer_name[i],data_aid[i]
        print '\n'
        Answer_Full_Vote_Info(question_id, answerer_name[i], data_aid[i])
def http(url):
    # Scrape the wufazhuce "ONE" landing page for today's quote, image and
    # article, then email the digest to every address in the global to_addr.
    # NOTE: shadows nothing here, but the name `http` is unusually generic.
    html = requests.get(url).text
    soup_main = BeautifulSoup(html)
    # daily quote text
    div = soup_main.find_all("div", {"class": "fp-one-cita"})
    text = div[0].a.text
    # print(text)
    # daily image URL
    img_list = soup_main.find_all("img", {"class": "fp-one-imagen"})
    imgUrl = img_list[0].get('src')
    # print(imgUrl)
    # issue title, e.g. "vol.1132"
    title_list = soup_main.find_all("p", {"class": "titulo"})
    title = str(title_list[0].text)
    print(title)
    # article page for this issue, e.g. .../one/vol.1132#articulo
    url_stroy = 'http://wufazhuce.com/one/' + title + '#articulo'
    soup_stroy = BeautifulSoup(requests.get(url_stroy).text)
    stroy_content = str(soup_stroy.find("div", {"class": "articulo-contenido"}))
    stroy_title = str(soup_stroy.find("h2", {"class": "articulo-titulo"}))
    stroy = stroy_title + stroy_content
    for addr in to_addr:
        sendEmail(text, imgUrl, title, stroy, addr)
def structure_comments(verse):
    # Clean a verse's XML markup and split it into comment chunks, each
    # starting at a ". <b>" boundary.
    def clean_string(value):
        # normalise whitespace and strip stray typographic characters
        value = value.replace(u'\n', u' ')
        value = value.replace(u'\xb6', u'')
        value = value.replace(u'\u2283', u'')
        value = value.replace(u'\2282', u'')
        value = value.replace(u'\u0259', u'e')
        value = value.replace(u'[ ', u'[')
        value = value.replace(u' ]', u']')
        value = re.sub(u'G ?OD', u'God', value)
        value = re.sub(u' +', u' ', value)
        value = value.rstrip()
        value = value.lstrip()
        return value
    # drop the first bracketed annotation only
    verse = re.sub(u'\[.*?\]', u'', verse, 1)
    soup = BeautifulSoup(u'<root>{}</root>'.format(verse), 'xml')
    # destroy empty b tags
    for element in soup.find_all(lambda x: x.name == 'b' and (x.is_empty_element or x.text.isspace())):
        element.decompose()
    for element in soup.find_all('xref'):
        element.decompose()
    # keep the contents of small/sup but drop the tags themselves
    for element in soup.find_all('small'):
        element.unwrap()
    for element in soup.find_all('sup'):
        element.unwrap()
    verse = clean_string(u' '.join(unicode(child) for child in soup.root.children))
    # split on ". <b>" boundaries; each chunk keeps its trailing period
    comments, start_index = [], 0
    for match in re.finditer(ur'\. <b>', verse):
        comments.append(verse[start_index:match.start()+1])
        start_index = match.start() + 2
    # NOTE(review): the tail segment after the last match is never appended
    # and `comments` is not returned here — the function may continue beyond
    # this excerpt; confirm against the full source.
def listar_animes2(url):
    # Kodi add-on: list anime episode thumbnails from superanimes.com,
    # trimming a trailing "Online" from titles, and append a "next page"
    # directory entry.
    codigo_fonte = abrir_url(url).result
    soup = BeautifulSoup(codigo_fonte)
    miniaturas = str(soup.find_all('div', class_='epsBoxImg'))
    # (href, title) pairs and thumbnail urls scraped via regex
    match = re.compile(r'<a href="(.+?)" title="(.+?)">').findall(miniaturas)
    img = re.compile(r'<img alt=".+?" src="(.+?)" title=".+?"/>').findall(miniaturas)
    # how many pages exist (pagination links)
    paginacao = str(soup.find_all('div', class_='paginacao'))
    match_pag = re.compile(r'<a href="(.+?)" title="(.+?)">').findall(paginacao)
    a = []
    for x in range(0, len(match)):
        temp = [match[x][0], match[x][1], img[x]]
        a.append(temp)
    total = len(a)
    for url2, titulo, img in a:
        # strip the trailing " Online" suffix when present
        if titulo.endswith("Online"):
            addDir(titulo[0:len(titulo) - 6], url2, 5, img, False, total)
        else:
            addDir(titulo, url2, 5, img, False, total)
    # determine the current page number; default to page 1
    try:
        n = re.search(r'http://www.superanimes.com/.+?\?&pagina=(.?)', url).group(1)
    except:
        url = url + '?&pagina=1'
        n = 1
    n = int(n)
    if n <= len(match_pag):
        m = n+1
        # NOTE(review): replaces the first occurrence of the digit anywhere
        # in the url, not only the pagina parameter — confirm.
        prox_pag = url.replace(str(n), str(m))
        addDir('Proxima Pagina >>>', prox_pag, 4, artfolder + 'destaques.png')
def get_weekly_menu():
    # POST the menu form and build an ordered structure:
    # {"restaurant": ..., "period": ..., "menu": {day: [{name, price}, ...]}}
    # Relies on module globals: requests, URL, payload, BeautifulSoup, odict.
    r = requests.post(URL, data=payload)
    soup = BeautifulSoup(r.text)
    titles = soup.find_all("div", class_="title")
    menu_table = soup.find_all("table", class_="menu_table")
    children = list(titles[0].children)
    def clean(t):
        # trim surrounding whitespace and decorative dashes
        return t.strip(" -")
    structure = odict([("restaurant", clean(children[0].text)), ("period", clean(children[1])), ("menu", odict())])
    menu = structure.get("menu")
    for tr in menu_table[0].find_all("tr"):
        for td in tr.find_all("td"):
            classes = td.attrs.get("class")
            if classes is None:
                continue
            if "day" in classes:
                # start a new day bucket; subsequent meal cells append to it
                day_name = td.h4.text
                day = []
                menu[day_name] = day
            if "meal" in classes:
                # NOTE(review): assumes a "day" cell always precedes the
                # first "meal" cell (NameError otherwise) — confirm.
                meal = {"name": clean(td.text)}
                day.append(meal)
            if "price" in classes:
                # price cell applies to the most recent meal
                price = td.text
                if price and len(price):
                    meal["price"] = price + "€"
                else:
                    meal["price"] = None
    return structure
def headliner(url):
    # Print the headline (h1-h3) text and the paragraph text of a local
    # HTML file. `url` is actually a file path passed to open().
    soup = BeautifulSoup((open(url)), "lxml")
    head1 = soup.find_all(['h1','h2','h3'])
    body = soup.find_all('p')
    # re-parse the stringified tag lists to flatten them to plain text
    head1_fixed = str(head1)
    soup1 = BeautifulSoup(head1_fixed, 'lxml')
    # NOTE(review): .decode on soup text is Python-2 specific — confirm.
    gold = soup1.text.decode("unicode-escape").encode("utf-8")
    body_fixed = str(body)
    soup_gold = BeautifulSoup(body_fixed, 'lxml')
    gold_body = soup_gold.text.decode("unicode-escape").encode("utf-8")
    print gold
    print ""
    print gold_body
    #print gold[0].get_text()
    #print head1[1].get_text()
    #print head2[2].get_text()
    #print head2
    #print head3
    print ""
soupBATTING = BeautifulSoup(pageBATTING.text, 'html.parser') #Arguments that will be passed into our .find and .find_all attrs = {'attribute1Name': 'attribute1Value', 'attribute2Name': 'attribute2Value'} #Takes the five column headers. Only need to find it once #since it appears five times on the ESPN page. headers = soupBATTING.find('tr', attrs={'class': 'colhead'}) columns = [col.get_text() for col in headers.find_all('td')] #Using pandas to create an empty data frame to store player stats final_df = pd.DataFrame(columns=columns) #Use re's compile function to scrape all player data players = soupBATTING.find_all('tr', attrs={'class':re.compile('row player-10-')}) for player in players: #Gets player's stats stats = [stat.get_text() for stat in player.find_all('td')] #Temporary dataframe for a single player's stats temp_df = pd.DataFrame(stats).transpose() temp_df.columns = columns #Put player stats in the final dataframe final_df = pd.concat([final_df, temp_df], ignore_index=True) print(final_df) # Export to csv file displaying all 331 batters and their stats final_df.to_csv(r"C:\Users\Zach Patrignani\Desktop\mlb_stats.csv", index = False,
res = ss.post(itesturl, headers=headers, data=data)
resJ = json.loads(res.text)
# codes 9/10 mean bad username/password/captcha: bail out
if resJ['code'] == 9 or resJ['code'] == 10:
    print('用户名或密码或验证码错误. 程序退出.')
    os.system("pause")
    os._exit(0)
# fetch the current list of tests
res_class = ss.get(classurl)
# parse the page with BeautifulSoup
bs = BeautifulSoup(res_class.text, 'lxml')
# each exam is listed under an <h2> tag
lst = bs.find_all('h2')
print('当前任务: \n')
# records all assignments and their availability; each row holds three
# elements: id, whether answers can be viewed, whether the exam is in
# progress, plus the subject name
# (yes - several exams can run at the same time)
tasklist = []
# running index
no = 0
# print the info for each exam
for each in lst:
    par = each.parent
    dd = par.find_all('dd')
# beautiful soup wikipedia # 위키피디아 예제 무작정 따라하기 # Code example copy&paste #!/usr/bin/env python3 # Anchor extraction from HTML document # import library from bs4 import BeautifulSoup from urllib.request import urlopen # urlopen : 페이지를 열어준다, response라는 곳으로 담겠다 with urlopen('https://en.wikipedia.org/wiki/Main_Page') as response: soup = BeautifulSoup(response, 'html.parser') for anchor in soup.find_all('a'): print(anchor.get('href', '/')) # with ~ as ~ : 파이썬 문법 # with as 구문은 다음과 같이 직관적으로 쓸 수도 있다 # responose = urlopen('https://en.wikipedia.org/wiki/Main_Page') # soup = BeautifulSoup(response, 'html.parser') # for anchor in soup.find_all('a'): # print(anchor.get('href', '/')) # BeautifulSoup() 함수를 사용, response를 넣어주고 html.parser를 이용하여 구문을 분석한 것을 # soup 이라는 변수에 담아준다 # for문을 사용하여 soup 안에 있는 'a' Tag를 찾아서 # anchor 라는 변수에 넣는다 # for문을 통해 하나씩 가져온 anchor 안에 a 태그의 'href' 참조 주소를 가져와서 print
# 태그 선택자 이용해 한번에 가져오기 html = """ <html><body> <ul> <li><a href="http://www.naver.com">naver</a></li> <li><a href="http://www.daum.net">daum</a></li> <li><a href="http://www.daum.com">daum</a></li> <li><a href="http://www.google.com">google</a></li> <li><a href="http://www.tistory.com">tistory</a></li> </ul> </body></html> """ soup = BeautifulSoup(html, 'html.parser') links = soup.find_all("a") print('links', type(links)) # links <class 'bs4.element.ResultSet'> a = soup.find_all("a", string="daum") print('a', a) # a [<a href="http://www.daum.net">daum</a>, <a href="http://www.daum.com">daum</a>] b = soup.find("a") print('b', b) # b <a href="http://www.naver.com">naver</a> 가장 상위 하나 가져옴. c = soup.find_all("a", limit=3) print('c', c) # c [<a href="http://www.naver.com">naver</a>, <a href="http://www.daum.net">daum</a>, <a href="http://www.daum.com">daum</a>] d = soup.find_all(string=["naver", "google"]) print('d', d) # d ['naver', 'google'] 해당 내용을 찾아옴. but 보통 이렇게 쓰지 않음. print('d', type(d))
def daily_task():
    """Run one daily scrape: walk the site's category tree, record every
    product card to CSV, and archive each visited page as HTML.

    Side effects: sets the module-level DATE, writes CSV/HTML via the
    module helpers write_csv()/write_html(), and calls compress_data().
    """
    global DATE
    DATE = str(datetime.date.today())

    browser = webdriver.Chrome(executable_path=CHROME_DRIVER, chrome_options=OPTIONS)
    # browser = webdriver.Chrome()
    browser.set_window_position(400, 40)
    browser.set_window_size(1300, 1024)

    # Collect category-page URLs from the first 11 main-menu entries.
    browser.get(BASE_URL)
    soup = BeautifulSoup(browser.page_source, 'lxml')
    main_category_list = soup.find('ul', id='main-smart-menu').find_all(
        'li', class_='menu-item')
    write_html(browser.page_source, "All_cat_")
    urls = []
    k = 1
    for main_item in main_category_list:
        if k >= 12:  # only the first 11 menu items are real categories
            break
        browser.get(BASE_URL + main_item.find('a').get('href'))
        soup = BeautifulSoup(browser.page_source, 'lxml')
        category_list = soup.find('ul', class_='cat-tree-nav').find_all(
            'li', class_='cat-tree-item')
        for item in category_list:
            urls.append(BASE_URL + item.find('a').get('href'))
        k += 1

    j = 0
    while j < len(urls):
        print('Scraping', urls[j])
        browser.get(urls[j])
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # Derive category / sub-category names from the breadcrumb trail.
        # FIX: the original left both names unbound (NameError later) when the
        # breadcrumb had an unexpected number of items; default to None now.
        category = None
        sub_category = None
        breadcrumb = soup.find('ol', class_='breadcrum')
        if breadcrumb is not None:
            crumbs = breadcrumb.find_all('li', class_='breadcrum-item')
            if len(crumbs) >= 2:
                category = crumbs[1].find('a').text.strip()
            if len(crumbs) in (3, 4):
                sub_category = crumbs[2].find('a').text.strip()

        # Walk the result pages.  local_title holds the page count read from
        # the pagination widget; assume at most 5 pages until it is known.
        i = 0
        local_title = 5
        while i < int(local_title):
            if i != 0:
                browser.get(urls[j] + "?p=" + str(i + 1))
            # FIX: parse the page once here; the original re-parsed the same
            # page_source up to three times per iteration.
            soup = BeautifulSoup(browser.page_source, 'lxml')
            # FIX: renamed from `list`, which shadowed the builtin.
            products = soup.find_all('div', class_='product-catalog-item')
            for item in products:
                _record_product(item, category, sub_category)

            write_html(browser.page_source, str(j + 1) + "_" + str(i + 1) + "_")

            pagination = soup.find('div', class_='list-pagination')
            if pagination is None:
                break
            # Read the real page count from the last pagination link's title
            # ("Xem trang N" = "view page N").
            page_count = pagination.find_all('a')
            last_page = browser.find_element_by_xpath(
                '//*[@id="_products"]/div/div[2]/div/a[' + str(len(page_count)) + ']')
            local_title = last_page.get_attribute('title').split('Xem trang')[1].strip()
            i += 1
        j += 1

    # Close browser
    browser.close()
    browser.service.process.send_signal(signal.SIGTERM)
    browser.quit()
    compress_data()


def _record_product(item, category, sub_category):
    """Extract one product card's fields and append them to the CSV.

    Missing fields are recorded as None; prices are stripped of the trailing
    'đ' currency marker.
    """
    brand = None
    if item.find('div', class_='name-brand') is not None:
        brand = item.find('div', class_='name-brand').text.strip()

    title_vietnamese = None
    if item.find('div', class_='home-product-title') is not None:
        title_vietnamese = item.find('div', class_='home-product-title').text.strip()

    price = None
    if item.find('span', class_='list-product-meta-price') is not None:
        price = item.find('span', class_='list-product-meta-price').text.strip()
        price = price.split('đ')[0].strip()

    old_price = None
    if item.find('span', class_='list-product-old-price') is not None:
        old_price = item.find('span', class_='list-product-old-price').text.strip()
        old_price = old_price.split('đ')[0].strip()

    write_csv({
        'category': category,
        'sub_category': sub_category,
        'id': item.get('data-pid').strip(),
        'good_name': title_vietnamese,
        'brand': brand,
        'price': price,
        'old_price': old_price,
        'date': DATE,
    })
def queryListRequester():
    """Run every saved query against Google, pickle the results, and maintain
    a rotating window of the last 8 title sets per query (used elsewhere to
    detect genuinely new results).

    Returns a list of [links, titles, descriptions] triples, one per query.
    """
    searchSetResults = []
    # First few conditionals check or make the proper files needed for operation.
    if queryListCheck() == False:
        print("\n No queries.pickle file found in the CWD.\n Please create one with the query list editor.\n")
        # mainMenu(False)
    else:
        with open("queries.pickle", "rb") as fp:
            queries = pickle.load(fp)

        # If there is no rotating search-terms list yet, create one empty list
        # per query; each inner list collects that query's past title sets.
        if not rotatingPageImagesCheck():
            with open("query-results-rotating-list.pickle", "wb") as fp:
                pickle.dump([[] for _ in range(len(queries))], fp)
        with open("query-results-rotating-list.pickle", "rb") as fp:
            savedResultsRotation = pickle.load(fp)

        # A length mismatch is the only implemented signal that the query list
        # changed: drop the stale rotation file and start over.
        # BUG FIX: the recursive restart's result was previously discarded and
        # execution fell through using the stale list — return it instead.
        if len(savedResultsRotation) != len(queries):
            os.remove("query-results-rotating-list.pickle")
            return queryListRequester()

        print("\n Connecting, parsing and pickling...\n")
        for i in range(len(queries)):
            # The google search URL is built with the query embedded.
            google_url = "https://www.google.com/search?q={" + queries[i] + "}&num=lnms"

            # Retry the request a few times, then give up on this query.
            # BUG FIX: the User-Agent dict was passed positionally, which
            # requests.get() treats as query *params* — pass it as headers=.
            # BUG FIX: the retry counter was never incremented, so a dead
            # connection used to spin in this loop forever.
            response = None
            tryCount = 0
            while response is None:
                try:
                    # Random per-request user agent so the site doesn't
                    # realize it is being scraped.
                    response = requests.get(
                        google_url,
                        headers={"User-Agent": userAgents[random.randint(0, len(userAgents) - 1)]})
                except requests.RequestException:
                    tryCount += 1
                    if tryCount > 5:
                        print("\nConnection failed.\n")
                        break
            if response is None:
                continue  # skip this query, keep going with the rest

            # Parse the results page into links / titles / descriptions.
            html = BeautifulSoup(response.text, "html.parser")
            result_div = html.find_all('div', attrs={'class': 'ZINbbc'})
            links = []
            titles = []
            descriptions = []
            for r in result_div:
                # Skip results missing any of the three elements (find()
                # returns None, which raises on attribute/subscript access).
                try:
                    link = r.find('a', href=True)
                    title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
                    description = r.find('div', attrs={'class': 's3v9rd'}).get_text()
                    # Make sure everything is present before appending.
                    if link != '' and title != '' and description != '':
                        links.append(link['href'])
                        titles.append(title)
                        descriptions.append(description)
                except (AttributeError, TypeError):
                    continue

            savedResultsRotation[i].append(titles)
            searchSetResults.append([links, titles, descriptions])

        with open("query-results.pickle", "wb") as fp:
            pickle.dump(searchSetResults, fp)

        # Rotation schedule: each call appends one title set per query; once a
        # query's window holds 8 sets, drop the oldest so new searches are
        # always compared against the previous 8 — results that linger on the
        # results page aren't reported as new.
        # FIX: guard against an empty query list before indexing [0].
        if savedResultsRotation and len(savedResultsRotation[0]) == 8:
            for i in range(len(savedResultsRotation)):
                savedResultsRotation[i].pop(0)
        with open("query-results-rotating-list.pickle", "wb") as fp:
            pickle.dump(savedResultsRotation, fp)
    # if mainMenuAfter == True:
    #     mainMenu(False)
    print("done")
    return searchSetResults
# Il y a 42 pages de communiqués, et la première page est la page 0. Ainsi, j'ai créé une liste de nombre allant de 0 à 41. C'est pour cette raison qu'elle se finit par 42. pages = list(range(0,42)) # Création de ma première boucle. Ici je viens créer un lien pour chaque page consultée. for page in pages: urlpage = url + str(page) # print(urlpage) # Création de requêtes pour aller chercher les 42 pages. sites = requests.get(urlpage, headers=entetes) pages2 = BeautifulSoup(sites.text, "html5lib") articles = pages2.find_all("li", class_="search-result") for article in articles: # Je vais chercher la date pour aller l'inclure dans mon fichier CSV. date = article.find("span", class_="search-result-date").text.strip() # Je vais également inclure le titre du communiqué. titrecommunique = article.find("a", class_="search-result-title").text.strip() listesujets = [] listesujets.append(date) listesujets.append(titrecommunique) # Parce que les urls que l'on retrouve dans le code source ne sont que la fin de l'URL requis, j'ai pris la première partie de l'URL pour par la suite inclure la partie retrouvée sur le code. urldebut = "https://lautorite.qc.ca" urlfin = article.find("a", class_="search-result-title")["href"] urlfinal = urldebut + urlfin # Ajout de l'URL complet dans ma liste. listesujets.append(urlfinal)
def get_github_info(url="", title="", ts="", tag="", max_redirects=30, proxy=None,
                    root_dir="data/githubcom", isnew=False, retry=3, timeout=10):
    """Parse a GitHub org/user overview page and return its metadata.

    :param url: page URL; must contain https://github.com/<id>
    :param title: page title (stored verbatim, newline-stripped)
    :param ts: timestamp, stored as-is
    :param tag: tag, stored as-is
    :param max_redirects: passed through to get_request()
    :param proxy: passed through to get_request()
    :param root_dir: cache directory for downloaded HTML
    :param isnew: force re-download even if a cached copy exists
    :param retry: passed through to get_request()
    :param timeout: passed through to get_request()
    :return: overview dict, or None when the URL doesn't match / is a known 404
    """
    # Load the set of URLs previously recorded as 404 so they are skipped.
    file_404 = path("data/github_404")
    urls_404 = set()
    if os.path.exists(file_404):
        with codecs.open(file_404, mode='rb') as fr:
            for line in fr:
                line = line.strip()
                if line:
                    urls_404.add(line)
    # Extract the root profile URL and the account id from the input URL.
    pattern = "(https://github.com/([^/]+))"
    match = re.search(pattern, url)
    overview = {}
    overview['title'] = strip_n(title)
    overview["url"] = url
    overview['ts'] = ts
    overview['tag'] = tag
    if match:
        url_root, github_id = match.groups()
        overview["github_id"] = github_id
        if url_root in urls_404:
            return
    else:
        return
    root_dir = path(root_dir)
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)
    # Cache file name: md5 of the URL.
    hl = hashlib.md5()
    hl.update(url.encode(encoding='utf-8'))
    fname = path(root_dir, "%s.html" % hl.hexdigest())
    if isnew or not os.path.exists(fname):
        get_request(url_root, proxy=proxy, fname=fname, fname_404=file_404,
                    retry=retry, timeout=timeout, max_redirects=max_redirects)
    if os.path.exists(fname):
        with codecs.open(fname, mode='rb') as fr:
            try:
                soup = BeautifulSoup(fr, 'lxml')
            except Exception as e:
                logging.error("GET title of %s failed : %s" % (url, repr(e)))
                return
        # 1. find org-description
        org_sub = soup.find("p", class_='org-description')
        if org_sub:
            org_sub = org_sub.next_sibling
            if org_sub:
                org_sub = org_sub.get_text()
                org_sub = strip_n(org_sub)
        overview["org_profile"] = org_sub
        # 2. find geo and url
        org_meta = soup.find("ul", class_=re.compile("org-header-meta"))
        org_url = None
        org_geo = None
        if org_meta:
            org_url = org_meta.find("a")
            if org_url:
                org_url = strip_n(org_url.get("href"))
            org_geo = org_meta.find("li", class_=re.compile('meta-item v-align-middle'))
            if org_geo:
                org_geo = strip_n(org_geo.get_text())
        overview["org_url"] = org_url
        overview["org_geo"] = org_geo
        # 3. repos#people#project — org tab labels like "Repositories 1.2k";
        # the count is normalized ("," -> ".", k/m suffixes expanded).
        for aa in soup.find_all("a", class_=re.compile(r'pagehead-tabs-item')):
            aa = aa.get_text()
            aa = strip_n(aa)
            if aa:
                parts = re.split("\s+", aa)
                if len(parts) == 2:
                    t = re.sub(',', '.', parts[1])
                    p = re.match('(\d+\.*\d*)([km]*)', t)
                    if p:
                        n, d = p.groups()
                        if d == 'k':
                            t = float(n) * 1000
                        elif d == 'm':
                            t = float(n) * 1000000
                        else:
                            # A dot here came from a thousands separator:
                            # scale back to the full integer value.
                            pos = n.find('.')
                            if pos != -1:
                                t = int(float(n) * pow(10, len(n[n.find('.') + 1:])))
                        overview["org_%s" % parts[0].lower()] = t
        # 4. star forks
        overview["repo_star"] = 0
        overview["repo_forks"] = 0
        repo_language = set()
        # repo — collect the language names of the pinned repositories.
        for aa in soup.find_all('span', class_=re.compile('repo-language-color')):
            aa_p = aa.parent
            if aa_p:
                aa_p = strip_n(aa_p.get_text())
                if aa_p:
                    p = re.split(r'\s+', aa_p)
                    if len(p) > 0:
                        repo_language.add(p[0])
        if repo_language:
            repo_language = ",".join(repo_language)
        else:
            repo_language = ""
        overview["repo_lang"] = repo_language
        # Pinned-item counters: keep the maximum star and fork counts seen.
        for aa in soup.find_all("a", class_="pinned-item-meta muted-link"):
            t = strip_n(aa.get_text())
            if re.match('^\d+$', t):
                t = int(t)
            else:
                t = re.sub(',', '.', t)
                p = re.match('(\d+\.*\d*)([km])', t)
                if p:
                    n, d = p.groups()
                    if d == 'k':
                        t = float(n) * 1000
                    elif d == 'm':
                        t = float(n) * 1000000
                    else:
                        t = int(n)
                else:
                    continue
            star = aa.find("svg", class_="octicon octicon-star")
            if star:
                if t > overview["repo_star"]:
                    overview["repo_star"] = t
            else:
                forks = aa.find("svg", class_="octicon octicon-repo-forked")
                if forks:
                    if t > overview["repo_forks"]:
                        overview["repo_forks"] = t
        # 5. languages
        overview["org_lang"] = []
        for aa in soup.find_all("a", class_="no-wrap text-gray d-inline-block muted-link mt-2"):
            t = strip_n(aa.get_text())
            overview["org_lang"].append(t)
        overview["org_lang"] = ",".join(overview["org_lang"])
        # No org description/geo/url found => treat the page as a personal
        # profile rather than an organization.
        overview["github_type"] = 1  # 1: org 0:private
        if not (org_sub or org_geo or org_url):
            overview["github_type"] = 0
        # repos #stars #followers#following — personal-profile nav counters.
        for aa in soup.find_all("a", class_=re.compile('UnderlineNav-item')):
            aa = aa.get_text()
            aa = strip_n(aa)
            if aa:
                parts = re.split("\s+", aa)
                if len(parts) == 2:
                    t = re.sub(',', '.', parts[1])
                    p = re.match('(\d+\.*\d*)([km]*)', t)
                    if p:
                        n, d = p.groups()
                        if d == 'k':
                            t = int(float(n) * 1000)
                        elif d == 'm':
                            t = int(float(n) * 1000000)
                        else:
                            t = int(n)
                        overview["p_%s" % parts[0].lower()] = t
        # Personal bio
        p_profile = soup.find("div", class_=re.compile("p-note user-profile-bio"))
        if p_profile:
            p_profile = strip_n(p_profile.get_text())
            overview["p_profile"] = p_profile
        # Company
        p_company = soup.find("span", class_=re.compile("p-org"))
        if p_company:
            p_company = strip_n(p_company.get_text())
            overview["p_company"] = p_company
        # Location
        p_loc = soup.find("span", class_=re.compile("p-label"))
        if p_loc:
            p_loc = strip_n(p_loc.get_text())
            overview["p_loc"] = p_loc
        # url — first absolute link in the editable profile area.
        p_url = soup.find("div", class_=re.compile('js-profile-editable-area'))
        if p_url:
            p_url = p_url.find_all("a")
            if p_url:
                for p_url_i in p_url:
                    p_url_i = p_url_i.get("href")
                    if p_url_i.startswith("http"):
                        p_url = p_url_i
                        break
            else:
                p_url = ""
        if not p_url:
            p_url = None
        overview["p_url"] = strip_n(p_url)
        # organizations
        github_org = soup.find("a", class_='avatar-group-item')
        if github_org:
            github_org = github_org.get("href")
            overview["p_github_org"] = strip_n(github_org)
        return overview
def scrape():
    """Scrape Mars news, JPL featured image, weather tweet, facts table and
    hemisphere images; return everything in one dict.

    Keys: news_title, news_description, featured_image_url, mars_weather,
    mars_facts (HTML table), full_url, full_image_link.
    """
    browser = init_browser()
    listings = {}

    # --- Latest NASA Mars news -------------------------------------------
    browser.visit('https://mars.nasa.gov/news/')
    time.sleep(3)  # let the JS-rendered list load
    soup = BeautifulSoup(browser.html, "html.parser")
    result = soup.find('li', class_='slide')
    listings["news_title"] = result.find(class_='content_title').get_text()
    listings["news_description"] = result.find(
        class_='rollover_description_inner').get_text()

    # --- JPL featured image ----------------------------------------------
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/')
    time.sleep(3)
    soup = BeautifulSoup(browser.html, 'html.parser')
    carousel = soup.find(class_='carousel_items')
    listings["featured_image_url"] = carousel.find(
        class_='button fancybox')['data-fancybox-href']

    # --- Current Mars weather (latest @MarsWxReport tweet) ---------------
    browser.visit('https://twitter.com/marswxreport?lang=en')
    time.sleep(3)
    soup = BeautifulSoup(browser.html, 'html.parser')
    content = soup.find(class_='content')
    listings["mars_weather"] = content.find(
        'p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text

    # --- Mars facts table ------------------------------------------------
    # pandas reads every <table> on the page; index 1 is the facts table.
    # (FIX: dropped the dead `mars_facts = {}` placeholder that was
    # immediately overwritten.)
    listings["mars_facts"] = pd.read_html('https://space-facts.com/mars/')[1].to_html()

    # --- Mars hemisphere images ------------------------------------------
    hemisphere_url = ('https://astrogeology.usgs.gov/search/results'
                      '?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.visit(hemisphere_url)
    time.sleep(3)
    soup = BeautifulSoup(browser.html, 'html.parser')
    full_urls = []
    full_links = []
    # FIX: the loop variables no longer shadow the JPL `image_url`/`image_html`
    # names used above.
    for description in soup.find_all('div', class_='description'):
        detail_url = f"https://astrogeology.usgs.gov{description.find('a')['href']}"
        full_urls.append(detail_url)
        browser.visit(detail_url)
        time.sleep(3)
        detail_soup = BeautifulSoup(browser.html, 'html.parser')
        image_path = detail_soup.find('img', class_='wide-image')['src']
        full_links.append(f'https://astrogeology.usgs.gov{image_path}')
    listings["full_url"] = full_urls
    listings["full_image_link"] = full_links

    print(listings)
    return listings
import requests
from bs4 import BeautifulSoup
from csv import writer

# Scrape company names and links from the HackerEarth companies page and
# save them to a CSV file (one row per company).
source = requests.get("https://www.hackerearth.com/companies/")
soup = BeautifulSoup(source.text, "html.parser")
cards = soup.find_all(class_='company-card-container')

# csv writer — newline='' prevents blank rows on Windows (csv module docs).
with open(
        '/home/chiraghs/my_codes/Web_scraping/Company_names with links/names.csv',
        'w', newline='') as f:
    out = writer(f)
    # FIX: the original header declared three columns ('Title', 'Location ',
    # 'Link') but each row only ever wrote two values; keep the header in
    # sync with the rows actually written.
    out.writerow(['Title', 'Link'])
    # Write one row per company card.
    for item in cards:
        # print(item)
        print("\n")
        title = item.find(class_='name ellipsis').get_text().replace('\n', "")
        link = item['link']
        print(title + " : " + link)
        out.writerow([title, link])
def get_text(url):
    """Download *url* and return the text of all its <p> elements, joined
    with single spaces."""
    page = urlopen(url)
    # FIX: name the parser explicitly — BeautifulSoup warns and picks an
    # environment-dependent parser when the features argument is omitted.
    soup = BeautifulSoup(page, "html.parser")
    fetched_text = ' '.join(p.text for p in soup.find_all('p'))
    return fetched_text
def run(self):
    """Log into digikala.com with the credentials from the UI, walk every
    order page, and fill the UI tables with per-item rows plus totals
    (overall spend, shipping, discounts, purchase counts).

    All user-facing messages are appended to self.UI.log (Persian strings).
    """
    # Validate the credentials entered in the UI before starting.
    if self.UI.username.text() == '':
        self.UI.log.append('لطفا ایمیل خود را وارد کنید')
        return
    if self.UI.password.text() == '':
        self.UI.log.append('لطفا پسورد خود را وارد کنید')
        return
    self.UI.log.append('شروع')

    def dkprice_to_numbers(dkprice):
        '''gets something like ۱۱۷،۰۰۰ تومان and returns 117000'''
        # Map Eastern-Arabic digits to ASCII digits.
        convert_dict = {u'۱': '1', u'۲': '2', u'۳': '3', u'۴': '4', u'۵': '5',
                        u'۶': '6', u'۷': '7', u'۸': '8', u'۹': '9', u'۰': '0', }
        # Leading ۰ guarantees at least one digit survives the cleanup below.
        price = u'۰' + dkprice
        for k in convert_dict.keys():
            price = re.sub(k, convert_dict[k], price)
        price = re.sub('[^0-9]', '', price)
        return int(price)

    def extract_data(one_page, all_orders, all_post_prices):
        # Parse one order-detail page: append a (date, name, num, price,
        # discount) tuple per item and the page's shipping price.
        soup = BeautifulSoup(one_page.text, 'html.parser')
        # there might be more than one table
        for this_table in soup.find_all('div', class_='c-table-order__body'):
            for this_item in this_table.find_all('div', class_='c-table-order__row'):
                name = this_item.find('span').get_text()
                dknum = this_item.find(
                    'div', class_='c-table-order__cell--value').get_text()
                num = dkprice_to_numbers(dknum)
                dkprice = this_item.find(
                    'div', class_='c-table-order__cell--price-value').get_text()
                price = dkprice_to_numbers(dkprice)
                dkdiscount = this_item.find(
                    'div', class_='c-table-order__cell c-table-order__cell--discount').get_text()
                discount = dkprice_to_numbers(dkdiscount)
                # Order date sits in the page heading; strip the
                # "registered on" prefix.
                date = soup.find('h4').span.get_text()
                date = re.sub(u'ثبت شده در تاریخ ', '', date)
                all_orders.append((date, name, num, price, discount))
        # Shipping price is the 4th summary column of the page.
        dkpost_price = soup.find_all('div', class_='c-table-draught__col')[3].get_text()
        post_price = dkprice_to_numbers(dkpost_price)
        all_post_prices.append(post_price)

    self.UI.log.append('تلاش برای ورود')
    url = 'https://www.digikala.com/users/login/'
    payload = {'login[email_phone]': self.UI.username.text(),
               'login[password]': self.UI.password.text(),
               'remember': 1}
    session = requests.session()
    r = session.post(url, data=payload)
    if r.status_code != 200:
        self.UI.log.append('مشکل در اتصال. کد خطا: %s' % r.status_code)
        return
    # Login succeeded iff the "my orders" text appears in the response.
    successful_login_text = 'سفارشهای من'
    if re.search(successful_login_text, r.text):
        self.UI.log.append('لاگین موفق')
    else:
        self.UI.log.append('کلمه عبور یا نام کاربری اشتباه است')
        return
    page_number = 1
    orders = session.get(
        'https://www.digikala.com/profile/orders/?page=%i' % page_number)
    soup = BeautifulSoup(orders.text, 'html.parser')
    all_orders = []  # (list of (date, name, number, item_price))
    all_post_prices = []  # list of post prices
    # Walk the paginated order list until the "empty profile" marker appears.
    while not soup.find('div', class_='c-profile-empty'):
        for this_order in soup.find_all('a', class_='btn-order-more'):
            this_order_link = this_order.get('href')
            print('going to fetch: http://digikala.com' + this_order_link)
            one_page = session.get('http://digikala.com' + this_order_link)
            extract_data(one_page, all_orders, all_post_prices)
        self.UI.log.append('بررسی صفحه %i' % page_number)
        page_number += 1
        orders = session.get(
            'https://www.digikala.com/profile/orders/?page=%i' % page_number)
        soup = BeautifulSoup(orders.text, 'html.parser')
    self.UI.log.append('پایان')
    # Aggregate totals and fill the per-item table in the UI.
    total_price = 0
    total_purchase = 0
    full_purchase_list = ''
    n = 0
    total_post_price = 0
    total_discount = 0
    self.UI.output_general.setRowCount(len(all_orders))
    for date, name, num, price, discount in all_orders:
        this_purchase_str = "تاریخ %s: %s عدد %s, به قیمت هر واحد %s\n" % (
            date, num, name, price)
        full_purchase_list = this_purchase_str + full_purchase_list
        # Line total = unit price x quantity, minus the line discount.
        this_product_total_price = (price * num) - discount
        total_price += this_product_total_price
        total_purchase += 1
        total_discount += discount
        self.UI.output_general.setItem(n, 0, QTableWidgetItem(str(date)))
        self.UI.output_general.setItem(n, 1, QTableWidgetItem(str(num)))
        self.UI.output_general.setItem(n, 2,
                                       QTableWidgetItem(str(this_product_total_price)))
        self.UI.output_general.setItem(n, 3, QTableWidgetItem(str(discount)))
        self.UI.output_general.setItem(n, 4, QTableWidgetItem(str(name)))
        n = n + 1
    purchase_count = len(all_post_prices)
    for post_price in all_post_prices:
        total_post_price += post_price
    # Publish the summary lines to the result list widget.
    self.UI.output_result.clear()
    price_item = ['کل خرید شما از دیجی کالا: {} تومان'.format(total_price)]
    total_post_price_item = ['مجموع هزینه ی پست: {} تومان'.format(total_post_price)]
    total_discount_item = ['مجموع تخفیفات دریافتی: {} تومان'.format(total_discount)]
    purchase_item = ['تعداد خرید: {} قطعه'.format(total_purchase)]
    purchase_count_item = ['دفعات خرید: {} بار'.format(purchase_count)]
    self.UI.output_result.addItems(price_item)
    self.UI.output_result.addItems(total_post_price_item)
    self.UI.output_result.addItems(total_discount_item)
    self.UI.output_result.addItems(purchase_item)
    self.UI.output_result.addItems(purchase_count_item)
# doc_to_docx(file_path); if not os.path.isdir(file_path) and os.path.splitext(file_path)[1] == '.docx' \ and file[0:2] != '~$' and file[0:2] != '.~': filepaths.append(file_path) # print(oldnames) if not os.path.exists(path + 'result/'): os.mkdir(path + 'result/') for filepath in filepaths: document = ZipFile(filepath) xml = document.read("word/document.xml") wordObj = BeautifulSoup(xml.decode("utf-8")) # 插入:w:ins,删除:w:del,移动 :w:moveFrom,w:moveTo,设置格式:w:rFonts,批注:w:commentRangeEnd paras = wordObj.find_all("w:p") oldfilename = filepath.split('/')[-1] first = oldfilename.split('_')[0] firstA = oldfilename.split('_')[0] + '_修改前后对照表.docx' # 存入的文档 doc = docx.Document() # 标题 p = doc.add_paragraph() p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER run = p.add_run('《' + first + '》' + '\n' + '修订前后对照表') run.font.name = '宋体' run.font._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体') run.font.size = docx.shared.Pt(14)
# NOTE(review): fragment — `soup`, `baseDir` and `query` are defined earlier
# in the file.
# Build the list of absolute city-page links from the body content.
minor_list = []
data = soup.find('div', attrs={'class': 'bodyContent'})
for a in data.find_all('a', href=True):
    link = (r'http://www.cushmanwakefield.us' + a['href'])
    minor_list.append(link)
print(minor_list)
# list of links to scrape by city
for item in minor_list:
    # NOTE(review): a new Chrome instance is opened per city and never closed.
    browser2 = webdriver.Chrome()
    browser2.get(item)
    HTML2 = browser2.execute_script("return document.body.innerHTML")
    soup2 = BeautifulSoup(HTML2, 'html.parser')
    data = soup2.find('div', attrs={'class': 'm-box highlight lightGrey'})
    for link in soup2.find_all('a', href=True):
        href = link['href']
        # Only download links that point at PDF files.
        if any(href.endswith(x) for x in ['.pdf']):
            print(href)
            file_name = href.split('/')[-1]
            print(file_name)
            print(file_name)
            remote_file = requests.get(href)
            os.makedirs(os.path.join(baseDir, query), exist_ok=True)
            # Stream the PDF to disk in 1 KiB chunks.
            with open(os.path.join(baseDir, query, file_name), 'wb') as f:
                for chunk in remote_file.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            print('saved: ' + href)
__author__ = 'wilsonincs'
"""Display the price and date at closing for the apple stock"""
import urllib2
from bs4 import BeautifulSoup
import pprint as p

# Historical-prices page for AAPL on Yahoo Finance.
aapl_url = "http://finance.yahoo.com/q/hp?s=AAPL+Historical+Prices"
soup = BeautifulSoup(urllib2.urlopen(aapl_url).read())

# A data row has exactly 7 right-aligned data cells; column 0 holds the
# date and column 6 the closing price.
closing_data = [
    (row.contents[0].get_text(), row.contents[6].get_text())
    for row in soup.find_all('tr')
    if len(row.find_all('td', {'class': 'yfnc_tabledata1', 'align': 'right'})) == 7
]
p.pprint(closing_data)
# -*- coding: utf-8 -*- import requests from bs4 import BeautifulSoup r = requests.get("http://www.onlinedown.net/hits/windows/2/") soup = BeautifulSoup(r.text, 'lxml') # print(soup, type(soup)) print (len(soup.find_all("a", "title"))) for a in soup.find_all("a", "title"): print (a.text) print (len(soup.find_all("span", "size"))) for a in soup.find_all("span", "size"): print(a.text) print (len(soup.find_all("span", "lan"))) for a in soup.find_all("span", "lan"): print(a.text) print (len(soup.find_all("span", "pop"))) for a in soup.find_all("span", "pop"): print(a.text) print (len(soup.find_all("span", "dro"))) for a in soup.find_all("span", "dro"): print(a.text) print (len(soup.find_all("span", "time"))) for a in soup.find_all("span", "time"):
def run(self) :
    """Fetch a proxy-list page, extract candidate proxies per table row,
    then probe each proxy against a set of target URLs.

    NOTE(review): a leftover `sys.exit(1)` debug abort inside the parsing
    loop makes everything after it dead code, and in the active code path
    `proxy_info` is never populated (the append only exists in the
    commented-out variant).
    """
    # Headless X display; the Firefox instance started here is never used
    # below — presumably a leftover.  TODO confirm.
    display = Display(visible=0 , size=(1024,768))
    display.start()
    browser = webdriver.Firefox()
    requests.adapters.DEFAULT_RETRIES = 5
    headers = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"}
    is_continue = True
    retries = 0
    page_html = ""
    # Fetch the proxy-list home page, retrying up to 5 times with a 5 s
    # pause between attempts.
    while is_continue :
        response = None
        try :
            print(self.home_page)
            response = requests.get(self.home_page , timeout=10 , headers=headers)
            page_html = response.text
            if "" != page_html :
                break
        except Exception as e :
            logger.error(str(e))
        retries += 1
        logger.error("Retry: " + str(retries))
        if 5 < retries :
            break
        else :
            time.sleep(5)
            continue
    # Parse the <tbody>/<tr> rows out of the fetched page.
    soup = BeautifulSoup(str(page_html) , "lxml")
    table_match = soup.find_all(name="tbody")
    table_soup = BeautifulSoup(str(table_match) , "lxml")
    items_match = table_soup.find_all(name="tr")
    proxy_info = []
    print(page_html)
    for item in items_match :
        # Old span-based extraction kept as an inert string literal:
        """
        ip_rex = ">\s*(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*</span>"
        port_rex = ">\s*(\d{1,5})\s*</span>"
        used_time_rex = ">\s*(\d{1,3})天\s*</span>"
        ip_match = re.findall(re.compile(ip_rex) , str(item))
        port_match = re.findall(re.compile(port_rex) , str(item))
        used_time_match = re.findall(re.compile(used_time_rex) , str(item))
        if 1==len(ip_match) and 1==len(port_match) and 1==len(used_time_match) :
            proxy_info.append([ip_match[0] , port_match[0] , used_time_match[0]])
        else :
            logger.error("ip_match: " + str(ip_match))
            logger.error("port_match: " + str(port_match))
            logger.error("used_time_match: " + str(used_time_match))
        """
        # Regexes extract ip / port / network type / response time from the
        # raw <td> markup of a row.
        ip_rex = ">\s*(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*</td>"
        port_rex = ">\s*(\d{1,5})\s*</td>"
        net_type_rex = ">\s*([A-Z]{1,10})\s*</td>"
        response_time_rex = ">([0-9]{0,1}\.{0,1}[0-9]{0,1})秒</td>"
        ip_match = re.findall(re.compile(ip_rex) , str(item))
        port_match = re.findall(re.compile(port_rex) , str(item))
        net_type_match = re.findall(re.compile(net_type_rex) , str(item))
        response_time_match = re.findall(re.compile(response_time_rex) , str(item))
        print(ip_match)
        print(port_match)
        print(net_type_match)
        print(response_time_match)
        # NOTE(review): debug abort — terminates the process after the first
        # row; all code below never runs until this is removed.
        sys.exit(1)
    # Sort proxies by their third field (descending).
    proxy_info = sorted(proxy_info , key=lambda info:int(info[2]) , reverse=True)
    check_target_url = [
        "https://www.baidu.com/" ,
        "http://www.jd.com/" ,
        "https://www.zhihu.com/" ,
        "http://www.bilibili.com/" ,
        "https://www.taobao.com/" ,
    ]
    urls_match_proxies = {}
    for url in check_target_url :
        # NOTE(review): debug override — every iteration tests jd.com only,
        # so the dict ends up with a single key.
        url = "http://www.jd.com/"
        url_matches_proxies = []
        for pinfo in proxy_info :
            # checkProxy returns the network types the proxy supports for
            # this URL (1 or 2 entries on success).
            net_types = self.checkProxy(pinfo[0] , pinfo[1] , url)
            if 1 == len(net_types) :
                url_matches_proxies.append([net_types[0] , pinfo[0] , pinfo[1]])
            elif 2 == len(net_types) :
                url_matches_proxies.append([net_types[0] , net_types[1] , pinfo[0] , pinfo[1]])
            else :
                logger.error("No proxies matched.")
                continue
        urls_match_proxies[url] = url_matches_proxies
    # Print the working proxies per target URL.
    for url,proxies in urls_match_proxies.items() :
        print(url)
        for p in proxies :
            print("\t\t" + str(p))
# https://www.sample.net/business/finance/invoice/ # https://www.freshbooks.com/invoice-templates/pdf #マニュアルのPDFに入るための共通URL "manual/xxx" このXXXで取得するPDFが変わる download_urls = [] BASE_URL = "https://sega.jp/mdmini/manual/" #"tmp_folder"がないことを確認し、"tmp_folder"を作成する if os.path.exists("tmp_folder") == False: os.mkdir("tmp_folder") #サーバからHTML、XMLなどの情報を取得するのに使用 html = requests.get("https://sega.jp/mdmini/manual/index.html") soup = BeautifulSoup(html.text, "lxml") #”a”タグをすべて抽出し、linksというリストを作ります。 links = soup.find_all("a") for link in links: h_ref = link.get("href") if h_ref and ".pdf" in h_ref: download_urls.append(h_ref) for download_url in download_urls: file_name = download_url.split("/")[-1] r = requests.get(BASE_URL + download_url) time.sleep(1) if r.status_code == 200: with open(os.path.join("tmp_folder", file_name), "wb") as f:
class Scrawler(object):
    """Scraper for tianyancha.com company pages.

    Holds a requests session, browser-like headers (including a caller-supplied
    cookie), and a rotating HTTP proxy obtained from the mogumiao proxy API.
    Typical flow: parse_urls() -> parse_url_content() -> parse_*_info().
    """

    # Class-level counter: build_scrawler() hands out cookies[scrawler_num]
    # to each successive instance.
    scrawler_num = 0

    def __init__(self, cookies=""):
        """Create a session, install headers/cookie, and fetch a proxy.

        cookies -- cookie string sent with every tianyancha request.
        """
        self.session = requests.session()
        self.headers = {
            'Host': 'www.tianyancha.com',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Referer': 'https://www.tianyancha.com/',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # BUG FIX: the original read the undefined name `cookie` here
            # (NameError unless a stray global existed) and silently ignored
            # the `cookies` parameter. Use the parameter that was passed in.
            'cookie': cookies,
        }
        self.soup = None      # BeautifulSoup tree of the last parsed company page
        self.url_id = None    # caller-supplied id of the last parsed URL
        self.proxies = None   # requests-style proxy dict, filled by set_proxy()
        self.set_proxy()

    @classmethod
    def build_scrawler(cls, cookies):
        """Alternate constructor: build an instance using the next cookie in
        *cookies* (indexed by the shared class counter)."""
        s = cls(cookies[cls.scrawler_num])
        cls.scrawler_num += 1
        return s

    def set_proxy(self):
        """Fetch one proxy from the mogumiao API and store it in self.proxies.

        Raises Exception when the API does not answer with HTTP 200.
        """
        url = "http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=3873239366fb4548a227fcbf310862ba&count=1&expiryDate=0&format=1&newLine=2"
        resp = self.session.get(url)
        if resp.status_code == 200:
            resp_json = json.loads(resp.text)
            proxy_ip = resp_json['msg'][0]['ip']
            proxy_port = resp_json['msg'][0]['port']
            proxy_meta = "%(ip)s:%(port)s" % {
                "ip": proxy_ip,
                "port": proxy_port,
            }
            self.proxies = {
                "http": "http://" + proxy_meta,
                "https": "https://" + proxy_meta,
            }
            print("Connect: Set Proxy %s" % proxy_meta)
            print()
        else:
            raise Exception

    def set_cookie(self, cookie):
        """Replace the cookie header used for subsequent requests."""
        self.headers['cookie'] = cookie

    def get_current_ip(self):
        """Return the httpbin.org view of our outgoing IP (sanity-checks the
        proxy actually took effect)."""
        url = "https://httpbin.org/ip"
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        }
        resp = requests.get(url, headers=headers, proxies=self.proxies, timeout=10)
        print("Connect: Current IP %s" % resp.text.strip())
        return resp

    def req_get(self, url):
        """GET *url* through the configured headers and proxy."""
        resp = requests.get(url, headers=self.headers, proxies=self.proxies, timeout=10)
        return resp

    def parse_urls(self, page_no):
        """Return the company-detail URLs on search-result page *page_no*
        (search term: 科技金融, URL-encoded)."""
        if page_no == 1:
            url = "https://www.tianyancha.com/search?key=%E7%A7%91%E6%8A%80%E9%87%91%E8%9E%8D"
        else:
            url = "https://www.tianyancha.com/search/p" + str(page_no) + "?key=%E7%A7%91%E6%8A%80%E9%87%91%E8%9E%8D"
        resp = self.session.get(url)
        urls_soup = BeautifulSoup(resp.content, "html5lib")
        urls = []
        url_list = urls_soup.find_all('div', attrs={'class': 'search_result_single'})
        for li in url_list:
            urls.append(li.find('a')['href'])
        return urls

    def parse_url_content(self, url, url_id):
        """Download a company page and cache its soup + id on the instance
        for the parse_*_info() methods below."""
        print('Connect: GET %s' % url)
        resp = self.req_get(url)
        print('Parsing: Company Info')
        print()
        self.soup = BeautifulSoup(resp.content, "html5lib")
        self.url_id = url_id

    def parse_company_info(self):
        """Extract name/address/intro/status from the cached company page.

        Returns a list of rows; row 0 is the header row.
        """
        data = [['url_id', 'company_name', 'company_address', 'company_intro', 'company_status']]
        header = self.soup.find('div', attrs={'class': 'company_header_width'})
        company_name = header.h1.get_text()
        company_info = header.find_all('div', attrs={'class': ['f14', 'sec-c2']})
        # NOTE(review): positional .contents indexing mirrors the live page
        # markup at scrape time — brittle if tianyancha changes layout.
        company_address = company_info[1].contents[1].contents[1].get_text()
        company_intro = header.find('script', attrs={'id': 'company_base_info_detail'})
        if company_intro:
            company_intro = company_intro.get_text().strip()
        else:
            company_intro = '暂无信息'
        company_info2 = self.soup.find_all('div', attrs={'class': 'baseinfo-module-content-value'})
        if company_info2:
            company_status = company_info2[2].get_text()
        else:
            company_status = '暂无信息'
        data.append([self.url_id, company_name, company_address, company_intro, company_status])
        return data

    def parse_corporate_info(self):
        """Follow the legal-representative ("human") link on the cached page
        and extract the companies that person is associated with.

        Returns a list of rows; row 0 is the header row. When no human link
        exists a single placeholder row of '-' values is returned.
        """
        data = [['url_id', 'corporate_name', 'company_role', 'company_name',
                 'company_province', 'company_date', 'company_capital', 'company_status']]
        corporate_info = self.soup.find('div', attrs={'class': 'human-top'})
        if corporate_info and ('human' in corporate_info.a['href']):
            corporate_info = corporate_info.a
            corporate_name = corporate_info.get_text()
            corporate_link = corporate_info['href']
            print('Connect: GET %s' % corporate_link)
            resp = self.req_get(corporate_link)
            print('Parsing: Corporate Info')
            print()
            corporate_soup = BeautifulSoup(resp.content, "html5lib")
            companies = corporate_soup.find('div', attrs={'id': '_container_syjs'}).table.tbody.find_all('tr')
            for i in range(0, len(companies)):
                # Rows that start with a non-empty role cell carry the role in
                # column 0; otherwise the columns shift left by one.
                if companies[i].contents[0].contents[0].get_text():
                    company_role = companies[i].contents[0].contents[0].get_text()
                    company_name = companies[i].contents[1].contents[1].get_text()
                    company_province = companies[i].contents[2].get_text()
                    company_date = companies[i].contents[3].get_text()
                    company_capital = companies[i].contents[4].get_text()
                    company_status = companies[i].contents[5].get_text()
                else:
                    company_name = companies[i].contents[0].contents[1].get_text()
                    company_province = companies[i].contents[1].get_text()
                    company_date = companies[i].contents[2].get_text()
                    company_capital = companies[i].contents[3].get_text()
                    company_status = companies[i].contents[4].get_text()
                data.append([self.url_id, corporate_name, company_role, company_name,
                             company_province, company_date, company_capital, company_status])
        else:
            data.append([self.url_id, '-', '-', '-', '-', '-', '-', '-'])
        return data

    def parse_finacing_info(self):
        """Follow the financing link on the cached page and extract the
        financing-round table.

        Returns a list of rows; row 0 is the header row. Placeholder '-' rows
        are emitted when no financing data is available.
        """
        data = [['url_id', 'company_name', 'finacing_time', 'turn', 'appraisement',
                 'capital', 'propertion', 'invenstors']]
        header = self.soup.find('div', attrs={'class': 'company_header_width'})
        company_name = header.h1.get_text()
        finacing_link = header.contents[2]
        if finacing_link.contents:
            finacing_link = finacing_link.a['href']
            print('Connect: GET %s' % finacing_link)
            resp = self.req_get(finacing_link)
            print('Parsing: Finacing Info')
            print()
            finacing_soup = BeautifulSoup(resp.content, "html5lib")
            finacing_info = finacing_soup.find('div', attrs={'id': '_container_rongzi'})
            if finacing_info:
                finacing_table = finacing_info.tbody.contents
                for tr in finacing_table:
                    finacing_time = tr.contents[1].get_text()
                    turn = tr.contents[2].get_text()
                    appraisement = tr.contents[3].get_text()
                    capital = tr.contents[4].get_text()
                    propertion = tr.contents[5].get_text()
                    invenstors = tr.contents[6].get_text()
                    data.append([self.url_id, company_name, finacing_time, turn,
                                 appraisement, capital, propertion, invenstors])
            else:
                data.append([self.url_id, company_name, '-', '-', '-', '-', '-', '-'])
        else:
            data.append([self.url_id, company_name, '-', '-', '-', '-', '-', '-'])
        return data
from prettytable import PrettyTable


def get_one_page(url):
    """Fetch *url* and return the response body, or None on a non-200 status."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None


# Scrape the maoyan top-movie board (5 pages x 10 entries) into a PrettyTable
# and save the rendered table to the "movielist" file.
table = PrettyTable(["排行", "名称", "主演", "上映时间", "评分"])
# FIX: manage the output file with `with` so it is always closed
# (the original open() handle was never closed).
with open("movielist", 'w') as list_file:
    for num in range(0, 5):
        url = 'http://maoyan.com/board/4' + '?offset=' + str(num * 10)
        html = get_one_page(url)
        soup = BeautifulSoup(html, 'lxml')
        # FIX: use each <dd> element directly; the original re-ran
        # soup.find_all('dd')[i] on every iteration (accidental O(n^2)).
        for item in soup.find_all('dd'):
            index = item.find('i', class_='board-index').get_text()
            name = item.find('p', class_='name').a['title']
            # NOTE(review): .strip('主演:') strips those characters from both
            # ends, not the prefix as a unit — preserved as-is.
            star = item.find('p', class_='star').get_text().strip().strip('主演:')
            releasetime = item.find('p', class_='releasetime').get_text().strip().strip('上映时间:')
            score = item.find('i', class_='integer').get_text() + item.find('i', class_='fraction').get_text()
            table.add_row([index, name, star, releasetime, score])
    print(table)
    list_file.write(str(table))
# NOTE(review): this fragment depends on names defined outside this chunk
# (datalist, IsWhileGo, driver, baseurl, nowdate, endDate, standDate) and is
# cut off mid-statement at the trailing `try:` — it is not runnable alone.
datalist.clear()
listidx = 1
while IsWhileGo:
    # Click every "summary" expander; retry each click until it lands, since
    # elements may not be clickable while the page is still rendering.
    btns = driver.find_elements_by_class_name('summaryBtn')
    for btn in btns:
        while True:
            try:
                time.sleep(0.5)
                btn.click()
                break
            except:
                time.sleep(0.5)
    time.sleep(2)
    # Parse the fully expanded page once with BeautifulSoup.
    bs4 = BeautifulSoup(driver.page_source, 'lxml')
    lis = bs4.find('ul', id='jobNormal').find_all('li')
    smbs = bs4.find_all('div', class_='summaryView')
    summaryidx = 0
    alist = []
    alist.clear()
    titlelist = []
    titlelist.clear()
    for li in lis:
        title = li.find('span', class_='company').get_text().strip()
        href = baseurl + li.find('a')['href']
        # Registration date appears after a ':' in the regDate span.
        dateday = li.find(
            'span', class_='regDate').get_text().split(':')[1].strip()
        # Deduplicate by company title; missing dates default to today.
        if title not in titlelist:
            if dateday == '':
                dateday = nowdate
            # Keep only entries inside the [standDate, endDate] window.
            if endDate >= dateday and dateday >= standDate:
                try:
def forum_topic_fetch(user_agent):
    """Fetch the category URLs from the Naturally Curly forum front page.

    Keyword arguments:
    user_agent -- Using a different user agent than the default python one
        keeps the user from being kicked out by the website
    """
    # Make a get request to retrieve the page
    html_page = requests.get('https://curltalk.naturallycurly.com/',
                             headers={'User-Agent': user_agent})
    soup = BeautifulSoup(html_page.content, 'html.parser')
    link_list_forum_top = []
    for link in soup.find_all('a'):
        link_list_forum_top.append(link.get('href'))
    # FIX: guard against anchors without an href — link.get() returns None
    # there, and `"categories" in None` raises TypeError.
    categories = [s for s in link_list_forum_top
                  if s and ("categories" in s) and ('https' in s)]
    return categories


def signature_fetch(categories, index_of_category_list, user_agent,
                    start_range=0, finish_range=100):
    """Fetch the signatures from a forum topic.

    Keyword arguments:
    categories -- the list of category urls with no page number on them
    index_of_category_list -- which url in the list is desired to be scraped
    user_agent -- using a different user agent than the default python one
        keeps the user from being kicked out by the website
    start_range -- page number to start at (may need several runs in
        ~100-page increments)
    finish_range -- page number to stop at; adjust for very large or small
        topic lists

    Returns a list of {'signature': text} dicts ready for Mongo insertion.
    """
    # Get a list of specific discussion urls
    link_listdiscussion = []
    for i in range(start_range, finish_range):
        url = f'{categories[index_of_category_list]}/p{i}'
        html_page = requests.get(url, headers={'User-Agent': user_agent})
        # Check status code
        status = html_page.status_code
        if status != 200:
            print(f'Error improper response code. Code is {status}')
        # Pass the page contents to beautiful soup for parsing
        soup = BeautifulSoup(html_page.content, 'html.parser')
        # Create a list of discussions on each forum topic page
        for link in soup.find_all('a'):
            link_listdiscussion.append(link.get('href'))
    # FIX: same None-href guard as above before substring tests.
    topics = [s for s in link_listdiscussion
              if s and ("/discussion" in s) and ('https' in s)]
    # Progress hint — this can take minutes depending on the page range.
    print(len(topics))
    # Loop through all the topics found for each category
    list_for_mongo = []
    for topic in topics:
        html_page2 = requests.get(topic, headers={'User-Agent': user_agent})
        # FIX: check THIS response's status. The original tested the stale
        # `status` variable left over from the page loop above, so bad topic
        # responses were never reported (and good ones could be mis-flagged).
        status = html_page2.status_code
        if status != 200:
            print(f'Error improper response code. Code is {status}')
        soup2 = BeautifulSoup(html_page2.content, 'html.parser')
        # FIX: run find_all once per page; the original repeated the full
        # find_all for every signature index.
        signatures = soup2.find_all('div', class_="Signature UserSignature userContent")
        for signature in signatures:
            list_for_mongo.append({'signature': signature.get_text()})
    return list_for_mongo
def ScrapTweets(user):
    """Scrape a Twitter profile page for *user*: save profile fields and
    tweets to ./<user>/<user>.txt, flag possible "predators" by matching
    tweet hashtag/link text against wordlist.txt, download tweet media
    images, and finally e-mail the collected report via mail().

    Depends on globals defined outside this chunk: G/R/W (color codes),
    uReq, bs2json, imageai, mail.

    NOTE(review): this function contains a censorship artifact
    ("..."******"...") in the contact-link section — the original
    `f1.write(... + title)` / `except:` lines were replaced by ******,
    leaving a syntax error that must be restored from upstream history.
    """
    print ('[+]' + G + ' Fetching Data {} From Twitter...'.format(user) + '\n'+W)
    link = "https://twitter.com/" + user
    r=requests.get(link)
    if r.status_code == 200:
        # Fetch the raw profile HTML a second time via urllib (uReq).
        the_client = uReq(link)
        page_html = the_client.read()
        the_client.close()
        # Load the flag-word list: one word per whitespace-separated token.
        f=open('wordlist.txt','r')
        wordlist=[]
        predator=[]
        for i in f:
            for j in i.split():
                wordlist.append(j)
        f.close()
        soup = BeautifulSoup(page_html, 'html.parser')
        # Per-user output directory; ignore "already exists".
        try:
            os.mkdir(user)
        except:
            pass
        f1=open("./{0}/{1}.txt".format(user,user),"w+")
        # Each profile field below is best-effort: a missing element makes the
        # attribute access raise, and the except writes a placeholder line.
        try:
            full_name = soup.find('a', attrs={"class": "ProfileHeaderCard-nameLink u-textInheritColor js-nav"})
            f1.write("\nUser Name : " + str(full_name.text))
        except:
            f1.write("\nUser Name -->"+ R +" Not Found")
        try:
            user_id = soup.find('b', attrs={"class": "u-linkComplex-target"})
            f1.write("\nUser Id : "+str(user_id.text))
        except:
            f1.write("\nUser Id : "+"Not Found")
        try:
            decription = soup.find('p', attrs={"class": "ProfileHeaderCard-bio u-dir"})
            f1.write("\nDescription : "+str(decription.text))
        except:
            f1.write("\nDecription not provided by the user")
        try:
            user_location = soup.find('span', attrs={"class": "ProfileHeaderCard-locationText u-dir"})
            f1.write("\nLocation : " + str(user_location.text.strip()))
        except:
            f1.write("\nLocation not provided by the user")
        try:
            connectivity = soup.find('span', attrs={"class": "ProfileHeaderCard-urlText u-dir"})
            title = connectivity.a["title"]
            # NOTE(review): censored line — see docstring.
            f1.write("\nLink provided by the user : "******"\nNo contact link is provided by the user")
        try:
            join_date = soup.find('span', attrs={"class": "ProfileHeaderCard-joinDateText js-tooltip u-dir"})
            f1.write("\nThe user joined twitter on : " + str(join_date.text))
        except:
            f1.write("\nThe joined date is not provided by the user")
        try:
            birth = soup.find('span', attrs={"class": "ProfileHeaderCard-birthdateText u-dir"})
            birth_date = birth.span.text
            f1.write("\nDate of Birth:"+str(birth_date.strip()))
        except:
            f1.write("\nBirth Date not provided by the user")
        # ProfileNav-value spans hold the numeric counters in page order:
        # [0]=tweets, [1]=following, [2]=followers, [3]=likes, [4]=lists.
        try:
            span_box = soup.findAll('span', attrs={"class": "ProfileNav-value"})
            f1.write("\nTotal tweets : " + span_box[0].text)
        except:
            f1.write("\nTotal Tweets : Zero")
        try:
            f1.write("\nFollowing : " +span_box[1].text)
        except:
            f1.write("\nFollowing : Zero")
        try:
            f1.write("\nFollowers : " + span_box[2].text)
        except:
            f1.write("\nFollowers : Zero")
        try:
            f1.write("\nLikes send by him : " + span_box[3].text)
        except:
            f1.write("\nLikes send by him : Zero")
        try:
            if span_box[4].text != "More ":
                f1.write("\nNo. of parties he is Subscribed to : " + span_box[4].text)
            else:
                f1.write("\nNo. of parties he is Subscribed to : Zero")
        except:
            f1.write("\nNo. of parties he is Subscribed to : Zero")
        f1.write(W)
        # spana is fetched but unused below (NOTE(review): dead variable).
        spana = soup.findAll('span', attrs={"class": "ProfileNav-value"})
        f1.write("\nTweets by "+ str(user) + " are : ")
        # Walk each tweet paragraph, converted to a nested dict by bs2json.
        for tweets in soup.findAll('p', attrs={"class": "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"}):
            tweets=bs2json().convert(tweets)
            try:
                f1.write(tweets['p']['text'])
                f1.write("\n")
            except:
                pass
            '''
            for i in range(len(tweets['p']['a'])-1,len(tweets['p']['a'])):
                try:
                    #print("http://"+str(tweets['p']['a'][i]['text']))
                    response = requests.get("http://"+tweets['p']['a'][i]['text'])
                    soup = BeautifulSoup(response.text, 'html.parser')
                    img_tags = soup.find_all('img')
                    s=str(img_tags).split()
                    print(s)
                    quit()
                except:
                    pass
                    continue
            '''
            # Inspect every anchor inside the tweet: hashtags carry 's'/'b'
            # sub-keys; plain links fall through to the KeyError branch where
            # any /media/ images are downloaded into the user's folder.
            for i in range(0,len(tweets['p']['a'])):
                try:
                    f1.write(str(tweets['p']['a'][i]['s']['text'])+str(tweets['p']['a'][i]['b']['text']))
                    f1.write("\n")
                    if str(tweets['p']['a'][i]['b']['text']) in wordlist:
                        predator.append(user)
                        print(R+"{} May be a Predator".format(user)+W)
                except KeyError as e:
                    try:
                        if str(tweets['p']['a'][i]['text']).split()!=[]:
                            f1.write(tweets['p']['a'][i]['text'])
                            f1.write("\n")
                            response = requests.get("http://"+tweets['p']['a'][i]['text'])
                            soup = BeautifulSoup(response.text, 'html.parser')
                            img_tags = soup.find_all('img')
                            s=str(img_tags).split()
                            media=[]
                            # NOTE(review): these inner loops reuse `i`,
                            # shadowing the anchor index of the outer loop.
                            for i in s:
                                if "/media/" in i:
                                    media.append(i)
                            regex=r'https?:\/\/.*\.(?:png|jpg)'
                            for i in media:
                                matches=re.findall(regex,i)[0]
                                urllib.request.urlretrieve(matches,str(user)+"/"+str(matches[-19:]))
                        else:
                            pass
                    except KeyError as e:
                        pass
                else:
                    pass
                f1.write("\n")
        f1.close()
        print("Fetched Details are Saved at "+"./{0}/{1}.txt".format(user,user))
        # If any flag words matched, recurse into each flagged profile, run
        # the image classifier over downloaded .jpg files, and mail reports.
        if len(predator)>0:
            print(R+"\nPredator Identity Details:\n")
            for i in predator:
                ScrapTweets(i)
                arr=os.listdir("./{}".format(str(i)))
                for j in arr:
                    if re.match(r".+\.jpg",j):
                        if imageai("./{}".format(str(i))+"/"+j) == True:
                            print(R+"{} Is a Predator".format(str(i))+W)
            print("Fetched Details are Saved at "+"./{0}/{1}.txt".format(str(i),str(i)))
            for i in predator:
                print("./{0}/{1}.txt".format(i,i))
                f=open("./{0}/{1}.txt".format(i,i),'r')
                message=f.read()
                f.close()
                #AutoMail Generated
                mail(message)
        else:
            print(R+"\nUser Profile Details:\n"+W)
            print("Fetched Details are Saved at "+"./{0}/{1}.txt".format(user,user))
            f=open("./{0}/{1}.txt".format(user,user),'r')
            f.seek(0)
            message=f.read()
            f.close()
            #AutoMail Generated
            mail(message)
    elif r.status_code == 404:
        print(R+"Error: Profile Not Found")
        exit()
    else:
        print(R+"Error: Something Went Wrong")
        exit()
import bs4, requests
from bs4 import BeautifulSoup

# Scrape the pythonhow.com practice real-estate listing pages.
# NOTE(review): this chunk appears truncated — each page builds dict `d`
# but the visible code never appends it to `l`.
r = requests.get(
    "http://pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/")
c = r.content
soup = BeautifulSoup(c, "html.parser")
#print(soup.prettify())
#cached version URL -> http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s=10.html
# NOTE(review): `all` shadows the builtin; `x` (first listing's price) is
# computed but never used afterwards.
all = soup.find_all("div", {"class": "propertyRow"})
x = all[0].find("h4", {"class", "propPrice"}).text.replace("\n", "")
# Last pagination link's text gives the page count.
page_nr = soup.find_all("a", {"class": "Page"})[-1].text
print(page_nr)
l = []
# Pages are addressed by an offset that advances 10 listings at a time.
for page in range(0, int(page_nr) * 10, 10):
    base_url = "http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
    print(base_url)
    r = requests.get(base_url + str(page) + ".html?v=0")
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    all = soup.find_all("div", {"class": "propertyRow"})
    for item in all:
        d = {}
        d["Address"] = item.find_all("span", {"class", "propAddressCollapse"})[0].text
        d["Price"] = item.find("h4", {
            "class": "propPrice"
        }).text.replace("\n", "")  # iterates through all the prices
        d["Locality"] = item.find_all("span", {"class", "propAddressCollapse"})[1].text
from bs4 import BeautifulSoup
import requests

# Print the text of the rate-related elements on finda's P2P investments page.
response = requests.get("https://www.finda.co.kr/savings/p2p-investments")
soup = BeautifulSoup(response.text, "html.parser")
# FIX: the original called soup.find_all('li', 'h3', 'span'), but find_all's
# positional parameters are (name, attrs, recursive, ...) — so it matched only
# <li class="h3"> elements and silently passed 'span' as `recursive`. Passing
# a list as the single `name` argument matches all three tag types, which is
# what the call was evidently meant to do.
rates = soup.find_all(['li', 'h3', 'span'])
for rate in rates:
    print(rate.get_text())
from bs4 import BeautifulSoup soup = BeautifulSoup(open('alice.html'), 'html.parser') print soup.title print soup.title.string print soup.p print soup.p['class'] print soup.find_all('a') print soup.find(id='link3')
# Strip HTML (and <pre> code blocks) out of each stored question record,
# writing the cleaned records to questions-cleaned.txt one per line.
# NOTE(review): depends on fileDir/questionLocation defined outside this
# chunk, and the trailing answerDicts list belongs to code that continues
# past it. `input` shadows the builtin.
answerLocation = os.path.join(fileDir, '../dataset/answers.txt')
temp = {}
with open(questionLocation, 'r') as input, open('questions-cleaned.txt', 'w') as output:
    lines = input.readlines()
    numberOfQuestions = len(lines)
    counter = 0
    for line in lines:
        counter += 1
        # SECURITY(review): eval() executes whatever is in the file — records
        # are trusted only because the dataset is generated locally; consider
        # ast.literal_eval instead.
        temp = eval(line)
        soup = BeautifulSoup(temp['question'], 'html.parser')
        # Remove any code sections
        codeSents = soup.find_all('pre')
        for codeSent in codeSents:
            codeSent.extract()
        # soup.text returns clean text, free from html tags
        # Cleanup trailing spaces and newline characters
        cleanText = soup.text.encode('ascii', 'ignore')
        cleanText = re.sub(r'(\n)+', ' ', cleanText).rstrip()
        temp.update({'question': cleanText.encode('ascii', 'ignore')})
        output.write(str(temp))
        # No trailing newline after the final record.
        if counter != numberOfQuestions:
            output.write('\n')
answerDicts = []
def main(blog_id, dl_path, fast=False):
    """Download an entire israblog blog (main page, sidebar iframes, board
    list, monthly archives, posts, and comment pages) into *dl_path*,
    rewriting external and internal resources for offline browsing.

    blog_id -- numeric blog identifier on the israblog host.
    dl_path -- destination directory (created if missing).
    fast    -- passed through to dl_and_replace_external_resources;
               semantics defined by that helper (not visible here).

    Relies on module-level helpers/constants defined outside this chunk:
    dl_file, get_url, get_local_path, dl_and_replace_external_resources,
    replace_internal_resources, blog_readlist_regex, INJECT_DIR,
    WORKING_DIR, REFERER, ISRABLOG_HOSTNAME, ENABLE_PROGRESSBAR.
    """
    global REFERER
    REFERER = REFERER.format(ISRABLOG_HOSTNAME, blog_id)
    logging.info("WORKING_DIR is %s", WORKING_DIR)
    print("Starting download of blog {} to destination {} (fast={}).".format(blog_id, dl_path, fast))
    if not os.path.exists(dl_path):
        os.makedirs(dl_path)
    # Seed the destination with the static support files (JS/CSS injectors).
    logging.info("Copying %s to %s", INJECT_DIR, dl_path)
    copy_tree(INJECT_DIR, dl_path)
    post_ids = []
    # Main Page
    print("Downloading main page...")
    raw = dl_file(get_url(blog_id=blog_id), get_local_path(dl_path=dl_path))
    main_soup = BeautifulSoup(raw, 'html.parser')
    main_soup = dl_and_replace_external_resources(main_soup, dl_path, fast=fast)
    # Sidebar: each readlist iframe is downloaded separately, keyed by the
    # (columns, group) captured from its src URL.
    print("Downloading sidebar page(s)...")
    for tag in main_soup.find_all('iframe', src=blog_readlist_regex):
        m = re.match(blog_readlist_regex, tag['src'])
        raw = dl_file(get_url(blog_id=blog_id, intent='sidebar', ListColumns=m.group(1), SideGroup=m.group(2)),
                      get_local_path(intent='sidebar', dl_path=dl_path, sidebar_cols=m.group(1), sidebar_group=m.group(2)))
        raw = raw.replace('</body>', '<script type="text/javascript" src="iframeResizer.contentWindow.min.js"></script></body>')  # adding iframeResizer
        soup = BeautifulSoup(raw, 'html.parser')
        dl_and_replace_external_resources(soup, dl_path, fast=fast)
        replace_internal_resources(soup, saveTo=get_local_path(intent='sidebar', dl_path=dl_path, sidebar_cols=m.group(1), sidebar_group=m.group(2)))
    # Board List: walk pages until no "next page" link is found, then fetch
    # one final page before stopping.
    print("Downloading board list...")
    pagenum = 1
    while True:
        raw = dl_file(get_url(blog_id=blog_id, intent='board_list', page=pagenum),
                      get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
        soup = BeautifulSoup(raw, 'html.parser')
        # NOTE(review): the format string has one placeholder but two
        # arguments, and always probes for "page=2" regardless of pagenum —
        # probably meant '?blog={}&page={}'.format(blog_id, pagenum + 1).
        # Confirm before changing: pagination detection depends on it.
        if soup.find('a', href='?blog={}&page=2'.format(blog_id, pagenum + 1)):
            replace_internal_resources(soup, saveTo=get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
            pagenum += 1
        else:
            replace_internal_resources(soup, saveTo=get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
            pagenum += 1
            raw = dl_file(get_url(blog_id=blog_id, intent='board_list', page=pagenum),
                          get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
            soup = BeautifulSoup(raw, 'html.parser')
            replace_internal_resources(soup, saveTo=get_local_path(intent='board_list', pagenum=pagenum, dl_path=dl_path))
            break
    # Archive Dates: options are "month/year" strings; sort newest first by
    # building a sortable "yyyymm" key.
    archive_dates = [x.get('value') for x in main_soup.find('select', id="PeriodsForUser").find_all('option')]
    archive_dates.sort(key=lambda x: x.split('/')[1] + "{:02d}".format(int(x.split('/')[0])), reverse=True)
    print("Downloading archive pages...")
    if ENABLE_PROGRESSBAR:
        bar = progressbar.ProgressBar(max_value=len(archive_dates)).start()
    for i, date in enumerate(archive_dates):
        pagenum = 1
        pages_count = 1
        # Neighbouring months for prev/next navigation links (list is sorted
        # newest-first, so index-1 is the *next* month chronologically).
        next_month = archive_dates[i - 1] if i > 0 else None
        previous_month = archive_dates[i + 1] if i < len(archive_dates) - 1 else None
        while pagenum <= pages_count:
            month, year = date.split('/')
            raw = dl_file(get_url(blog_id=blog_id, month=month, year=year, pagenum=pagenum),
                          get_local_path(year=year, month=month, pagenum=pagenum, dl_path=dl_path))
            soup = BeautifulSoup(raw, 'html.parser')
            # Collect post ids from the showCommentsHere(...) javascript links.
            for tag in soup.find_all('a', href=re.compile('javascript:showCommentsHere')):
                post_ids.append(tag['href'].split('(')[1].split(',')[0])
            # Page count is embedded in a navigateCount script variable.
            t = soup.find('script', text=re.compile('navigateCount'))
            pages_count = int(t.text.strip().split('=')[1].strip(';')) if t else 1
            logging.info("Pages count for {}/{}: {}".format(year, month, pages_count))
            dl_and_replace_external_resources(soup, dl_path, fast=fast)
            replace_internal_resources(soup, next_month=next_month, previous_month=previous_month,
                                       saveTo=get_local_path(year=year, month=month, pagenum=pagenum, dl_path=dl_path))
            pagenum += 1
        if ENABLE_PROGRESSBAR:
            bar.update(i)
    if ENABLE_PROGRESSBAR:
        bar.finish()
    # Save Main Page (internal links rewritten now that archives are known).
    replace_internal_resources(main_soup,
                               previous_month=archive_dates[1] if len(archive_dates) > 1 else None,
                               saveTo=get_local_path(dl_path=dl_path))
    # Posts
    print("Downloading posts...")
    if ENABLE_PROGRESSBAR:
        bar = progressbar.ProgressBar(max_value=len(post_ids)).start()
    for i, postid in enumerate(post_ids):
        raw = dl_file(get_url(blog_id=blog_id, intent='posts', postid=postid),
                      get_local_path(intent='posts', postid=postid, dl_path=dl_path))
        soup = BeautifulSoup(raw, 'html.parser')
        dl_and_replace_external_resources(soup, dl_path, fast=fast)
        replace_internal_resources(soup, saveTo=get_local_path(intent='posts', postid=postid, dl_path=dl_path))
        if ENABLE_PROGRESSBAR:
            bar.update(i)
    if ENABLE_PROGRESSBAR:
        bar.finish()
    # Comments: first page gives the page count (from table#Table3), then the
    # remaining pages are fetched with explicit page numbers.
    print("Downloading comment pages...")
    if ENABLE_PROGRESSBAR:
        bar = progressbar.ProgressBar(max_value=len(post_ids)).start()
    for i, postid in enumerate(post_ids):
        pagenum = 1
        raw = dl_file(get_url(blog_id=blog_id, intent='comments', postid=postid),
                      get_local_path(intent='comments', postid=postid, dl_path=dl_path))
        soup = BeautifulSoup(raw, 'html.parser')
        t = soup.find('table', id="Table3")
        pages_count = int(t.td.text.strip().split(' ')[-2]) if t else 1
        logging.info("Comment Pages count for {}: {}".format(postid, pages_count))
        dl_and_replace_external_resources(soup, dl_path, fast=fast)
        replace_internal_resources(soup, saveTo=get_local_path(intent='comments', postid=postid, dl_path=dl_path))
        pagenum += 1
        while pagenum <= pages_count:
            raw = dl_file(get_url(blog_id=blog_id, intent='comments', postid=postid, posnew=pagenum),
                          get_local_path(intent='comments', postid=postid, pagenum=pagenum, dl_path=dl_path))
            soup = BeautifulSoup(raw, 'html.parser')
            dl_and_replace_external_resources(soup, dl_path, fast=fast)
            replace_internal_resources(soup, saveTo=get_local_path(intent='comments', postid=postid, pagenum=pagenum, dl_path=dl_path))
            pagenum += 1
        if ENABLE_PROGRESSBAR:
            bar.update(i)
    if ENABLE_PROGRESSBAR:
        bar.finish()
    print(colorama.Fore.GREEN + "Done!" + colorama.Style.RESET_ALL)
# # urllib2.urlopen(url, "", 100000) # # re.findall("") html_doc = """ <div class="J-next-auto hide next-auto"><em>3</em> 秒后播放下一节</div> <div class="J-next-btn hide next-auto btn btn-green">下一节</div> <a href="/video/10687/0" class="review-course">重新观看</a> <div id="js-ques-box"></div> </div> </div> """ soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8') print '获取链接' links = soup.find_all('a') for link in links: print link.name, link['href'], link.get_text() print '获取指定url' link_node = soup.find('a', href='/video/10687/0') print link_node.name, link_node['href'], link_node.get_text() print '正则匹配' link_node = soup.find('a', href=re.compile(r'video')) print link_node.name, link_node['href'], link_node.get_text() print '获取div' link_node = soup.find('div', class_='J-next-auto hide next-auto') print link_node.name, link_node.get_text()
def download(id_all):
    """Download one .xls export per match id in *id_all* into ./aicai/.

    Files are named 1.xls, 2.xls, ... in the order the ids are given.
    Uses the module-level `url_download` base URL and `header` dict
    (defined elsewhere in this file).
    """
    if not os.path.exists('aicai'):
        os.mkdir('aicai')
    # FIX: the original numbered files via next(iter(range(1, 15))), which
    # raised StopIteration after 14 downloads; enumerate handles any count.
    for seq, match_id in enumerate(id_all, 1):
        result = requests.get(url_download + match_id, headers=header, stream=True)
        #print(result.headers)
        filename = str(seq) + '.xls'
        # FIX: 'wb' instead of 'wb+' (no read-back needed) and os.path.join
        # instead of string concatenation for the output path.
        with open(os.path.join('aicai', filename), 'wb') as f:
            f.write(result.content)


url_collect = 'https://live.aicai.com/pages/bf/sfc.shtml'
url_download = 'https://live.aicai.com/bf/bfindex!export.htm?matchBFId='

# Collect match ids from the live-score page: each id is embedded in the
# `value` attribute of the first <a> under every div.bf_ta_tit block.
result = requests.get(url_collect, headers=header)
soup = BeautifulSoup(result.text, 'html.parser')
id_all = []
for block in soup.find_all('div', {'class': 'bf_ta_tit'}):
    value = str(block.find('a')['value'])
    # NOTE(review): the [39:47] slice assumes the 8-character match id always
    # sits at a fixed offset in the value string — confirm against live markup.
    id_all.append(value[39:47])
download(id_all)
print('ok!')