def scrapeURLs():
    # convert webpage to soup object
    r = Render('http://stats.nba.com/teams/')
    result = str(r.frame.toHtml().toAscii())
    del r
    soup = BeautifulSoup(result, 'lxml')
    del result
    # identify links to each team's stats and game logs
    team_index = soup.find_all('div', {'class': 'team-block__links'})
    game_log_URLs = []
    stats_URLs = []
    # record urls for each team and each game log
    for team_num in range(0, len(team_index)):
        stats_URLs.append('http://stats.nba.com' + team_index[team_num].contents[3]['href'])
        game_log_URLs.append('http://stats.nba.com' + team_index[team_num].contents[5]['href'])
    # decompose only after the extracted tags have been read
    soup.decompose()
    # pickle urls
    pickle.dump(game_log_URLs, open('../../Data/gamelogURLs.pickle', 'wb'))
    pickle.dump(stats_URLs, open('../../Data/statsURLs.pickle', 'wb'))
def get_url_by_sid(self, sids):
    """
    Accepts a string of song ids.
    :param sids: string of song ids
    :return: the real download urls of the songs
    """
    search_url = 'http://play.baidu.com/data/music/songlink?songIds=%s' % sids
    buffers = StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, search_url)
    curl.setopt(pycurl.USERAGENT, user_agent)
    curl.setopt(pycurl.WRITEDATA, buffers)
    curl.perform()
    curl.close()
    body = buffers.getvalue()
    soup = BeautifulSoup(body)
    song_lists = self.serialization.json_to_data(soup.text)['data']['songList']
    soup.decompose()
    urls = []
    for l in song_lists:
        link = l['songLink']
        if not link:
            url = 'zzz'
        else:
            url = pattern.sub('', link)
        urls.append(url)
    return urls
def get_dlinks(source_url):
    """
    Scrape the video download links from the given page url.
    :param source_url: source page url
    :return: the real download links of the videos
    """
    buffers = StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, source_url)
    curl.setopt(pycurl.USERAGENT, user_agent)
    curl.setopt(pycurl.REFERER, refer_path)
    curl.setopt(pycurl.WRITEDATA, buffers)
    curl.perform()
    curl.close()
    # get the response body as a str
    body = buffers.getvalue()
    soup = BeautifulSoup(body)
    # grab the target div
    content = soup.findAll('div', {'id': 'hi_addtab_1'})[1]
    # collect the download links
    result = []
    tables = content.findAll('td', {'class': 'td_thunder'})
    for td in tables:
        d_link = td.find('a')
        href = d_link['href']
        target = d_link.text
        result.append((target, href))
    # decompose only after the extracted div has been read
    soup.decompose()
    return result
async def getWorkInfo(session: dict):
    allWorkInfo = []
    tasks = []
    s = requests.session()
    s.cookies.update(session)
    s.headers.update(ua)
    lock = asyncio.Lock()
    courseListURL = 'https://mooc2-ans.chaoxing.com/visit/courses/list?rss=1&start=0&size=500&catalogId=0&searchname='
    html = s.get(url=courseListURL).content.decode()
    htmlBS = BeautifulSoup(html, 'lxml')
    for singleCourse in htmlBS.find_all(class_='course-info'):
        # the link points at the old interface, but it is easier to parse
        # the course name and teacher name from it
        tasks.append(parseOneCourse(singleCourse, s, allWorkInfo, lock))
    print(tasks)
    await asyncio.wait(tasks)
    # decompose only after the parsing coroutines have finished with the tags
    htmlBS.decompose()
    print("After every Task")
    print(allWorkInfo)
    return allWorkInfo
def get_total_pages(url):
    try:
        print 'Getting the total number of pages for link: ' + url
        try:
            req = get(url)
        except Exception as er:
            print 'error: ' + str(er)
            time.sleep(5)
            return get_total_pages(url)
        soup = BeautifulSoup(req.text, 'html.parser')
        # get the last pagination element
        ultimo = soup.select('#pagination-flickr > li')[-1].getText()
        req.close()
        if ultimo == "Next":
            # if the last element is "Next", take the one before it and recurse
            possivel_ultimo = soup.select('#pagination-flickr > li')[-2].getText()
            soup.decompose()
            if 'page' in url:
                return int(get_total_pages(mother_of_urls.format(penultimo='page', ultimo=possivel_ultimo)))
            else:
                return int(get_total_pages(adjust_url(url, possivel_ultimo)))
        else:
            soup.decompose()
            return int(ultimo)
    except Exception as e:
        print 'Exception: ' + str(e)
        time.sleep(5)
        return get_total_pages(url)
def getPageTitle(node_url, s):
    """
    Retrieve the title (from html) of an IFB node page.
    @param node_url String The url of the node
    @param s requests.Session Object corresponding to the connection session
    @return String The title of the web page
    """
    while True:
        try:
            # An authorised request.
            r = s.get(node_url, timeout=TIMEOUT)
            # print(s.get(url).status_code)
            # etc...
            break
        except requests.exceptions.ConnectionError as e:
            # print(str(e) + "\nRetrying...")
            time.sleep(10)
        except requests.exceptions.ReadTimeout as e:
            # print(str(e) + "\nRetrying...")
            time.sleep(10)
    soup = BeautifulSoup(r.text, 'html.parser')
    title = str(soup.title.string)
    soup.decompose()
    return title
def download_imgs():
    page_urls = get_page_urls()
    download_count = 0
    for page_url in page_urls:
        source = requests.get(page_url).text
        soup = BeautifulSoup(source, 'lxml')
        flower_name = soup.find('h1').contents[0]
        if flower_name.endswith(' '):
            flower_name = flower_name[:-1]
        aTags = soup.find_all('a', {'data-fancybox': 'gallery'})
        for i, a in enumerate(aTags):
            file_name = f"{flower_name}-{i+1}.jpg"
            img_url = a['href']
            img_request = requests.get(img_url)
            if not os.path.exists('./img'):
                os.mkdir('./img')
            with open(f'./img/{file_name}', 'wb') as file:
                file.write(img_request.content)
            download_count += 1
            print(f"#{download_count} - Downloading {img_url}")
        soup.decompose()
    print(f"{download_count} images downloaded")
def exciteTranlate01(string00, loop_time=0):
    dataBefore = {'before': string00}
    tranlationUrl = 'http://www.excite.co.jp/world/chinese/'
    sec01 = random.randrange(5, 15)
    print('Waiting ' + str(sec01) + ' seconds')
    for time_sec in range(sec01, 0, -1):
        time.sleep(1)
        print('Countdown: ' + str(time_sec) + ' seconds')
    print(str(time_sec) + ' seconds done')
    print('Sending text for translation')
    tranlationHtml = None
    result00 = None
    try:
        tranlationResponse = requests.post(tranlationUrl, data=dataBefore).content
        print('Received translated data')
        tranlationHtml = BeautifulSoup(tranlationResponse, 'html.parser')
        result00 = tranlationHtml.find(id="after").string
    except:
        if loop_time == 10:
            print('Translation failed')
        else:
            print('Retrying after exception, attempt:', loop_time)
            result00 = exciteTranlate01(string00, loop_time + 1)
    if result00 is None and loop_time < 10:
        print('Retrying because result was None, attempt:', loop_time)
        result00 = exciteTranlate01(string00, loop_time + 1)
    elif result00 is None and loop_time == 10:
        print('Translation failed')
    if tranlationHtml is not None:
        tranlationHtml.decompose()
    print('Partial translation result:')
    print(result00)
    return result00
def scrape_google_docs_html(self, text: str):
    strainer = SoupStrainer(property=["og:title", "og:description"])
    soup = BeautifulSoup(text, "lxml", parse_only=strainer)  # Use lxml parser to speed things up
    if soup is None:
        return None
    meta_tags = soup.find_all("meta")
    if not meta_tags:
        return None
    try:
        title = meta_tags[0]['content']
        description = meta_tags[1]['content']
    except (IndexError, KeyError):
        return None
    if title.endswith(' - Google Docs'):
        title = title[:-14]
    if description.endswith('...'):
        description = description[:-3]
    soup.decompose()  # Garbage collection
    return {'title': title, 'description': description}
def get_link_from_iframe(self, html_parser, default_value, iframe_id, class_name):
    if html_parser.select(iframe_id):
        iframe_response = requests.get(html_parser.select(iframe_id)[0].attrs['src'])
        iframe_parser = BeautifulSoup(iframe_response.content, "html.parser")
        link = iframe_parser.select(class_name)[0].get('href')
        print(link)
        iframe_parser.decompose()
        return link
    # fall back to the supplied default when the iframe is not present
    return default_value
def worker():
    global linkPool
    while terminator.is_alive():
        HTML = None
        page = None
        newTask = PagesToCrawl.get()          # Get next work task.
        node = createNodeFromTuple(newTask)   # Create new node.
        HTML = openURLAsHTML(node)            # Open URL.
        if node['dead'] == 0:                 # If URL is live...
            page = BeautifulSoup(HTML.read().decode('utf-8', 'ignore'), "lxml")  # ...scrape data.
            scrapeNodeData(node, page)
        # If the page is live and gave a valid text/html response, it should be an option for the next parent.
        if node['dead'] == 0 and HTML.info().get_content_type() == "text/html":
            with pool_lock:
                linkPool.append(copy.deepcopy(newTask))
        # Garbage Collection
        if HTML is not None:
            HTML.close()
        if page is not None:
            page.decompose()
        # With lock in place, append result to temporary data set.
        with tier_lock:
            tierResults.append(copy.deepcopy(node))
        PagesToCrawl.task_done()  # Tell manager that task is complete.
    sys.exit()
def get_roster_advanced_stats(team, season_end_year):
    r = get(
        f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fteams%2F{team}%2F{season_end_year}.html&div=div_advanced'
    )
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, 'html.parser')
        table = soup.find('table')
        df = pd.read_html(str(table))[0]
        soup.decompose()
        columns = [
            'Rk', 'Unnamed: 1', 'Age', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr',
            'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
            'Unnamed: 17', 'OWS', 'DWS', 'WS', 'WS/48', 'Unnamed: 22', 'OBPM',
            'DBPM', 'BPM', 'VORP'
        ]
        empty = ['' for _ in range(len(df))]
        new_df = pd.DataFrame(columns=columns)
        for i in range(len(columns)):
            if columns[i] in df.columns:
                new_df[columns[i]] = df[columns[i]]
            else:
                new_df[columns[i]] = empty
        new_df['YEAR'] = [
            str(season_end_year - 1) + "-" + str(season_end_year)[-2:]
            for _ in range(len(df))
        ]
        return new_df
def get_roster(team, season_end_year):
    r = get(
        f'https://widgets.sports-reference.com/wg.fcgi?css=1&site=bbr&url=%2Fteams%2F{team}%2F{season_end_year}.html&div=div_roster'
    )
    df = None
    if r.status_code == 200:
        encoding = r.encoding if 'charset' in r.headers.get('content-type', '').lower() else None
        soup = BeautifulSoup(r.content, 'html.parser', from_encoding=encoding)
        print("Finding table for: " + str(season_end_year) + " " + str(team))
        table = soup.find('table')
        print("Reading html")
        df = pd.read_html(str(table))[0]
        print("Done")
        soup.decompose()
        print("Transforming df")
        df.columns = [
            'NUMBER', 'PLAYER', 'POS', 'HEIGHT', 'WEIGHT', 'BIRTH_DATE',
            'NATIONALITY', 'EXPERIENCE', 'COLLEGE'
        ]
        df['BIRTH_DATE'] = df['BIRTH_DATE'].apply(lambda x: pd.to_datetime(x))
        df['NATIONALITY'] = df['NATIONALITY'].apply(
            lambda x: x.upper() if type(x) == str else "N/A")
        df['YEAR'] = [
            str(season_end_year - 1) + "-" + str(season_end_year)[-2:]
            for _ in range(len(df))
        ]
        print("Done")
    return df
def scrape_data_yahoo(url):
    ''' scrape data from Yahoo finance '''
    data = []
    try:
        conn = urlopen(url)
        soup = BeautifulSoup(conn, "html.parser")
        locate_tag = soup.find("div", {"id": "Main"})
        table_tag = locate_tag.find_next("table", {"data-test": "historical-prices"})
        tbody_tag = table_tag.find_next("tbody")
        # scrape the data
        for row in tbody_tag.find_all('tr'):
            col = row.find_all('td')
            # the first is the date, the others are numbers
            data.append([col[0].find_next('span').text] +
                        [float(td.find_next('span').text.replace(',', '')) for td in col[1:]])
        return to_pandas_data_frame(np.array(data))
    finally:
        # properly release the resources
        try:
            if soup:
                soup.decompose()
            if conn:
                conn.close()
        except NameError:
            pass
def main():
    global args, celsius
    # Default values for the parameters
    args = {'dia': 0, 'min': False, 'max': False, 'now': False, 'temp': True, 'como': True}
    # Validate the received arguments and configure variables
    parseArgs()
    ## HTML parsing variables
    # File where the accuweather page will be stored temporarily
    html_doc = "/tmp/clima.html"
    # IDs of the relevant divs
    tiempodiv_id = "feed-tabs"
    gradosdiv_id = "bt-menu-settings"
    div_hoy_id = "detail-now"
    div_hoy_mM_id = "feature-history"
    ## Main logic
    # Fetch the accuweather web page for Santiago
    if DEBUG:
        print "Fetching data from Accuweather..."
    with open(os.devnull, "wb") as devnull:
        subprocess.check_call(["wget", "http://www.accuweather.com/<AJUSTAR PARA ZONA DESEADA>",
                               "-O", html_doc], stdout=devnull, stderr=subprocess.STDOUT)
    # Extract the section of interest
    soup = BS(open(html_doc))
    # Check the measurement unit
    celsius = inCelsius(soup, gradosdiv_id)
    # Save temperatures and conditions for the 5 days
    forecast = parseForecast(soup, tiempodiv_id, div_hoy_id, div_hoy_mM_id)
    soup.decompose()
    # Print the forecast according to the supplied arguments
    print getForecast(forecast)
    with open(os.devnull, "wb") as devnull:
        subprocess.check_call(["rm", html_doc], stdout=devnull, stderr=subprocess.STDOUT)
def makeappdx(page):
    srd = SoupStrainer('div', id='container')
    div = BeautifulSoup(page, parse_only=srd).div
    nav = div.find('div', id='navigation')
    nav.decompose()
    title = div.center.get_text(strip=True)
    div.center.decompose()
    font = div.find_all('font', size='2', color=None)
    for f in font:
        f.unwrap()
    for p in div.find_all('p'):
        p['class'] = 'ZFY'
        p.name = 'div'
    blank = div.find('div', class_='blank')
    if blank:
        blank.decompose()
    ft = div.find('div', id='footer')
    if ft:
        ft.decompose()
    div.attrs.clear()
    div['class'] = 'oH1'
    formatcontent(div)
    text = cleansp(div.encode('iso-8859-1'))
    div.decompose()
    return ''.join(['<div class="xsv">', title, '</div>', text])
def formatabbr(page):
    srd = SoupStrainer('div', id='container')
    div = BeautifulSoup(page, parse_only=srd).div
    nav = div.find('div', id='navigation')
    nav.decompose()
    tbl = div.find('table')
    tbl.name = 'div'
    tbl.attrs.clear()
    tbl['class'] = 'oH1'
    tdr = div.find_all(name=re.compile(r't[dr]', re.I))
    for t in tdr:
        t.unwrap()
    for p in div.find_all('p'):
        p['class'] = 'ZFY'
        p.name = 'div'
    blank = div.find('div', class_='blank')
    if blank:
        blank.decompose()
    ft = div.find('div', id='footer')
    if ft:
        ft.decompose()
    formatcontent(div)
    div.attrs.clear()
    div['class'] = 'RmY'
    text = cleansp(div.encode('iso-8859-1'))
    div.decompose()
    return ''.join(['<link rel="stylesheet"href="ety.css"type="text/css">', text])
def load_gamelog_stats(self, year):
    # Check if it's already cached on disk.
    if not os.path.isfile(self.get_filename_path(year)):
        content = urllib2.urlopen(self.get_game_log_url(year)).read()
        with open(self.get_filename_path(year), 'w') as f:
            f.write(content)
    # Drink the soup.
    soup = BeautifulSoup(open(self.get_filename_path(year)), 'html.parser')
    game_log = []
    for table in soup.find_all('table'):
        # Only load regular season stats.
        if table.get('id', '') == 'pgl_basic':
            for game in table.find_all('tr'):
                game_data = {}
                for stat in game.find_all('td'):
                    data_stat = stat.get('data-stat', None)
                    if data_stat == self.DATE_STAT:
                        game_data[data_stat] = datetime.strptime(
                            stat.get_text(), self.DATE_FORMAT)
                    elif data_stat and data_stat in self.STATS_TO_COLLECT:
                        game_data[data_stat] = int(stat.get_text())
                if game_data:
                    game_log.append(game_data)
            # Break out once we've found and parsed the regular season stats.
            break
    soup.decompose()
    self.stats[year] = pd.DataFrame(game_log).set_index(self.DATE_STAT)
def parse_product(self, response):
    soup = BeautifulSoup(response.body, 'lxml')
    p = Product()
    for element, path in self.selectors.viewitems():
        node = soup.select_one(path)
        if not node:
            continue
        if element == 'image':
            p[element] = url_fix(urljoin(response.url, node['src']))
        else:
            p[element] = text(node)
    if 'name' in p and 'number' in p:
        p['url'] = response.url
        p['pricing'], p['discountcode'] = get_prices(soup)
        soup.decompose()
        yield p
    else:
        # Only follow links on non-product pages
        soup.decompose()
        for link in self.link_extractor.extract_links(response):
            yield Request(url=link.url)
def get_all_links(self, link):
    global SCANNED_LINKS
    global n_requests
    global n_requests_lock
    try:
        req = self.req_ses.get(link)
        with n_requests_lock:
            n_requests += 1
        if not self.silent:
            print("[%s] T-ID: %s scanning -> %s" % (str(len(SCANNED_LINKS)), str(self.threadID), link))
    except requests.exceptions.MissingSchema:
        # print('invalid url %s' % link)
        return None
    html_soup = BeautifulSoup(req.text, 'html.parser')
    links = [i.get("href") for i in html_soup.find_all('a')]
    links = [
        e for e in links
        if e not in SCANNED_LINKS and e is not None and len(e) > 5
    ]
    # file in list? search it and remove from crawling list (if present)
    html_soup.decompose()  # THIS MADE THE TRICK, NO MORE RAM WASTED!
    # add the schema and base URL to links like "/something"
    for i in range(len(links)):
        if links[i][0] == "/":
            links[i] = self.URL + links[i]
    return links
def run(dates):
    logging.debug(f'Got {len(dates)} dates')
    for date in dates:
        logging.info(f"Starting to scrape matches {url}/en/results/{date}")
        page = requests.get(url + "/en/results/" + date, timeout=TIMEOUT)
        bs = BeautifulSoup(page.text, 'html.parser')
        match_links = add_lineup_to_match_link(
            get_match_links_for_league(bs, league_id))
        bs.decompose()
        valid_links = []
        for match_url, match_id in match_links:
            if match_id not in match_ids:
                valid_links.append([match_url, match_id])
        results = pool.map(get_lineup_for_match, valid_links)
        for data_obj in results:
            data_obj["date"] = date
        with open(f'lineups/data_{date}.json', 'w') as outfile:
            json.dump(results, outfile)
        for _, match_id in valid_links:
            match_ids.append(match_id)
    pool.close()
def findcompanyregno(self):
    soup = None
    try:
        for l in self.sitelinks:
            r = requests.get(l, {"User-Agent": ua.random})
            soup = BeautifulSoup(r.text, 'html.parser')
            pagetext = soup.findAll(text=True)
            output = ''
            for t in pagetext:
                if t.parent.name not in blacklist:
                    output += '{} '.format(t)
            crns = re.findall(crn, output)
            crn_list = []
            if crns:
                crn_list.extend(crns)
            for i in crn_list:
                if i not in self.crns:
                    self.crns.append(i)
    except Exception as e:
        print("siteclassifier.findcompanyregno : " + str(e))
    finally:
        if soup:
            soup.decompose()
        return
def run(self) -> Counter:
    if self.use_lxml:
        elem_iter = lxml.etree.iterparse(self.xml_file, ["start", "end"], load_dtd=True)
    else:
        elem_iter = ET.iterparse(self.xml_file, ["start", "end"])
    root = None
    for (event, element) in elem_iter:
        # remember the document root from the first "start" event so it can be cleared later
        if (root is None) and event == "start":
            root = element
            continue
        if not (element.tag in self.record_tags and event == "end"):
            continue
        if self.use_lxml:
            soup = BeautifulSoup(lxml.etree.tostring(element), "xml")
        else:
            soup = BeautifulSoup(ET.tostring(element), "xml")
        for record in soup.find_all():
            if record.name not in self.record_tags:
                continue
            self.importer.push_record(record)
            record.decompose()
        soup.decompose()
        element.clear()
        if root is not None:
            root.clear()
    counts = self.importer.finish()
    print(counts, file=sys.stderr)
    return counts
def generate_rss_item(self, url):
    """
    :param url:
    :return:
    """
    print('fetch: %s' % url)
    html = self.request_html(url)
    if not html:
        self.url = None
        return None
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('title').text
    div = soup.find('div', {'class': 'entry-location'})
    if not div:
        soup.decompose()
        self.url = None
        return None
    self.url = div.find('a')['href']
    soup.decompose()
    rss_item = PyRSS2Gen.RSSItem(
        title=title,
        link=url,
        description=title,
        pubDate=datetime.datetime.now()
    )
    return rss_item
def getPret_BD(url):
    content = urlopen(url).read()
    # fisier = open(url, "r", encoding="utf8")
    # content = fisier.read()
    soup = BeautifulSoup(content, "html.parser")
    # soup = BeautifulSoup(url, "html.parser")
    pret_oferta = soup.find('span', {'class': 'price'})
    pret_vechi = soup.find('span', {'class': 'retail-value'})
    titlu = soup.find('div', {'class': 'title'})
    # fisier.close()
    print("Titlu: " + titlu.text)
    print("Pret Oferta: " + pret_oferta.text)
    print("Pret Vechi: " + pret_vechi.text)
    titlu_text = titlu.text
    pret_ofer_filt = re.match('[0-9]+', pret_oferta.text)
    pret_vech_filt = re.match('[0-9]+', pret_vechi.text)
    # decompose only after the extracted text has been read
    soup.decompose()
    return titlu_text, str(pret_ofer_filt.group(0)), str(pret_vech_filt.group(0))
def addAllFollowers(start_username: str, depth: int):
    if depth > MAX_DEPTH:
        return
    link = getFollowersLink(start_username)
    hasNext = True
    page = 1
    while hasNext:
        r = urllib.request.urlopen(link + str(page) + '/').read()
        soup = BeautifulSoup(r, "html.parser")
        person_summary = soup.findAll('div', class_='person-summary')
        page += 1
        for person in person_summary:
            username_html = str(person.find('a', class_='name'))
            # use a regex to find a string that starts with href="/ (dont capture it)
            # and ends with /"> (also dont capture)
            # group(0) returns the first matched object
            username = re.search('(?<=href="/).*(?=/">)', username_html).group(0)
            username_html = None
            gc.collect()
            users.add(username)
            if depth < MAX_DEPTH:
                addAllFollowers(username, depth + 1)
        if soup.find('a', class_='next') is None:
            hasNext = False
        for a in person_summary:
            a.decompose()
        soup.decompose()
        gc.collect()
    print("done gathering followers: ", start_username)
def write(input, output, wordlist):
    fp = open(input, 'r', encoding="utf-8")
    soup = BeautifulSoup(fp, "html.parser")
    for div in soup.body.find_all('div', recursive=False):
        # print(div.a.text)
        hasword = False
        for word in wordlist:
            if word in str(div.a.text):
                hasword = True
                break
        if not hasword:
            div.decompose()
        # except Exception as e:
        #     print(e)
        #     print(div)
    fw1 = open(output, 'w', encoding="utf-8")
    fw1.write(soup.prettify())
    fw1.close()
    fp.close()
    soup.decompose()
    gc.collect()
def SpiderPageWeb(self, start):
    pageURL = self.GetPageURL(start)
    headers = Util.GetHeaders(Host=self.HostURL, Referer=self.GetRefererURL(start))
    req = Util.RequestURLByGet(pageURL, headers, {})
    if req is not None:
        soup = BeautifulSoup(req.text, 'lxml')
        elements = soup.findAll(href=self.re_pattern_bloglink)
        coroutines = []
        for index, ele in enumerate(elements):
            blogLink = ele.get('href')
            blogTitle = ele.get('title')
            msg = '---------- SpiderBlogWeb:' + ('[%d/%d]' % (index + 1, len(elements))) + '[' + blogTitle + ']'
            coroutines.append(gevent.spawn(self.SpiderBlogWeb, blogLink, pageURL, msg))
            # self.SpiderBlogWeb(blogLink, pageURL)
        gevent.joinall(coroutines)
        soup.decompose()
        if len(elements) <= 0:
            print '!!!!!!!!!! No blog link found !!!!!!!!!!'
            print req.text
        # self.filePage.write(pageURL + '\n')
        # self.filePage.flush()
    else:
        print '!!!!!!!!!! SpiderPageWeb Request is None !!!!!!!!!!'
def extract(self):
    self.assets = None
    try:
        with self.fsal.open(self.path, 'r') as html_file:
            dom = BeautifulSoup(html_file, self.PARSER)
    except Exception:
        msg = (u"Metadata extraction failed, error opening: "
               u"{}".format(self.path))
        logging.exception(msg)
        raise self.MetadataError(msg)
    else:
        data = {}
        for meta in dom.find_all('meta'):
            if all(key in meta.attrs for key in ('name', 'content')):
                key = meta.attrs['name']
                value = meta.attrs['content']
                data[key] = value
            # Old style html files may have the language set via
            # <meta http-equiv="content-language"> pragma
            pragma = meta.get('http-equiv', '').lower()
            if pragma == 'content-language':
                data['language'] = meta.get('content')
        if dom.html:
            lang = dom.html.get('lang') or data.get('language', '')
            data['language'] = lang
        if dom.title:
            data['title'] = dom.title.string
        # assets are not directly part of the metadata, but are needed
        # to be accessed from within the processor, so it's kept as an
        # instance attribute only
        self.assets = self.extract_asset_paths(dom)
        dom.decompose()
        return data
def tokener(xmldata):
    import corpkit
    """print word, using good lemmatisation"""
    import re
    from bs4 import BeautifulSoup, SoupStrainer
    import gc
    open_classes = ['N', 'V', 'R', 'J']
    result = []
    just_good_deps = SoupStrainer('tokens')
    soup = BeautifulSoup(xmldata, parse_only=just_good_deps)
    for token in soup.find_all('token'):
        word = token.word.text
        query = re.compile(r'.*')
        if re.search(query, word):
            if lemmatise:
                word = token.lemma.text
            if just_content_words:
                if not token.pos.text[0] in open_classes:
                    continue
            result.append(word)
    # attempt to stop memory problems.
    # not sure if this helps, though:
    soup.decompose()
    soup = None
    data = None
    gc.collect()
    return result
def get_news_articles():
    url = "https://hk.news.appledaily.com/realtime/realtimelist/all?page=local"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    r.encoding = "utf-8"
    only_div = SoupStrainer("div", {"class": "text"})
    soup = BeautifulSoup(r.content, features="html.parser", parse_only=only_div)
    elements = list(soup.find_all("div", {"class": "text"}))
    links = []
    for element in elements:
        a = element.find('a')
        href = a['href']
        links.append(href)
    # decompose only after the extracted elements have been read
    soup.decompose()
    pool = multiprocessing.Pool(2, maxtasksperchild=1)
    result = pool.map_async(retrieve_url, links).get()
    pool.close()
    pool.join()
    result = [
        l for l in result
        if "/local/" in l or "/international/" in l or "/china/" in l or "/breaking/" in l
    ]
    return result
def parse_repository(username, url, filename):
    user_cache = CACHE.get(username)
    if not user_cache:
        user_cache = Set()
        CACHE[username] = user_cache
    if url in user_cache:
        return []
    user_cache.add(url)
    url = GITHUB_URL + url
    text = read_page(url)
    soup = BeautifulSoup(text)
    commits = soup.find_all(class_='gobutton')
    urls = []
    for a in commits:
        urls.append(a['href'])
    soup.decompose()
    results = []
    for url in urls:
        data = parse_commit(username, url)
        results.append(data)
    with contextlib.closing(open(filename, 'ab')) as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"',
                            quoting=csv.QUOTE_ALL)
        for commit in results:
            writer.writerow(commit)
def main(max_page):
    for i in range(1, max_page):
        conn = sqlite3.connect("../dogDrip.db")
        cur = conn.cursor()
        address = cur.execute(
            "select address from dogDrip where id={ID}".format(ID=i)).fetchall()[0][0]
        conn.close()
        print(i)
        url = address
        request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            global bs_obj
            global html
            global title
            dog_drip_html = urlopen(request)
            bs_obj = BeautifulSoup(dog_drip_html.read(), "html.parser")
            html = str(bs_obj.find_all('div', class_="ed clearfix margin-vertical-large")[0])\
                .replace('href="', 'href="https://www.dogdrip.net')\
                .replace('src="', 'src="https://www.dogdrip.net')\
                .replace("'", '"')
            title = str(bs_obj.find_all('h4')[0])\
                .replace("'", '"')
            conn = sqlite3.connect("../dogDrip.db")
            cur = conn.cursor()
            cur.execute(
                """update dogDrip set HTTP = '{HTTP}' where id={ID}""".format(
                    HTTP='%s %s' % (title, html), ID=i))
            bs_obj.decompose()
            dog_drip_html.close()
            conn.commit()
            conn.close()
        except urllib.error.HTTPError:
            print('Oops! The post is deleted!')
            conn.close()
def remove_invalid_xml_chars2(html_string):
    soup = BeautifulSoup(html_string, 'html5lib')
    text = soup.get_text()
    soup.decompose()
    # strip characters that are not allowed in XML 1.0
    # (supplementary-plane range needs \U00010000-\U0010FFFF, not \u10000-\u10FFFF)
    return re.sub(
        u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]',
        '', text)
def getPret_B24(url):
    content = urlopen(url).read()
    # fisier = open(url, "r", encoding="utf8")
    # content = fisier.read()
    soup = BeautifulSoup(content, "html.parser")
    pret_oferta = soup.find('span', {'class': 'text-value js-price-value'})
    pret_vechi = soup.find('span', {'class': 'text-rrp js-text-rrp'})
    titlu = soup.find('h1', {'class': 'col-md-14 col-lg-14'})
    # fisier.close()
    print("Titlu: " + titlu.text)
    print("Pret Oferta: " + pret_oferta.text)
    print("Pret Vechi: " + pret_vechi.text)
    titlu_text = titlu.text
    pret_ofer_filt = re.match('[0-9]{3}', pret_oferta.text)
    pret_vech_filt = re.match('.+([0-9]{3}).+', pret_vechi.text)
    # decompose only after the extracted text has been read
    soup.decompose()
    return titlu_text, str(pret_ofer_filt.group(0)), str(pret_vech_filt.group(1))
def remove_invalid_xml_chars(html_string):
    soup = BeautifulSoup(html_string, 'html5lib')
    text = soup.get_text()
    soup.decompose()
    # return re.sub(r'[\xE4C6\x00-\x1F\x7F-\x9F%&<>]+','', text)
    # https://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python
    return ''.join(c for c in text if _valid_xml_char_ordinal(c))
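# The helper _valid_xml_char_ordinal used above is not defined in this snippet.
# A minimal sketch of what it presumably does, based on the XML 1.0 valid
# character ranges (the same ranges used by remove_invalid_xml_chars2); treat
# this as an assumed implementation, not the original one:
def _valid_xml_char_ordinal(c):
    # True if the single character c is allowed in an XML 1.0 document
    codepoint = ord(c)
    return (
        0x20 <= codepoint <= 0xD7FF
        or codepoint in (0x9, 0xA, 0xD)
        or 0xE000 <= codepoint <= 0xFFFD
        or 0x10000 <= codepoint <= 0x10FFFF
    )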
def iterate_pages(url):
    try:
        total_pages = get_total_pages(url)
        url_splited_len = len(url.split('/'))
        print 'preparing to iterate over url ' + url
        for value in range(1, total_pages + 1):
            print 'page %i of %i' % (value, total_pages)
            # fetch all elements of page number 'value'
            if 'page' in url:
                req = get(mother_of_urls.format(penultimo='page', ultimo=str(value)))
            else:
                req = get(adjust_url(url, str(value)))
            soup = BeautifulSoup(req.text, 'html.parser')
            print 'getting the elements of url ' + req.url
            page_elements = soup.findAll('div', {'class': 'mainBox'})
            iterate_page_elements(page_elements, req.url)
            soup.decompose()
            req.close()
            anime_list.clear_list()
            anime_list.serialize_list()
    except Exception as e:
        print 'Exception: ' + str(e)
        time.sleep(5)
        iterate_pages(url)
def get_wikipedia_links(input_text):
    """Gets en.wikipedia.org link in input_text. If it can't be found, returns []"""
    soup = BeautifulSoup(input_text, "lxml")
    fixed_urls = []
    urls = re.findall(r'(https?://[^\s]+)', input_text)
    for url in soup.findAll('a'):
        try:
            fixed_urls.append(url['href'])
        except Exception:
            pass
    # Delete duplicates
    done_urls = []
    for i in fixed_urls:
        if i not in done_urls:
            done_urls.append(i)
    # Delete urls that contain a media file extension
    fixed_urls = []
    for url in done_urls:
        if not any(extension.lower() in url.lower() for extension in media_extensions):
            fixed_urls.append(url)
    soup.decompose()
    return fixed_urls
def process_user(username, fullname):
    filename = 'github/{}.csv'.format(username)
    filename_tmp = '{}.tmp'.format(filename)
    with open(filename_tmp, 'a'):
        os.utime(filename_tmp, None)
    uri_param = httplib2.iri2uri(fullname.replace(' ', '+'))
    url = u'{}/search?q={}&type=Users'.format(GITHUB_URL, uri_param)
    text = read_page(url)
    soup = BeautifulSoup(text)
    user_info = soup.find(class_='user-list-info')
    if not user_info:
        os.rename(filename_tmp, filename)
        soup.decompose()
        return
    a = user_info.find('a')
    github_username = a['href'][1:]
    with open(filename_tmp, 'w') as f:
        f.write(github_username + '\n')
    print "link stackoverflow '{}' to github '{}'".format(
        username, github_username)
    soup.decompose()
    commits = process_days(github_username, filename_tmp)
    os.rename(filename_tmp, filename)
    if github_username in CACHE:
        del CACHE[github_username]
def feed(self, data): # truncate first to save memory self.dom.truncate(0) # for python3 compatibility self.dom.seek(0) soup = BeautifulSoup(data, 'html5lib') # since soup() is not a generator, it should be fine to iterate and # edit # handle code block for div in soup.select('div[class^=highlight-]'): self.handle_highlight(div, soup) # kill useless navigation for div in soup.select('div.related'): for child in div.children: if child.name in ['h1', 'h2', 'h3', 'h4', 'h5']: if child.text == 'Navigation': div.decompose() break # kill table of content navigation for div in soup.select('div.sphinxsidebarwrapper'): for child in div.children: if child.name in ['h1', 'h2', 'h3', 'h4', 'h5']: if child.text == 'Table Of Contents': div.decompose() break # filter and edit tags for tag in soup(): if tag.name not in ALLOWED_TAGS: if tag.name in MERCiFUL_TAG: tag.replace_with_children() continue tag.decompose() continue final_attr = dict() if tag.has_attr('id'): final_attr['id'] = tag['id'] if tag.has_attr('style'): final_attr['style'] = tag['style'] result = getattr(self, 'handle_' + tag.name, self.handle_default)(tag, final_attr) if result is False: tag.decompose() continue elif result is True: continue tag.attrs.clear() tag.attrs.update(final_attr) self.output_dom(soup.html) self.dom.seek(0) soup.decompose()
def scrape_bfi_films(voters_list, filmid_manual_dict): # initialize lists of films with header labels film_list = [['filmid', 'title', 'director', 'country', 'year', 'genre', 'type', 'category']] # add manual filmids to list for k,v in filmid_manual_dict.items(): film_list.append([v[0], k.encode('UTF-8'), v[2].encode('UTF-8'), '', v[1].encode('UTF-8'), '', '', '']) # get list of unique filmids from voter_list filmid_list = [] for i in voters_list: for j in i[5:-1]: filmid_list.append(j) filmid_list = set(filmid_list) # visit each of the film webpages for filmid in filmid_list: if str(filmid)[0] != '4': continue film_soup = BeautifulSoup(requests.get(film_url+str(filmid)).content, 'lxml') # extract film title and append with film id film_info = [filmid, film_soup.find('title').contents[0].split('(')[0].strip().encode('UTF-8')] # extract director(s) try: film_info.append(" & ".join([director.text for director in film_soup.find('p', text=re.compile('Director.*'), attrs={'class':'row-label'}).findNext('p').findAll('a')]).encode('UTF-8')) except: film_info.append('') # extract country(ies) try: film_info.append(" & ".join([country.text for country in film_soup.find('p', text=re.compile('Countr.*'), attrs={'class':'row-label'}).findNext('p').findAll('span')]).encode('UTF-8')) except: film_info.append('') # extract year, genre, type, and category for k in ['Year', 'Genre', 'Type', 'Category']: try: film_info.append(film_soup.find('p', text=k, attrs={'class':'row-label'}).findNext('p').find('span').contents[0].encode('UTF-8')) except: film_info.append('') # append info on this single film to the list of all films film_soup.decompose() film_list.append(film_info) print(film_info) # write film info to csv with open(csv_dir+'/bfi-films.csv', 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) writer.writerows(film_list) f.close() return film_list
def get_rows(self, html_path):
    """Return rows for locations."""
    html_file = open(html_path, 'rb')
    soup = BeautifulSoup(html_file.read(), "html.parser")
    rows = self.parse_rows(soup)
    soup.decompose()
    html_file.close()
    return rows
def get_top_movie_list(self, source_url):
    """
    Fetch the list of the top 250 recommended classic movies.
    :param source_url: source url
    :return:
    """
    curl = pycurl.Curl()
    curl.setopt(pycurl.USERAGENT, user_agent)
    curl.setopt(pycurl.REFERER, refer_url)
    page = 0
    extra_url = ""
    target_list = []
    while 1:
        print 'Processing page %d' % (page + 1)
        target_url = source_url + extra_url
        buffers = StringIO()
        curl.setopt(pycurl.URL, target_url)
        curl.setopt(pycurl.WRITEDATA, buffers)
        curl.perform()
        body = buffers.getvalue()
        buffers.close()
        soup = BeautifulSoup(body, "html.parser")
        content = soup.find('div', {'id': 'content'})
        clear_fix = content.find('div', {'class': 'article'})
        subject_list = clear_fix.find('ol', {'class': 'grid_view'}).findAll('li')
        # probe for the number of pages: an empty list means we are past the last page
        if not subject_list:
            soup.decompose()
            break
        for item in subject_list:
            # get the movie name and rating
            name = replace_pattern.sub('', item.find('div', {'class': 'hd'}).find('a').text)
            # handle the special case of fewer than 10 ratings
            rate_0 = item.find('div', {'class': 'star'}).find('span', {'class': 'rating_num'})
            if not rate_0:
                continue
            rates = replace_pattern.sub('', rate_0.text)
            target_list.append((name, rates))
        # decompose only after the extracted tags have been read
        soup.decompose()
        page += 1
        extra_url = "?start=%d&filter=" % (25 * page)
    print('Finished processing the last page')
    curl.close()
    file_name = '%s.txt' % u"top250电影集合"
    self.write_to_file(target_list, u"top250电影/", file_name)
def get_book_list(self, category):
    """
    Fetch the book list for the given category.
    :param category: category
    :return:
    """
    curl = pycurl.Curl()
    curl.setopt(pycurl.USERAGENT, user_agent)
    curl.setopt(pycurl.REFERER, refer_url)
    page = 0
    while 1:
        print 'Processing page %d' % (page + 1)
        url = 'http://book.douban.com/tag/%s?start=%d' % (category, page * 20)
        buffers = StringIO()
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEDATA, buffers)
        curl.perform()
        body = buffers.getvalue()
        buffers.close()
        soup = BeautifulSoup(body, "html.parser")
        content = soup.find('div', {'id': 'subject_list'})
        # probe for the number of pages: a missing clearfix means we are past the last page
        clear_fix = content.find('div', {'class': 'clearfix'})
        if not clear_fix:
            soup.decompose()
            break
        subject_list = content.find('ul', {'class': 'subject-list'}).findAll('li', {'class': 'subject-item'})
        target_list = []
        for item in subject_list:
            # get the book title, rating and publication info
            name = replace_pattern.sub('', item.find('h2').find('a').text)
            pub = replace_pattern.sub('', item.find('div', {'class': 'pub'}).text)
            # handle the special case of fewer than 10 ratings
            rate_0 = item.find('div', {'class': 'star clearfix'}).find('span', {'class': 'rating_nums'})
            if not rate_0:
                continue
            rates = replace_pattern.sub('', rate_0.text)
            # keep books rated above 8.5
            if float(rates) > 8.5:
                target_list.append((name, rates, pub))
        # decompose only after the extracted tags have been read
        soup.decompose()
        file_name = '%s%d.txt' % (category, page + 1)
        page += 1
        self.write_to_file(target_list, '%s/' % category, file_name)
    curl.close()
    print('Finished processing the last page')
def parse_html_(self, text):
    """ Helper function for dealing with an HTML document """
    soup = BeautifulSoup(text, 'lxml')
    title = soup.find('title').text
    body = self.parse_body_(soup)
    # Get rid of the soup
    soup.decompose()
    del soup
    return {
        'title': title,
        'body': body
    }
def crawl_cate(cate_url, f):
    request = urllib2.Request(cate_url, "", header)
    cate_page = urllib2.urlopen(request)
    shoplist_ss = SoupStrainer('a', attrs={"class": "BL", "href": re.compile('shop')})
    shoplist = BeautifulSoup(cate_page, parseOnlyThese=shoplist_ss)
    i = 0
    for a in shoplist:
        i = i + 1
        print "shop:%d" % i
        shop_url = "http://www.dianping.com/%s" % a.get('href')
        if shop_url in shop_list:
            continue
        else:
            shop_list[a.string] = 1
            f.write("%s\t%s\t" % (a.string.encode('utf-8'), shop_url.encode('utf-8')))
            crawl_shop(shop_url, f)
    shoplist.decompose()
def get_rows(self, html_path):
    """
    Return rows for details table HTML.

    :param html_path: A path to a sale file. Ex. '/path/OPR123456789.html'
    :type html_path: string
    :returns: A list of the rows in the details table.
    """
    html_file = open(html_path, 'rb')
    soup = BeautifulSoup(html_file.read(), "html.parser")
    rows = self.parse_rows(soup)
    soup.decompose()
    html_file.close()
    return rows
def crawlPage(site, title, maxDepth, pages, links, restricted = False, siteBase = ""): global titles try: print("Crawling " + site + ", with maxDepth = " + str(maxDepth)) http = httplib2.Http() status, response = http.request(site) soupPage = BeautifulSoup(response, "html.parser", parse_only=SoupStrainer('a')) for link in soupPage: if link.has_attr('href'): linkedPage = link['href'] linkedPage = urljoin(site, linkedPage) print("Getting title for " + linkedPage) try: if not linkedPage in titles: soup = BeautifulSoup(urllib2.urlopen(linkedPage), "html.parser") linkTitle = soup.title.string soup.decompose() #titles[linkedPage] = linkTitle else: linkTitle = titles[linkedPage] links.add((title, linkTitle)) if not linkTitle in pages and not "youtube" in linkedPage and not (restricted and not siteBase in linkedPage): pages.add(linkTitle) if (maxDepth > 1): crawlPage(linkedPage, linkTitle, maxDepth-1, pages, links, restricted, siteBase) except Exception as e: print("Error parsing " + linkedPage + "! {0}".format(e)) links.add((title, linkedPage[linkedPage.find("http:\\")+7:])) if not linkedPage[linkedPage.find("http:\\")+7:] in pages and not (restricted and not siteBase in linkedPage): pages.add(linkedPage[linkedPage.find("http:\\")+7:]) if (maxDepth > 1): crawlPage(linkedPage, linkTitle, maxDepth-1, pages, links, restricted, siteBase) #pages.add(linkedPage) soupPage.decompose() except Exception as e: print ("Error on site " + site + ": {0}".format(e)) gc.collect()
def SpiderBlogWeb(self, blogURL, refererWeb, msg):
    headers = Util.GetHeaders(Host=self.HostURL, Referer=refererWeb)
    req = Util.RequestURLByGet(blogURL, headers, self.cookies)
    print msg
    if req:
        soup = BeautifulSoup(req.text, 'lxml')
        elements = soup.findAll(src=self.re_pattern_blogimg)
        threads = []
        for ele in elements:
            photoURL = ele.get('src')
            # needDownload = Util.DownloadFile(photoURL, self.PhotoPath)
            threads.append(gevent.spawn(Util.DownloadFile, photoURL, self.PhotoPath, True))
            # print str(needDownload), photoURL
        gevent.joinall(threads)
        soup.decompose()
        # self.fileBlog.write(blogURL + '\n')
        # if len(elements) > 0:
        #     print ''
    time.sleep(random.uniform(self.SleepTimeMin, self.SleepTimeMax))
def parse(self):
    """
    The main parse method. Need to call me before getting output.
    """
    self.result_storage = []
    for xml_file in self.xml:
        # read in nessus xml file, turn into soup obj, and validate
        xml_obj = open(xml_file, 'r')
        xml_parser = BeautifulSoup(xml_obj, 'xml')
        self.ValidateXML(xml_parser)
        # iterate through hosts found in report
        for target in xml_parser.find_all('ReportHost'):
            # skip hosts that are caught in the filter
            if self.FilterThisHost(target) == True:
                continue
            # turn host into a host object
            target_obj = HostObject(target, self.troll_user_check, self.troll_link)
            # iterate over vulnerabilities of that host
            for vuln in target.find_all('ReportItem'):
                # filter out open ports and insert into HostObject
                if (vuln['pluginFamily'] == "Port scanners" and vuln['severity'] == "0"):
                    target_obj.insert_report_item(vuln)
                # if the vulnerability is not filtered, feed into Hostobj
                elif self.FilterThisVuln(vuln) == False:
                    target_obj.insert_report_item(vuln)
            # if any vulnerabilities exist in that host, append that to
            # a list of host objects
            if target_obj.vulns:
                self.result_storage.append(target_obj)
        # dismantle parser object, close xml file to free memory
        xml_parser.decompose()
        xml_obj.close()
    # close out the link to the inventory system lookup DB
    self.troll_link.Close()
def crawlpage(url, key, currdepth):
    currUrls = []
    append = currUrls.append
    join = urllib.parse.urljoin
    try:
        htmltext = urllib.request.urlopen(url)
        # 1 second delay as courtesy
        time.sleep(1)
        the_text = htmltext.read()
        # converting the text to lowercase for keyphrase search
        textStr = str(the_text).lower()
    except:
        print('no idea what is happening ', sys.exc_info())
        # nothing was fetched, so there is nothing to parse
        return currUrls
    soup = BeautifulSoup(the_text)
    soup.prettify('utf-8')
    # get canonical link from the document
    canon = soup.find("link", {"rel": "canonical"})
    canonicalurl = canon['href']
    # we will be dealing with the canonical url and not with the url given in the document
    if canonicalurl not in visited and key.lower() in textStr:
        visited.add(canonicalurl)
        file.write(canonicalurl + '\n')
    # Design decision:
    # We will not need any links from pages at depth 3, so I will be skipping them.
    # This increases the speed of the program by at least 10%.
    # Not a good idea for an actual crawler.
    if currdepth != 2:
        for tag in soup.findAll('a', href=True):
            x = tag['href']
            link = join(url, x)
            # trim the string at '#'
            # not strictly needed because of canonical urls,
            # used as an added performance enhancement
            link = link.split('#')[0]
            if validLink(link):
                append(link)
    soup.decompose()
    htmltext.close()
    return currUrls
def fetch(url):
    items = []
    req = Request(url)
    req.add_header('User-Agent', UserAgent().random)
    conn = urlopen(req)
    document = BeautifulSoup(conn.read(), 'html.parser')
    conn.close()
    for table in document.findAll('table', {'class': 'wikitable'}):
        for row in table.findAll('tr', {'id': True}):
            columns = row.findAll('td')
            item = {}
            item['name'] = columns[0]['data-sort-value']
            item['type'] = columns[1]['data-sort-value']
            item['level'] = columns[2].text
            try:
                description = []
                for modifier in columns[len(columns) - 1].div.findAll('span'):
                    for text_line in modifier.stripped_strings:
                        description.append(text_line)
                item['description'] = description
            except AttributeError:
                print('ERROR parsing item', item)
                item['description'] = []
            finally:
                items.append(item)
    document.decompose()
    return items
class ReviewExtractor(object):
    """
    Wraps an XML parser to extract data particularly from each review in a
    Goodreads review.xml file. Uses BeautifulSoup to simplify parsing.
    """

    def __init__(self, path):
        self.path = path
        self.stream = None
        self.soup = None

    def open(self):
        self.stream = open(self.path, 'rb')
        self.soup = BeautifulSoup(self.stream, 'xml')

    def close(self):
        if self.stream:
            self.stream.close()    # Release file handle
        if self.soup:
            self.soup.decompose()  # Drop the XML out of memory
        self.stream = None         # Force garbage collection
        self.soup = None           # Force garbage collection

    def __enter__(self):
        """
        Open a stream to the wrapped xml file and return the extractor for use
        in contextual with ... as statements (and ensure close).
        """
        self.open()
        return self

    def __exit__(self, type, value, tb):
        """
        Ensure any open streams are closed before exiting a context block.
        """
        self.close()

    def __iter__(self):
        if not self.soup:
            raise Exception("No handle to an xml soup object!")
        for review in self.soup.find_all('review'):
            yield Review(review)
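# Since ReviewExtractor implements both the context-manager and iterator
# protocols, a minimal usage sketch looks like the following; the file name
# 'reviews.xml' is illustrative and Review objects are assumed to be printable:
with ReviewExtractor('reviews.xml') as extractor:
    for review in extractor:
        print(review)  # each item wraps one <review> element
# on exit, close() has already decomposed the soup and released the file handle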
def get_newznab_categories(self): """ Uses the newznab provider url and apikey to get the capabilities. Makes use of the default newznab caps param. e.a. http://yournewznab/api?t=caps&apikey=skdfiw7823sdkdsfjsfk Returns a tuple with (succes or not, array with dicts [{"id": "5070", "name": "Anime"}, {"id": "5080", "name": "Documentary"}, {"id": "5020", "name": "Foreign"}...etc}], error message) """ return_categories = [] if not self._check_auth(): return False, return_categories, "Provider requires auth and your key is not set" params = {"t": "caps"} if self.needs_auth and self.key: params['apikey'] = self.key url = posixpath.join(self.url, 'api?') + urlencode(params) data = self.get_url(url) if not data: error_string = u"Error getting xml for [%s]" % url logger.log(error_string, logger.WARNING) return False, return_categories, error_string data = BeautifulSoup(data, 'html5lib') if not (self._checkAuthFromData(data) and data.caps and data.caps.categories): data.decompose() error_string = u"Error parsing xml for [%s]" % self.name logger.log(error_string, logger.DEBUG) return False, return_categories, error_string for category in data.caps.categories.find_all('category'): if category.attrs and 'TV' in category.attrs.get('name', '') and category.attrs.get('id', ''): return_categories.append({'id': category.attrs['id'], 'name': category.attrs['name']}) for subcat in category.find_all('subcat'): if subcat.attrs and subcat.attrs.get('name', '') and subcat.attrs.get('id', ''): return_categories.append({'id': subcat.attrs['id'], 'name': subcat.attrs['name']}) data.decompose() return True, return_categories, ""
def get_newznab_categories(self):
    """
    Uses the newznab provider url and apikey to get the capabilities.
    Makes use of the default newznab caps param.
    e.a. http://yournewznab/api?t=caps&apikey=skdfiw7823sdkdsfjsfk
    Returns a tuple with (success or not, array with dicts
    [{"id": "5070", "name": "Anime"}, {"id": "5080", "name": "Documentary"},
    {"id": "5020", "name": "Foreign"}...etc}], error message)
    """
    return_categories = []
    if not self._check_auth():
        return False, return_categories, "Provider requires auth and your key is not set"
    params = {"t": "caps"}
    if self.needs_auth and self.key:
        params["apikey"] = self.key
    url = ek(os.path.join, self.url, "api?") + urllib.urlencode(params)
    data = self.get_url(url)
    if not data:
        error_string = u"Error getting xml for [%s]" % url
        logger.log(error_string, logger.WARNING)
        return False, return_categories, error_string
    data = BeautifulSoup(data, "html5lib")
    # parenthesised so the error branch triggers when any of the checks fails
    if not (self._checkAuthFromData(data) and data.caps and data.caps.categories):
        data.decompose()
        error_string = u"Error parsing xml for [%s]" % self.name
        logger.log(error_string, logger.DEBUG)
        return False, return_categories, error_string
    for category in data.caps.categories.findAll("category"):
        if hasattr(category, "attrs") and "TV" in category.attrs["name"]:
            return_categories.append({"id": category.attrs["id"], "name": category.attrs["name"]})
            for subcat in category.findAll("subcat"):
                return_categories.append({"id": subcat.attrs["id"], "name": subcat.attrs["name"]})
    data.decompose()
    return True, return_categories, ""
def process(f): p = ET.Element('paper') fire = open(f) soup = BeautifulSoup(fire) dirname = os.path.dirname(f) + '/' outdir = dirname.replace('Nature', 'Nature_Processed') outfile = os.path.join(outdir, f.replace(dirname, '') + '.xml') try: t = ET.SubElement(p, 'title') t.text = soup.find(partial(get, "citation_title"))['content'] dt = ET.SubElement(p, 'date') dt.text = soup.find(partial(get, "citation_date"))['content'] d = ET.SubElement(p, 'doi') d.text = soup.find(partial(get, "citation_doi"))['content'][4:] a = ET.SubElement(p, 'authors') authors = None authors = soup.find_all(partial(get, "DC.creator")) if authors == []: authors = soup.find_all(partial(get, "dc.creator")) if authors == []: authors = soup.find(partial(get, "citation_authors"))['content'] authors = [x.strip() for x in authors.split(',')] for aut in authors: au = ET.SubElement(a, 'author') au.text = aut['content'] except: fire.close() total_failures.append(f) print "Basics wrong." return k = ET.SubElement(p, 'keywords') try: keywords = soup.find(partial(get, 'keywords'))['content'].split(',') keywords = map(lambda x: x.strip(), keywords) for keyword in keywords: if 'nature' in keyword: continue kwd = ET.SubElement(k, 'keyword') kwd.text = keyword except: pass k2 = ET.SubElement(p, 'article-keywords') keywords = soup.find(class_='article-keywords') if keywords is None: keywords = soup.find(class_="category") if keywords is not None: for keyword in [x for x in re.split('\s*', keywords.text) if x != '']: if ':' in keyword: continue kdw = ET.SubElement(k2, 'keyword') kdw.text = keyword.strip() ab = ET.SubElement(p, 'abstract') try: ab.text = soup.find(id="abs").text except: try: ab.text = soup.find(id="abstract").text except: ab.text = '' refs = soup.find_all(is_bib) r_tag = ET.SubElement(p, 'has-references') if refs != []: r_tag.text = 'Y' else: r_tag.text = 'N' r = ET.SubElement(p, 'references') for ref in refs: rf = ET.SubElement(r, 'reference') rft = ET.SubElement(rf, 'title') rfa = ET.SubElement(rf, 'authors') rfj = ET.SubElement(rf, 'journal') rfy = ET.SubElement(rf, 'year') rfd = ET.SubElement(rf, 'doi') rfu = ET.SubElement(rf, 'url') ref_sp = ref.text.strip() if ref_sp[0] == '.': ref_sp = ref_sp[1:].strip() authors_0 = [] next_index = 0 try: while True: if ref_sp[next_index] in [',','.',':',';']: authors_0.append(ref_sp[0:next_index]) if ref_sp[next_index] in ['.',':',';']: ref_sp = ref_sp[(next_index + 1):].strip() break ref_sp = ref_sp[(next_index + 1):].strip() next_index = 0 else: next_index += 1 except: failures_0.append(f) continue # at this point we have removed the names journal_0 = ref.find(class_='journal') if journal_0 is None: failures.append(f) continue journal_0 = journal_0.text ind = ref_sp.find(journal_0) title_0 = ref_sp[0:ind] ref_sp = ref_sp[ind:] mtch = year_regex.search(ref_sp) try: year_0 = mtch.group(0)[:-1] except: year_0 = "-1" doi_0 = '' url_0 = '' links = ref.find_all(class_='reftxt') for link in links: mtch = doi_regex.search(link['href']) if mtch is None: continue else: doi_0 = mtch.group(0) break if doi_0 == '': try: url_0 = links[0]['href'] except: pass rft.text = title_0 rfj.text = journal_0 rfy.text = year_0 rfd.text = doi_0 rfu.text = url_0 for a_0 in authors_0: if 'et al' in a_0: continue rfau = ET.SubElement(rfa, 'author') rfau.text = a_0 tree = ET.ElementTree(p) if not os.path.exists(outdir): os.makedirs(outdir) tree.write(outfile, pretty_print=True) fire.close() soup.decompose() tree = soup = None gc.collect()
def getwords(page, mdict, words, dref): pgc = SoupStrainer('dl') dl = BeautifulSoup(page, parse_only=pgc).dl formatcontent(dl) for a in dl.find_all('a', href=re.compile('http://www.etymonline.com/[^\.]+\.php$')): href = a['href'] p = re.compile(r'/([^\.]+)\.php', re.I) m = p.search(href) assert m word = ''.join(['appendix-', m.group(1)]) a['href'] = ''.join(['entry://', word]) if not word in dref: print href dref[word] = None worddef = makeappdx(getpage(href, '')) words.append([word, worddef]) dts = dl.find_all('dt') l = len(dts) dds = dl.find_all('dd') assert l==len(dds) for i in xrange(0, l): word = dts[i].a.string.strip() dd = dds[i] dd.name = 'div' dd['class'] = 'FRe' worddef = cleansp(dd.encode('utf8')) pos = word.find('(') prop = None if pos > 0: p = re.compile(r'\(((?:[a-zA-Z \,\.]+?)?)[\.,]?(\d*)\.?\)', re.I) m = p.search(word[pos:].replace('./', '., ')) assert m prop = m.group(1).rstrip() if prop: prop += '.' worddef = [m.group(2), worddef] word = word[:pos].rstrip() if word in mdict: idx = mdict[word] df = words[idx][1] if isinstance(df, OrderedDict): if prop in df: df[prop].append(worddef) else: df[prop] = [worddef] else: if prop: od = OrderedDict() od[''] = [['', '<div class="tHO"></div>'.join([df, ''])]] od[prop] = [worddef] words[idx][1] = od else: words[idx][1] = '<div class="tHO"></div>'.join([df, worddef]) else: mdict[word] = len(words) if prop!=None: od = OrderedDict() od[prop] = [worddef] words.append([word, od]) else: words.append([word, worddef]) dl.decompose()