def get_movies_near_you(self):
    """
    Returns the list of movies playing near you,
    including the theaters and showtimes for each movie
    """
    soup = get_soup('/showtimes/')
    list_movies = soup.find_all(class_='list_item')
    movies = []
    for item in list_movies:
        movie = {}
        page = item.find('a')['href']
        soup = get_soup(page)
        movie_info = soup.select('.article > .article .overview-top')[0]
        # fill in the movie info
        movie['title'] = movie_info.h4.a.text
        movie['url'] = movie_info.h4.a['href']
        movie['runtime'] = movie_info.p.time.text
        movie['ratingValue'] = movie_info.select(
            '.rating_txt meta[itemprop="ratingValue"]')[0]['content']
        movie['description'] = movie_info.select('.outline')[0].text.strip()
        # add the list of theaters where the movie is playing
        movie['theaters'] = []
        list_theaters = soup.find_all(class_='list_item')
        for theater_item in list_theaters:
            theater = {}
            theater['name'] = theater_item.select('h3 > a > span')[0].text
            # parse address and contact info
            properties = [
                ('address', 'streetAddress'),
                ('city', 'addressLocality'),
                ('postalCode', 'postalCode'),
                ('phone', 'telephone'),
            ]
            for obj_prop, item_prop in properties:
                theater[obj_prop] = theater_item.select(
                    '.address span[itemprop="%s"]' % item_prop
                )[0].text
            # add today's showtimes
            showtimes = theater_item.find(class_='showtimes').select(
                'meta[itemprop="startDate"]')
            theater['showtimes'] = [x['content'] for x in showtimes]
            movie['theaters'].append(theater)
        movies.append(movie)
    return movies
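# Every snippet in this file depends on a `get_soup` helper defined elsewhere.
# Its signature varies between snippets (some pass query params, some pass a
# response object, and get_supplier_spg_df/get_pg_df unpack a (response, soup)
# tuple), so the following is only a minimal sketch of the simplest path-based
# form, assuming requests + BeautifulSoup; BASE_URL is a hypothetical
# placeholder, not the original constant.
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.example.com'  # hypothetical

def get_soup(path='', params=None):
    """Fetch BASE_URL + path and return a parsed tree, or None on failure."""
    try:
        response = requests.get(BASE_URL + path, params=params, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None
    return BeautifulSoup(response.text, 'html.parser')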
def get_articles(self, issue=''):
    """
    Returns a list of articles from the given issue.
    """
    soup = get_soup()
    # get soup of all articles
    issues = soup.find_all('ul')
    # validate issue and fall back to the latest issue (issue=1)
    if not isinstance(issue, int) or issue < 1:
        issue = 1
    if issue > len(issues):
        issue = len(issues)
    # the latest issue is the last <ul> element
    articles = issues[len(issues) - issue].find_all('a')
    m_articles = []
    for article in articles:
        m_article = {}
        m_article['link'] = article.get('href')[1:]
        m_article['title'] = article.find('li').contents[0].strip()
        m_article['author'] = article.find('span').contents[0]
        m_articles.append(m_article)
    return m_articles
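# A toy check of the issue indexing used above: the newest issue is the last
# <ul>, so issue=1 must select issues[-1]. The list literal is made up.
issues = ['oldest', 'middle', 'latest']
for issue in (1, 2, 3):
    assert issues[len(issues) - issue] == issues[-issue]
print(issues[len(issues) - 1])  # -> 'latest'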
def get_weekly_summary(self, alias):
    url = self.BOMURL + "/?page=weekly&id=" + alias + ".htm"
    soup = utils.get_soup(url)
    if soup is not None:
        return movie.Weekly(soup)
    print("Not able to parse url: " + url)
def get_weekly_summary(self, url_or_id):
    if 'http' in url_or_id.lower():
        url = url_or_id
    elif url_or_id in self.movie_urls:
        url = self.BOMURL + "/?page=weekly&id=" + url_or_id + ".htm"
    else:
        print("Invalid movie name or URL:", url_or_id)
        return
    soup = utils.get_soup(url)
    if soup is not None:
        return movie.Weekly(soup)
    print("Not able to parse url: " + url)
def get_posts(self, trait='', limit=15):
    # validate limit
    if limit is None or limit < 1 or limit > 30:
        limit = 15
    # 'trending' is the default page, so it maps to an empty trait
    if trait == 'trending' or trait not in ['latest', 'must-read', 'discussions', 'jobs', 'companies']:
        trait = ''
    posts = 0
    # fetch `limit` posts from the trait page
    soup = get_soup('')
def get_movie_summary(self, alias):
    self.alias = alias
    url = self.BOMURL + "/?page=main&id=" + self.alias + ".htm"
    soup = utils.get_soup(url)
    if soup is not None:
        return movie.Movie(soup)
    print("Not able to parse url: " + url)
def from_post_id(self, post_id):
    soup = get_soup(page=post_id)
    title_link = soup.find('h1', class_='title post-item-title').find('a')
    title = title_link.contents[0]
    url = title_link.get('href')
    post_info = soup.find('span', class_='post-item-info')
    date = post_info.contents[0].split('in')[0].strip()
    category = Category.from_soup(post_info)  # TODO: do this with a regexp
    author_id = soup.find('a', {'rel': 'author'}).get('href').split('/')[-2]
    votes = soup.find('div', class_='score2').find('p').contents[0]
    comments = ''  # TODO: parse comments
    return Post(post_id, title, url, date, category, author_id, votes, comments)
def get(self, cik, year):
    filings = get_annual_sd_filings_from_cik(cik)
    url = filings[year]['url']
    if not url:
        return {}
    soup = get_soup(url)
    if not soup:
        return {}
    meta_dict = get_meta_info_from_soup(soup)
    meta_dict.update({'req_cik': cik, 'req_year': year})
    docs = self._get_docs_from_soup(soup, meta_dict)
    return docs
def search_movie(self, query):
    """
    Returns the list of results of a movie search, each one containing
    the image, title and link to its IMDb page
    """
    soup = get_soup(SEARCH_MOVIE_TITLE, {'q': query})
    results = []
    for item in soup.find_all(class_="findResult"):
        result = {}
        result['text'] = item.find('td', class_="result_text").text.strip()
        result['url'] = item.find('td', class_="result_text").a['href']
        result['image'] = item.find('td', class_="primary_photo").a.img['src']
        results.append(result)
    return results
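# The findResult parsing above, exercised on a minimal inline fixture; the
# HTML below is a hand-written approximation of IMDb's result markup, not
# captured output.
from bs4 import BeautifulSoup

fixture = ('<table><tr class="findResult">'
           '<td class="primary_photo"><a href="/title/tt0000001/">'
           '<img src="poster.jpg"/></a></td>'
           '<td class="result_text"><a href="/title/tt0000001/">Example Title</a>'
           ' (2020)</td></tr></table>')
item = BeautifulSoup(fixture, 'html.parser').find(class_='findResult')
print(item.find('td', class_='result_text').text.strip())    # Example Title (2020)
print(item.find('td', class_='result_text').a['href'])       # /title/tt0000001/
print(item.find('td', class_='primary_photo').a.img['src'])  # poster.jpg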
def _category_search(self, category_url, query, lucky):
    """
    Returns a list of results of a category search on IMDb.
    Each result is a dictionary containing the primary image, title,
    link to the result's IMDb page and its id.
    """
    results = []
    soup = get_soup(category_url, {'q': query})
    if lucky:
        # return only the first search result
        first = soup.find(class_="findResult")
        results.append(self._search_title_results_parser(first))
    else:
        for item in soup.find_all(class_="findResult"):
            results.append(self._search_title_results_parser(item))
    return results
def _parse_urls_from_page(base_url, page):
    # attribute values must be quoted: they contain ':', '/' and '?'
    url_patterns = ('a[href^="https://news.naver.com/main/read.nhn?"]',
                    'a[href^="https://entertain.naver.com/main/read.nhn?"]',
                    'a[href^="https://sports.news.naver.com/sports/index.nhn?"]',
                    'a[href^="https://news.naver.com/sports/index.nhn?"]')
    urls_in_page = set()
    page_url = '{}&start={}&refresh_start=0'.format(base_url, 1 + 10 * (page - 1))
    soup = get_soup(page_url)
    if not soup:
        return urls_in_page
    try:
        article_blocks = soup.select('ul[class="type01"]')[0]
        for pattern in url_patterns:
            article_urls = [link['href'] for link in article_blocks.select(pattern)]
            urls_in_page.update(article_urls)
    except Exception as e:
        raise ValueError('Failed to extract urls from page: %s' % str(e))
    return urls_in_page
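# Why the attribute values above are quoted: soupsieve (the CSS engine behind
# BeautifulSoup's select()) rejects unquoted attribute values containing
# characters like ':', '/' or '?'. A self-contained demonstration with
# made-up markup:
from bs4 import BeautifulSoup

html = ('<ul class="type01"><li>'
        '<a href="https://news.naver.com/main/read.nhn?oid=1&aid=2">x</a>'
        '</li></ul>')
soup = BeautifulSoup(html, 'html.parser')
links = soup.select('a[href^="https://news.naver.com/main/read.nhn?"]')
print([link['href'] for link in links])  # the unquoted form raises a selector error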
def save_and_upload_language(language_link, num_workers=1):
    """
    Iterates through the RawWikiLinks data for the given language and
    uploads each file to a GCS bucket.

    Parameters
    ----------
    language_link : str
        Link to the files storing the RawWikiLinks data for a single language.
    num_workers : int, optional
        Number of workers to use in the ThreadPool for parallelization,
        defaults to 1.
    """
    soup = utils.get_soup(language_link)
    # the href filter receives the attribute value (possibly None), not the tag
    file_links = soup.find_all("a", href=lambda href: href and href.endswith(".csv.gz"))
    if verbose:
        print("{} files to download...".format(len(file_links)))
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for file_link in file_links:
            download_link = language_link + file_link.get("href")
            executor.submit(save_and_upload_file, download_link)
def get_station_data(station):
    station_path = settings.SRC[0] + station['station_href']
    station_soup = get_soup(station_path)
    if station_soup:
        description = station_soup.find('div', {
            'class': 's-property-content'
        }).find('p').text
        lat = description.split('широта ')[1].replace('\n', '').split('°')[0]
        lon = description.split('долгота ')[1].replace('\n', '').split('°')[0]
        address = description.split('адресу ')[1].split(', вы')[0]
        fuel_headers = station_soup.find_all(
            'span', {'class': 'col-xs-6 col-sm-4 col-md-4 add-d-title'})
        fuel_details = station_soup.find_all(
            'span', {'class': 'col-xs-6 col-sm-8 col-md-8 add-d-entry'})
        station_fuel = {}
        for index in range(len(fuel_headers)):
            key = fuel_headers[index].text
            try:
                date_last_updated = date_translate(
                    fuel_details[index].text.strip().split(
                        'обновлено ')[1].split(' г.)')[0])
                date_last_updated = int(
                    time.mktime(
                        datetime.datetime.strptime(date_last_updated,
                                                   "%d %m %Y").timetuple()))
            except IndexError:
                date_last_updated = None
            station_fuel[key] = dict(
                cost=fuel_details[index].text.strip().split(' ')[0],
                updated=date_last_updated)
        return dict(fuel=station_fuel,
                    href=station['station_href'],
                    name=station['station'],
                    region=station['region'],
                    city=station['city'],
                    network=station['network'],
                    address=address,
                    lat=lat,
                    lon=lon)
    return False
def get_direct_video_url(gogo_url):
    soup = utils.get_soup(gogo_url)
    if not soup:
        outputs.error_info("The video doesn't exist.")
        raise SystemExit
    iframe = soup.find('iframe')
    if not iframe:
        outputs.error_info("The video doesn't exist.")
        raise SystemExit
    php_l = iframe['src']
    ajx_l = ajax_t.substitute(q=php_l.split('?')[1])
    r = requests.get(ajx_l)
    try:
        link = json.loads(r.text)['source_bk'][0]['file']
    except (IndexError, KeyError, TypeError) as e:
        outputs.error_info('Unexpected error while obtaining stream url.')
        outputs.error_info(f'ERR: {e}')
        raise SystemExit
    _, ext = os.path.splitext(link)
    if ext == '.m3u8':
        link = utils.get_m3u8_stream(link)
    return link, ext
def _get_urls_from_breaking_news(self):
    import time
    base_url = 'http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1={}&date={}&page={}'
    yymmdd = self.year + self.month + self.date
    links_in_all_sections = set()
    for sid1 in self.sid1_list:
        links_in_a_section = set()
        last_links = set()
        page = 1
        while page < 1000:
            url = base_url.format(sid1, yymmdd, page)
            soup = get_soup(url)
            links = soup.select('div[class^=list] a[href^=http]')
            links = [link.attrs.get('href', '') for link in links]
            links = {link for link in links
                     if 'naver.com' in link and 'read.nhn?' in link}
            # stop when a page repeats the previous page's links
            if last_links == links:
                break
            links_in_a_section.update(links)
            last_links = set(links)
            if self.verbose:
                print('\rpage = {}, links = {}'.format(
                    page, len(links_in_a_section)), flush=True, end='')
            page += 1
            if self.debug and page >= 3:
                break
            time.sleep(SLEEP)
        links_in_all_sections.update(links_in_a_section)
        if self.verbose:
            print('\rsection = {}, links = {}'.format(
                sid1, len(links_in_a_section)))
    print('date={} has {} news'.format(yymmdd, len(links_in_all_sections)))
    return links_in_all_sections
def scrap_one_article(article_data):
    url_poynter_article = article_data['uri']
    soup_poynter_article = get_soup(url_poynter_article)
    # defaults, so missing fields don't raise NameError below
    explanation = origin = checker = date = country = ''
    url_source_article = ''
    for p in soup_poynter_article.main.article.find_all('p'):
        t = p.get_text()
        if 'Explanation: ' in t:
            explanation = t.replace('Explanation: ', '')
        elif 'originated from:' in t:
            origin = t.split('originated from: ')[-1]
        elif 'Fact-checked by' in t:
            checker = t.replace('Fact-checked by: ', '')
        elif p.get('class') and '--topinfo' in p.get('class')[-1]:
            date, country = t.split(' | ')
    for a in soup_poynter_article.main.article.find_all('a'):
        if isinstance(a, NavigableString):
            continue
        elif 'Read the Full Article' in a.get_text():
            url_source_article = a.get('href')
    article_data['extraMeta']['explanation'] = clean_text(explanation)
    article_data['extraMeta']['origin'] = clean_text(origin)
    article_data['extraMeta']['checker'] = clean_text(checker)
    article_data['extraMeta']['date'] = date
    article_data['extraMeta']['country'] = clean_text(country)
    article_data['extraMeta']['url_source'] = url_source_article
    # disabled: fetch the source article and detect its language
    # language = get_language(url_source_article)
    # article_data['language'] = language
    # if language == 'english':
    #     soup = get_soup(url_source_article)
    #     content = soup.main.find(id='content')
    #     for p in content:
    #         if isinstance(p, NavigableString):
    #             pass
    #         else:
    #             body_text = p.get_text()
    # else:
    #     body_text = 'NONE-{}'.format(language)
    return article_data
def get_building_info(url):
    """Get building info.

    Args:
        url (str): url.

    Returns:
        list: house info dicts.
    """
    results = []
    soup = get_soup(url)
    if not soup:
        logging.warning('Cannot parse %s', url)
        return []
    items = soup.find_all('span', {'class': 'f12a6'})
    if items:
        for item in items:
            item = item.next_sibling.next_sibling
            house_url = 'http://bjjs.zjw.beijing.gov.cn' + item.get('href')
            house_info = get_house_info(house_url)
            house_info['url'] = house_url
            results.append(house_info)
    return results
def _get_article(link):
    try:
        link = '{}{}'.format(conf.MAIN_LINK, link)
        main_soup = get_soup(link)
        news_text = main_soup.find("div", attrs={
            'class': 'tn-news-text'
        }).text.replace('\n', ' ').replace('\r', '')
        news_title = main_soup.find("h1", attrs={
            'class': 'tn-content-title'
        }).text.replace('\n', ' ').replace('\r', '')
        datetime = main_soup.find("li", attrs={
            "class": "tn-hidden@t"
        }).text.replace('\n', ' ').replace('\r', '')
        view_count = _get_view_count(link)
    except Exception:
        news_text = 'unknown'
        news_title = 'unknown'
        view_count = 0
        datetime = dt.datetime.now()
    return news_text, news_title, datetime, view_count
def get_supplier_spg_df(supplier_path=SUPPLIER_PATH, spg_path=SPG_PATH):
    """
    supplier_df and spg_df have an N:M relationship.
    Create a join table between them.
    :return: supplier_spg_df, has columns=['supplier_spg_id', 'supplier_id', 'spg_id']
    """
    supplier_df = pd.read_excel(supplier_path)[[
        'supplier_id', 'supplier_url', 'supplier_code'
    ]]
    supplier_df = supplier_df[pd.notnull(supplier_df['supplier_code'])]
    spg_df = pd.read_excel(spg_path)[['spg_url_key', 'spg_id']]
    temp_frames = []
    for supplier_id_url in supplier_df.itertuples(index=True, name='Pandas'):
        supplier_id = getattr(supplier_id_url, 'supplier_id')
        supplier_url = getattr(supplier_id_url, 'supplier_url')
        _, supplier_soup = get_soup(supplier_url)
        all_li = supplier_soup.find('table', attrs={
            'id': 'table_arw_wrapper'
        }).find_all('li')
        temp_spg_list = [li.find('a')['href'].split('/')[-2] for li in all_li]
        temp_spg_df = pd.DataFrame(temp_spg_list, columns=['spg_url_key'])
        temp_spg_df['supplier_id'] = supplier_id
        temp_frames.append(temp_spg_df)
        time.sleep(0.1)
    supplier_spg_df = pd.concat(temp_frames, ignore_index=True)
    supplier_spg_df = supplier_spg_df.merge(spg_df, on='spg_url_key', how='left')
    # drop() returns a copy, so the result must be assigned back
    supplier_spg_df = supplier_spg_df.drop(columns=['spg_url_key'])
    supplier_spg_df.index += 1
    supplier_spg_df['supplier_spg_id'] = supplier_spg_df.index
    return supplier_spg_df
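# DataFrame.append was removed in pandas 2.x, which is why the loop above
# collects per-supplier frames and concatenates them once. The pattern on
# made-up data:
import pandas as pd

frames = []
for supplier_id, url_keys in [(1, ['abc', 'def']), (2, ['ghi'])]:
    temp = pd.DataFrame(url_keys, columns=['spg_url_key'])
    temp['supplier_id'] = supplier_id
    frames.append(temp)
print(pd.concat(frames, ignore_index=True))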
def get_author_papers_cluster_id(author_google_id):
    papers_cluster_ids = set()
    MAX_PAGES = 5
    for page in range(1, MAX_PAGES + 1):
        try:
            logger.debug(
                "Get author page #{} with papers from indexes [{}:{}].".format(
                    page, (page - 1) * 100 + 1, page * 100 + 1))
            url = _FULLURL.format(
                _HOST,
                _AUTHOR_PAPERS_PAGE.format(author_google_id,
                                           (page - 1) * 100 + 1))
            soup = utils.get_soup(url)
            if soup is None:
                logger.debug(
                    "Soup for author page URL='{0}' is None.".format(url))
                return None
            page_papers_counter = soup.find('span', id='gsc_a_nn')
            logger.debug("Papers counter on page: {}".format(
                "not found. It's the last page." if not page_papers_counter
                else page_papers_counter.text))
            for paper_info in soup.find_all("a", "gsc_a_ac gs_ibl"):
                href = paper_info["href"]
                if href:
                    id = re.findall(r'\d+', href.strip())
                    if id:
                        papers_cluster_ids.add(id[0])
            if not page_papers_counter or int(
                    page_papers_counter.text.split("–")[1]) < page * 100 + 1:
                break
        except KeyboardInterrupt:
            raise
        except BaseException:
            logger.warning(traceback.format_exc())
    logger.debug("Found papers with cluster id: {}".format(
        len(papers_cluster_ids)))
    return papers_cluster_ids
def get_pg_df(product_index_url=PRODUCT_INDEX_URL):
    """
    Parse PRODUCT_INDEX_URL page and build a DataFrame of product group data.
    :return: pg_df, product group DataFrame,
             has columns=['product_group', 'pg_url', 'pg_url_key', 'pg_id']
    """
    _, product_index_soup = get_soup(product_index_url)
    pg_regex = re.compile('catfiltertopitem.*')
    pg_h2 = product_index_soup.find_all('h2', attrs={'class': pg_regex})
    pg_list = []
    for h2 in pg_h2:
        anchor = h2.find('a')
        product_group = anchor.text.replace('/', '_')
        pg_url = DIGIKEY_HOME_PAGE + anchor['href']
        pg_url_key = anchor['href'].split('/')[-2]
        pg_list.append([product_group, pg_url, pg_url_key])
    pg_df = pd.DataFrame(pg_list,
                         columns=['product_group', 'pg_url', 'pg_url_key'])
    pg_df.index += 1
    pg_df['pg_id'] = pg_df.index
    return pg_df
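# The index-to-id pattern used by get_pg_df (and get_supplier_spg_df above),
# shown on toy rows: shift the RangeIndex to start at 1, then materialize it
# as an id column.
import pandas as pd

pg_df = pd.DataFrame([['Capacitors'], ['Resistors']], columns=['product_group'])
pg_df.index += 1
pg_df['pg_id'] = pg_df.index
print(pg_df)  # ids 1 and 2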
def _get_link(PAGE_NUM):
    PAGE_LINK = "https://tengrinews.kz/news/page/{}/".format(PAGE_NUM)
    soup = get_soup(PAGE_LINK)
    # attrs must be a dict; the set literal {"class", "tn-link"} matches nothing
    link_raw = soup.find_all("a", attrs={"class": "tn-link"})
    links = [i["href"] for i in link_raw]
    return links
height, width = 256, 256
N_MAX_IMAGES = 50
# root directory for the downloaded images
ROOT_DIRECTORY = '_data_pictures'
for folder_name in os.listdir('_data/ingredients'):
    print('Looking for pictures of', folder_name)
    query = '+'.join(folder_name.split())
    url = 'https://www.google.co.in/search?q=' + query + '&source=lnms&tbm=isch'
    print(url)
    soup = get_soup(url)
    # links to the large original images, plus the type of each image
    actual_images = []
    for a in soup.find_all("div", {"class": "rg_meta"}):
        meta = json.loads(a.text)
        actual_images.append((meta["ou"], meta["ity"]))
    print('There is a total of', len(actual_images), 'images')
    query_directory = os.path.join(ROOT_DIRECTORY, folder_name)
    if not os.path.exists(query_directory):
        os.makedirs(query_directory)
    for i, (img_url, img_type) in enumerate(actual_images):
        if i >= N_MAX_IMAGES:
            break
import bs4
import json
import requests
import time

from utils import (get_content, get_soup, save_json, load_json)

MANGA_SEARCH_URL = 'https://myanimelist.net/manga.php?type=1&q='

# load series information
all_series = load_json("data.json")
for series in all_series:
    # search on MyAnimeList
    query_soup = get_soup(get_content(MANGA_SEARCH_URL + series['name']))
    time.sleep(15)  # rate limiting
    table_row_tag = query_soup.find(
        'div', class_='js-categories-seasonal').tr.next_sibling
    link_tag = table_row_tag.find('a', class_='hoverinfo_trigger fw-b')
    # series name in English
    name_en = link_tag.strong.text
    print(f'{series["name"]} | {name_en}')
    # parse series page
    info_url = link_tag['href']
    info_soup = get_soup(get_content(info_url))
    time.sleep(15)  # rate limiting
    container = info_soup.find('div', class_='js-scrollfix-bottom')
def get(self, ticker_id):
    recent_8k = self.get_the_most_recent_8k(ticker_id)
    soup = get_soup(recent_8k['link'])
    info = get_meta_info_from_soup(soup)
    return info
def shonenjump():
    # create image directory
    IMG_DIR = 'img'
    create_dir(IMG_DIR)
    rensai_soup = get_soup(get_content(RENSAI_URL))
    archives_soup = get_soup(get_content(ARCHIVES_URL))
    # store series information: name, abbreviated name and whether it is still ongoing
    all_series = []
    # create icon directory
    ICONS_DIR = os.path.join(IMG_DIR, 'icons')
    create_dir(ICONS_DIR)
    for soup in [rensai_soup, archives_soup]:
        # ongoing series?
        ongoing = soup is rensai_soup
        section = soup.find('section', class_='serialSeries')
        for li in section.find_all('li'):
            # series name in Japanese
            name_jp = li.div.text if li.div else li.p.text
            name_jp = name_jp[1:name_jp.find('』')]
            link_tag = li.a
            # abbreviated name
            abbr = link_tag['href'].rsplit('/', 1)[1][:-5]
            # download icon
            img_src = link_tag.img['src']
            img_url = BASE_URL + img_src
            file_path = os.path.join(ICONS_DIR,
                                     abbr + '.' + img_src.rsplit('.', 1)[1])
            print(f'Downloading {file_path}...')
            write_image(img_url, file_path)
            # add series
            series = {'name': name_jp, 'abbr': abbr, 'ongoing': ongoing}
            all_series.append(series)
    # save series information
    save_json("data.json", all_series)
    for series in all_series:
        # create directory for this series
        series_dir = os.path.join(IMG_DIR, series['abbr'])
        create_dir(series_dir)
        current_list_url = LIST_URL + series['abbr'] + '.html'
        while current_list_url:
            list_soup = get_soup(get_content(current_list_url))
            ul = list_soup.find('ul', class_='comicsList')
            # ignore series that haven't released any volume yet
            if ul.li is None:
                break
            for dl in ul.select('li dl'):
                # skip the current volume if it isn't released yet
                if '発売予定' in str(dl.p):
                    continue
                # download cover
                img_src = dl.img['src']
                img_url = BASE_URL + img_src
                file_path = os.path.join(series_dir, img_src.rsplit('/', 1)[1])
                print(f'Downloading {file_path}...')
                write_image(img_url, file_path)
            # get url for the next list of covers
            next_list_url_tag = list_soup.find(
                'span', class_='current_page').next_sibling.next_sibling
            if next_list_url_tag is None:
                break
            current_list_url = BASE_URL + next_list_url_tag['href']
def __init__(self, cislo):
    self.cislo = cislo
    self.url = ("https://www.nrsr.sk/web/Default.aspx?sid=zakony/zakon"
                "&ZakZborID=13&CisObdobia=7&CPT={}".format(self.cislo))
    self.soup = utils.get_soup(self.url)
    self.data = {}
def __init__(self):
    """Initialize the base url and get the base soup."""
    self.base_url = "http://www.demagog.sk/politici"
    self.base_soup = utils.get_soup(self.base_url)
import datetime
import os

import pandas as pd
from sqlalchemy import create_engine

from utils import (get_date_list, get_soup, extract_soup_data,
                   process_dataframes, clean_and_format_dataframe)
from playlist_types import playlist_types

basedir = os.path.abspath(os.path.dirname(__file__))
engine = create_engine(
    os.environ.get('DATABASE_URL')
    or f'sqlite:///{os.path.join(basedir, "app.sqlite")}')

dates = get_date_list()
# query_date = '2019/08/03'  # single-date debugging override
for query_date in dates:
    soup = get_soup(query_date)
    tables_html, tables_dfs, accordions = extract_soup_data(soup)
    tables_dfs = process_dataframes(tables_html, tables_dfs, accordions)
    # make one df from all the shows
    df = pd.concat(tables_dfs, ignore_index=True)
    del df['Buy CD']
    df.rename(columns={'Unnamed: 0': 'time',
                       'Title': 'title',
                       'Composer': 'composer',
                       'Performers': 'performers',
                       'Record Co.Catalog No.': 'record_co_catalog_no'},
              inplace=True)
def __init__(self, cislo):
    self.cislo = cislo
    self.url = ("https://www.nrsr.sk/web/Default.aspx?"
                "sid=schodze/nrepdn_detail&id={}".format(self.cislo))
    self.soup = utils.get_soup(self.url)
def _cluster_handler(cluster_id, papers_count):
    logger.debug("Handle %i papers from cluster %s." %
                 (papers_count, cluster_id))
    url = _FULLURL.format(_HOST, _SCHOLARCLUSTER.format(cluster_id))
    logger.debug("Get cluster page URL='{0}'.".format(url))
    soup = utils.get_soup(url)
    # utils.soup2file(soup, "D:\\A.html")
    # This list collects info about unique papers
    EndNote_list = list()
    file_counter = 0
    merged_counter = 0

    # return True if EndNote_1 is equal to EndNote_2
    def is_EndNote_equal(EndNote_1, EndNote_2):
        return (
            EndNote_1["title"].lower() == EndNote_2["title"].lower()
            and ("year" not in EndNote_1 or "year" not in EndNote_2
                 or EndNote_1["year"] == EndNote_2["year"])
            and len(EndNote_1["author"]) == len(EndNote_2["author"])
            and EndNote_1["type"] == EndNote_2["type"]
            and ("pages" not in EndNote_1 or "pages" not in EndNote_2
                 or EndNote_1["pages"] == EndNote_2["pages"]))

    # return a list of similar papers (may be empty)
    def intersect_papers(EndNote_data, EndNote_list):
        return [i for i in EndNote_list if is_EndNote_equal(EndNote_data, i)]

    # Loop over pages
    while True:
        if soup is None:
            logger.debug(
                "Soup for cluster page URL='{0}' is None.".format(url))
            return None
        # This list contains links to EndNote and the cited-by count for each
        # paper in the cluster
        logger.debug("Find EndNote links for each paper in cluster.")
        footer_links = [{
            "EndNote" if "EndNote" in link.text else "citedby":
                link["href"].strip() if "EndNote" in link.text
                else int(re.findall(r'\d+', link.text)[0])
            for link in paper_block.find("div", class_="gs_fl").find_all('a')
            if "EndNote" in link.text or "Cited" in link.text
            or "Цитируется" in link.text
        } for paper_block in soup.find_all('div', class_='gs_ri')]
        logger.debug(
            "Extract unique papers in cluster and load data from EndNote.")
        for links in footer_links:
            if links != {}:
                file_counter += 1
                logger.debug("EndNote file #%i (total %i)" %
                             (file_counter, papers_count))
                if links.get("EndNote"):
                    paper_EndNote_data = get_info_from_EndNote(
                        links["EndNote"], True)
                else:
                    settings.print_message(
                        'Error getting EndNote files. '
                        'Please change the display settings of Google Scholar '
                        'to English (https://scholar.google.com/).')
                    logger.debug(
                        'Terminating: did not find a link to an EndNote file.')
                    raise Exception('Did not find EndNote.')
                if paper_EndNote_data is None:
                    logger.debug(
                        "Skip EndNote file #%i, could not upload file." %
                        file_counter)
                    continue
                if "year" not in paper_EndNote_data or "author" not in paper_EndNote_data:
                    logger.debug(
                        "Skip EndNote file #%i, empty year or authors fields."
                        % file_counter)
                else:
                    similar_papers = intersect_papers(paper_EndNote_data,
                                                      EndNote_list)
                    if similar_papers == []:
                        merged_counter += 1
                        logger.debug(
                            "EndNote file #%i misses all EndNote files in merged array."
                            % file_counter)
                        logger.debug("Add EndNote file #%i to merged array." %
                                     file_counter)
                        paper_EndNote_data.update({
                            "url_scholarbib": links["EndNote"],
                            "citedby": links.get("citedby")
                        })
                        EndNote_list.append(paper_EndNote_data)
                    else:
                        similar_file = similar_papers[0]
                        similar_file_index = EndNote_list.index(similar_file)
                        if len(similar_file) < len(paper_EndNote_data):
                            logger.debug(
                                "EndNote file #{0} matches EndNote file #{1} in merged array and has more fields, replace."
                                .format(file_counter, similar_file_index + 1))
                            EndNote_list[similar_file_index] = paper_EndNote_data
                        else:
                            logger.debug(
                                "EndNote file #{0} matches EndNote file #{1} in merged array, skipped."
                                .format(file_counter, similar_file_index + 1))
        # NEXT button on html page
        if soup.find(class_='gs_ico gs_ico_nav_next'):
            url = soup.find(
                class_='gs_ico gs_ico_nav_next').parent['href'].strip()
            logger.debug("Load next page in resulting query selection.")
            soup = utils.get_soup(_FULLURL.format(_HOST, url))
        else:
            break
    if merged_counter == 0:
        logger.debug(
            "All %i EndNote files in the cluster are not informative. No merged files."
            % file_counter)
    else:
        logger.debug(
            "All {0} EndNote files merged into {1} (i.e. distinct versions in cluster: {1}):"
            .format(file_counter, merged_counter))
        for counter, data in enumerate(EndNote_list):
            logger.debug("Merged EndNote file #%i:\n%s" %
                         (counter + 1, data["EndNote"]))
    return tuple(EndNote_list)
def get_soup_of_with_h_and_gtyp(h, gtyp):
    url = filter_url_base.format(h, gtyp)
    return get_soup(url)
def parse_day_time(url):
    soup = get_soup(url)
    day_time = soup.find('em', {'class': 'date'}).text
    day = day_time.split()[0].replace('.', '-')
    time = day_time.split()[1]
    return day, time
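# A quick check of the split-and-replace logic in parse_day_time; the sample
# string assumes a 'DD.MM.YYYY HH:MM' layout, which is a guess about the
# site's date markup.
day_time = '12.05.2021 18:30'
day = day_time.split()[0].replace('.', '-')  # '12-05-2021'
time_of_day = day_time.split()[1]            # '18:30'
print(day, time_of_day)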
def download_tab(self, url, path=''):
    r1 = requests.get(url, headers=self._headers)
    soup = get_soup(r1)
    text = soup.find('pre').text
    self._save_file(text, url, '.txt', path=path, encode='utf-8')
def get_first_thread_html(self):
    comments_url = self.get_first_thread_link()
    parsed_html = get_soup(comments_url)
    html_string = str(parsed_html)
    return html_string
# call progress bar
# for item in utils.progressBar(url_dict.keys(), prefix='Category', suffix='Complete', length=50):
for category in url_dict.keys():
    # read in links data for this category, if it was scraped before
    link_name_str = f'{category}links.txt'
    if link_name_str in os.listdir(f'./{category}'):
        with open(os.path.join(os.getcwd(), category, link_name_str), "r+") as file:
            scraped_link_list = file.readlines()
    else:
        scraped_link_list = []
    # get main page soup
    main_page_soup = utils.get_soup(url_dict[category])
    # get number of pages
    page_count = utils.get_page_count(main_page_soup)[1]
    # iterate through pages and scrape links
    links = []
    for page in range(1, page_count):
        # get links from page
        print(url_dict[category] + "?page-" + str(page))
        soup = utils.get_soup(url_dict[category], "?page-" + str(page))
        links = links + utils.get_page_links(soup)
def get_first_thread_link(self):
    parsed_html = get_soup(self.url)
    comments_element = parsed_html.find('a', {"class": "comments"})
    comments_url = comments_element['href']
    return comments_url
def _ident_and_fill_paper(soup, params):
    """Return paper info."""
    pagenum = 1
    papers_count = 0
    qtext = requests.utils.quote(
        stopwords.delete_stopwords(params["title"], " and "))
    while True:
        logger.debug("Find papers on page #%i (max_researchgate_papers=%i)" %
                     (pagenum, params["max_researchgate_papers"]))
        if soup.find('div', class_='search-noresults-headline') is not None:
            logger.debug("This paper was not found on researchgate.")
            return None
        logger.debug("Parse html and get info about papers.")
        papers_box = soup.find_all('div', 'publication-item')
        logger.debug("On resulting page #%i found %i papers." %
                     (pagenum, len(papers_box)))
        on_page_paper_count = 0
        for papers_item in papers_box:
            if papers_count > params["max_researchgate_papers"]:
                logger.debug("This paper was not found on researchgate.")
                return None
            try:
                on_page_paper_count += 1
                papers_count += 1
                # Get info about the paper
                authors = len(papers_item.find_all("span", itemprop="name"))
                year = int(
                    papers_item.find('div', class_='publication-metadata').find(
                        'span').text.split()[1])
                title = papers_item.find(
                    "a", class_="publication-title").text.strip().lower()
                pub_type = papers_item.find(
                    'div', class_='publication-type').text.strip().lower()
                logger.debug(
                    "Process paper #%i (title='%s'; year=%i; auth_count=%i; type='%s')"
                    % (papers_count, title, year, authors, pub_type))
                logger.debug("Title and year check.")
                # First comparison
                if params["year"] != year:
                    logger.debug(
                        "Year of paper #%i does not match the year of the required paper, skipped."
                        % on_page_paper_count)
                elif params["title"] != title:
                    logger.debug(
                        "Title of paper #%i does not match the title of the required paper, skipped."
                        % on_page_paper_count)
                # Second comparison
                else:
                    logger.debug(
                        "The title and year of the paper match; identifying information from the RIS.")
                    timeout = random.uniform(0, 3)
                    logger.debug("Sleep {0} seconds.".format(timeout))
                    time.sleep(timeout)
                    paper_url = _FULLURL.format(
                        _HOST,
                        papers_item.find("a", class_="publication-title")["href"])
                    logger.debug("Process RIS for paper #%i." %
                                 on_page_paper_count)
                    rg_paper_id = get_rg_paper_id_from_url(paper_url)
                    info = get_info_from_RIS(rg_paper_id)
                    if params["authors_count"] != len(info['authors']):
                        logger.debug(
                            "Author count of paper #%i does not match the author count of the required paper, skipped."
                            % on_page_paper_count)
                    elif 'start_page' in info and params["spage"] is not None \
                            and str(params["spage"]) != info['start_page']:
                        logger.debug(
                            "Start page of paper #%i does not match the start page of the required paper, skipped."
                            % on_page_paper_count)
                    elif 'end_page' in info and params["epage"] is not None \
                            and str(params["epage"]) != info['end_page']:
                        logger.debug(
                            "End page of paper #%i does not match the end page of the required paper, skipped."
                            % on_page_paper_count)
                    else:
                        logger.debug(
                            "Paper #%i was identified with EndNote file #%i."
                            % (on_page_paper_count, params["paper_version"]))
                        logger.debug("EndNote file #%i:\n%s" %
                                     (params["paper_version"], params["EndNote"]))
                        logger.debug("RIS file:\n%s" % info["RIS"])
                        info = get_paper_info_from_dataRIS(info, rg_paper_id)
                        info.update({
                            "rg_type": pub_type,
                            "url": paper_url,
                        })
                        # disabled: fetch author details via get_authors()
                        # and get_auth_info() and attach them to info
                        return info
            except Exception:
                logger.warning(traceback.format_exc())
        if len(papers_box) >= 10:
            pagenum += 1
            logger.debug("Load next page in resulting query selection.")
            # Delay one or two seconds to avoid 429 errors.
            timeout = random.uniform(1, 2)
            logger.debug("Sleep {0} seconds.".format(timeout))
            time.sleep(timeout)
            url = _PUBSEARCH.format(qtext, pagenum)
            soup = utils.get_soup(_FULLURL.format(_HOST, url),
                                  _PROXY_OBJ.get_cur_proxy())
        else:
            logger.debug("This paper was not found on researchgate.")
            return None
LISTING_IDS = get_existing_ids(EXISTING_DATA)
CURRENT_IDS = LISTING_IDS.copy() if skip_ids else []
REMOVED_IDS = []
# arrays of objects for the notification mail
MAIL_PRICE_CHANGES = []
MAIL_NEW_LISTINGS = []
MAIL_REMOVED_LISTINGS = []
while True:
    logger.info(f'--------------------------{CURRENT_PAGE}--------------------------')
    if TOTAL_PROFILES_COUNT and (CURRENT_PROFILES_COUNT >= TOTAL_PROFILES_COUNT):
        break
    soup = get_soup(SEARCH_URL.format(CURRENT_PAGE))
    if soup:
        if not TOTAL_PROFILES_COUNT:
            TOTAL_PROFILES_COUNT = get_total_profiles(soup)
        profiles = get_profiles(soup)
        if profiles:
            CURRENT_PROFILES_COUNT = CURRENT_PROFILES_COUNT + len(profiles)
            for url in profiles:
                _id = get_id(url)
                if _id in CURRENT_IDS:
                    continue
                time_gap()
                p_soup, profile = get_profile_data(url)
def get_community_by_region(city='bj', region_name='chaoyang', output_dir=None):
    """Get community by region.

    Args:
        city: str, city.
        region_name: str, region name.
        output_dir: str, directory for per-community JSON dumps.
    """
    url = 'https://{}.ke.com/xiaoqu/{}/'.format(city, region_name)
    soup = get_soup(url)
    if not soup:
        return
    total_pages = get_total_pages(url)
    if not total_pages:
        logging.error('Finish at %s', model.Community.select().count())
        return
    # page 1 is the base url itself; subsequent pages live under /pgN/
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = '{}pg{}/'.format(url, page)
            soup = get_soup(url_page)
            if not soup:
                return
        name_list = soup.find_all('li', {'class': 'clear'})
        logging.info('%s %d / %d', region_name, page, total_pages)
        for name in name_list:
            info = dict()
            try:
                item = name.find('div', {'class': 'title'})
                title = item.get_text().strip('\n')
                logging.info('%s', title)
                link = item.a.get('href')
                info['title'] = title
                link = get_mobile_link(link, city)
                info['link'] = link
                info['community_id'] = name.get('data-housecode')
                item = name.find('a', {'class': 'district'})
                if item:
                    info['district'] = item.get_text()
                item = name.find('a', {'class': 'bizcircle'})
                if item:
                    info['bizcircle'] = item.get_text()
                item = name.find('div', {'class': 'tagList'})
                if item:
                    info['tags'] = item.get_text().strip('\n')
                item = name.find('a', {'class': 'totalSellCount'})
                if item:
                    info['onsale'] = item.span.get_text().strip('\n')
                item = name.find('a', {'title': title + u'租房'})
                if item:
                    info['onrent'] = item.get_text().strip('\n').split(u'套')[0]
                item = name.find('div', {'class': 'totalPrice'})
                if item:
                    info['average_unit_price'] = item.span.get_text().strip('\n')
                output_path = os.path.join(output_dir, u'{}.json'.format(title))
                community_info = get_community_info_by_url(link, output_path)
                for key, value in community_info.items():
                    info[key] = value
            except Exception as exception:
                logging.error(exception)
                logging.error(traceback.format_exc())
                continue
            model.Community.replace(**info).execute()
            time.sleep(1)
def get_community_info_by_url(url, output_path=None):
    """Get community info by url.

    Args:
        url: str, url.

    Returns:
        dict, results.
    """
    results = dict()
    soup = get_soup(url)
    if not soup:
        logging.warning('Cannot parse %s', url)
        return results
    pattern = re.compile('window.__PRELOADED_STATE__ = (.*);',
                         re.MULTILINE | re.DOTALL)
    items = soup.find('script', {'type': 'text/javascript'}, text=pattern)
    if items:
        items = pattern.search(items.text).group(1)
        items = json.loads(items)
        if output_path:
            json.dump(items,
                      codecs.open(output_path, 'w', 'utf-8'),
                      indent=4,
                      sort_keys=True,
                      ensure_ascii=False)
        items = items['xiaoquDetail']['survey']
        if isinstance(items, dict):
            items = items.values()
        for item in items:
            name = item['name']
            name = model.Community.NAME_DICT[name]
            value = str(item['value'])
            results[name] = value
    # gonglueV2.html
    url += 'gonglueV2.html'
    soup = get_soup(url)
    if not soup:
        logging.warning('Cannot parse %s', url)
        return results
    items = soup.find_all('span', {'class': 'txt_gray'})
    if items:
        for item in items:
            name = item.get_text().strip(u':')
            name = model.Community.NAME_DICT[name]
            if name in results:
                continue
            value = str(item.next_sibling)
            results[name] = value
    # intro
    item = soup.find('div', {'class': 'cpt_content_section'})
    if item:
        results['intro'] = item.get_text().strip('\n')
    # score
    item = soup.find('div', {'class': 'review_score'})
    if item:
        value = float(item.next_element)
        results['score'] = value
    item = soup.find('ul', {'class': 'review_list'})
    if item:
        items = item.find_all('li')
        if items:
            for item in items:
                name = item.get_text().replace('\n', '').split()
                value = float(name[1].strip(u'分'))
                name = model.Community.NAME_DICT[name[0]]
                if name in results:
                    continue
                results[name] = value
    # review
    item = soup.find('div', {'id': 'review_good'})
    if item:
        value = item.get_text().strip('\n').strip(u'小区优点').strip('\n')
        results['good_point'] = value
    item = soup.find('div', {'id': 'review_bad'})
    if item:
        value = item.get_text().strip('\n').strip(u'小区弱点').strip('\n')
        results['bad_point'] = value
    # sheshi_cell
    items = soup.find_all('div', {'class': 'sheshi_cell'})
    if items:
        for item in items:
            name = item.p.get_text()
            name = model.Community.NAME_DICT[name]
            value = item.img['src'].split('/')[-1].split('.')[0].split('_')[-1]
            value = value[0] != 'n'
            results[name] = value
    return results
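# The window.__PRELOADED_STATE__ extraction used above, demonstrated on a
# minimal inline <script> payload; the HTML here is made up.
import json
import re
from bs4 import BeautifulSoup

html = ('<script type="text/javascript">window.__PRELOADED_STATE__ = '
        '{"xiaoquDetail": {"survey": [{"name": "n", "value": "v"}]}};</script>')
pattern = re.compile('window.__PRELOADED_STATE__ = (.*);',
                     re.MULTILINE | re.DOTALL)
tag = BeautifulSoup(html, 'html.parser').find(
    'script', {'type': 'text/javascript'}, text=pattern)
state = json.loads(pattern.search(tag.text).group(1))
print(state['xiaoquDetail']['survey'])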