def _get_hashes(self, video_id, resource_ids, auth_retry=False,
                player_retry=False):
    player_version = self.plugin.get_setting('player_version')
    video_data = self._get_video_info(video_id)
    provider = ('globo' if video_data['channel_id'] == 196 else
                self.plugin.get_setting('play_provider')
                .lower().replace(' ', '_'))
    credentials = self.authenticate(provider)
    args = (video_id, '|'.join(resource_ids), player_version)
    data = scraper.get_page(HASH_URL % args, cookies=credentials)
    self.plugin.log.debug('hash requested: %s' % (HASH_URL % args))
    self.plugin.log.debug('resource ids: %s' % '|'.join(resource_ids))
    self.plugin.log.debug('return: %s'
                          % repr(data).encode('ascii', 'replace'))
    try:
        return (data['hash'], data)
    except ValueError:
        msg = 'JSON not returned. Message returned:\n%s' % data
        self.plugin.log.error(msg)
        raise
    except KeyError:
        args = (data['http_status_code'], data['message'])
        self.plugin.log.error('request error: [%s] %s' % args)
        if data['message'] == 'Player not recognized':
            # A 'Player not recognized' message means the player version
            # is either not yet set or outdated. In either case, reset the
            # player version and retry the hash computation once.
            self.plugin.log.debug('reset player version')
            if not player_retry:
                player_version = scraper.get_player_version()
                self.plugin.set_setting('player_version', player_version)
                self.plugin.log.debug('retrying with new player version %s'
                                      % player_version)
                return self._get_hashes(video_id, resource_ids,
                                        auth_retry, True)
        if str(args[0]) == '403' and any(credentials.values()):
            # A 403 (authentication needed) despite stored credentials may
            # be due to session expiration, so clear the credentials and
            # retry the authentication once.
            self.plugin.log.debug('cleaning credentials')
            credentials_key = ('%s_credentials'
                               % ('globo' if provider == 'globo' else 'play'))
            self.plugin.set_setting(credentials_key, '')
            if not auth_retry:
                self.plugin.log.debug('retrying authentication')
                return self._get_hashes(video_id, resource_ids,
                                        True, player_retry)
        raise Exception(data['message'])
def _get_video_info(self, video_id):
    # Get video info.
    data = scraper.get_page(INFO_URL % video_id)['videos'][0]
    if 'date' not in data:
        # The original date is not part of INFO_URL's metadata response.
        data['date'] = util.time_format()
    if 'duration' not in data:
        # Durations are reported in milliseconds; sum over the children,
        # or over the video itself when it has none, and convert to seconds.
        data['duration'] = sum(x['resources'][0]['duration'] / 1000
                               for x in data.get('children') or [data])
    return data
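# A minimal illustration (not from the original source) of the duration
# fallback in _get_video_info: durations arrive in milliseconds, and when
# 'children' is absent or empty the video's own resource is used instead.
# Both payloads below are made up.
_with_children = {'children': [{'resources': [{'duration': 90000}]},
                               {'resources': [{'duration': 30000}]}]}
_single = {'resources': [{'duration': 45000}]}

assert sum(x['resources'][0]['duration'] / 1000
           for x in _with_children.get('children') or [_with_children]) == 120.0
assert sum(x['resources'][0]['duration'] / 1000
           for x in _single.get('children') or [_single]) == 45.0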
def build_vocab(self):
    for i, app in enumerate(self.apps):
        doc, name = filter_out_description(get_page(app))
        terms = self.process_document(doc)
        progressbar((i + 1) / len(self.apps), name)
        self.documents[name] = terms
        term_set = set(terms)
        for term in term_set:
            # Inverted index: map each term to the documents that contain it.
            self.vocab[term] = self.vocab.get(term, []) + [name]
    # Map each term in the vocabulary to an index.
    self.idx_map = {term: idx for idx, term in enumerate(self.vocab.keys())}
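# A hedged usage sketch (not from the original source): after build_vocab
# runs, self.vocab is an inverted index (term -> list of document names)
# and self.idx_map gives each term a stable row index, so a binary
# term-document incidence matrix can be assembled as below. 'indexer' is
# a hypothetical instance of the surrounding class.
def incidence_matrix(indexer):
    doc_names = sorted(indexer.documents)
    col = {name: j for j, name in enumerate(doc_names)}
    matrix = [[0] * len(doc_names) for _ in indexer.idx_map]
    for term, docs in indexer.vocab.items():
        for name in docs:
            matrix[indexer.idx_map[term]][col[name]] = 1
    return doc_names, matrix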
def send_nike_email():
    # For Nike we can hit the API and parse the JSON response directly,
    # so there is no need for BeautifulSoup.
    NIKE_SEARCH_ROUTE = 'https://busca.nike.com.br/busca?q=air%20force&origin=autocomplete&common_filter%5B372%5D=3257&sort=5&ajaxSearch=1'
    # Fetch all the data.
    nike_json_response = json.loads(get_page(NIKE_SEARCH_ROUTE))
    # Split the data to extract what we want.
    nike_qty_sneaker = nike_json_response["totalProducts"]["totalResults"]
    nike_sneakers = nike_json_response["productsInfo"]["products"]
    # Fill the email template and send it.
    nike_email_body = fill_email_template(nike_sneakers)
    send_mail(nike_email_body, nike_qty_sneaker, 1)
def send_authenticFeet_email():
    # AuthenticFeet URL.
    AF_URL = 'https://www.authenticfeet.com.br/masculino/tenis/41/air%20force?PS=24&map=c,c,specificationFilter_5,ft'
    # Fetch the whole AuthenticFeet page; it is parsed with BeautifulSoup.
    af_page = get_page(AF_URL)
    # Split the data to extract what we want.
    af_qty_sneaker = af_get_qty_seach(af_page)
    af_sneakers = af_get_sneakers_data(af_page)
    # Fill the email template and send it.
    af_email_body = fill_email_template(af_sneakers)
    send_mail(af_email_body, af_qty_sneaker, 0)
def process_page(type, category, country, day, phone):
    page_info = get_page(type, country, category, day)
    c_name = get_category_code(category, type)
    filename = f'db/{type}/{country}.json'
    create_file(filename)
    with open(filename, 'r+') as file_json:
        try:
            info = json.load(file_json)
            if c_name not in info:
                info[c_name] = {}
        except json.JSONDecodeError:
            # The file is empty or holds invalid JSON; start from scratch.
            info = {c_name: {}}
    info[c_name][day] = page_info
    with open(filename, 'w+') as file_json:
        json.dump(info, file_json)
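# A hedged usage sketch (not from the original source): calling
# process_page twice for the same category merges both days into one
# file, db/apps/us.json, keyed first by the code get_category_code
# returns and then by day. 'apps', 'free', 'us', the dates, and the
# None phone are made-up example arguments.
process_page('apps', 'free', 'us', '2021-01-01', phone=None)
process_page('apps', 'free', 'us', '2021-01-02', phone=None)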
def scrape(h_tag, link_tag, text_tag, target_url):
    """Scrape a page and store the headings, texts, and links it contains."""
    url = target_url
    target = scraper.create_target(target_url)
    page = scraper.get_page(target)
    soup = scraper.get_soup(page)
    headings = scraper.get_heading(soup, h_tag)
    texts = scraper.get_texts(soup, text_tag)
    db_functions.create_scrape(url, h_tag, text_tag, link_tag)
    for heading in headings:
        db_functions.create_result(url, h_tag, heading, '', link_tag)
    for text in texts:
        db_functions.create_result(url, text_tag, '', text, link_tag)
    links = scraper.get_links(soup, link_tag)
    for link in links:
        db_functions.create_result(url, link_tag, '', '', link)
    db_functions.print_scrape(url)
    db_functions.print_records(url)
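# A hedged usage example (not from the original source); the tags and
# URL are made up. Note the argument order: heading tag, link tag,
# text tag, then the target URL.
scrape('h2', 'a', 'p', 'https://example.com')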
from scraper import get_page

get_page("kek")
get_page("kek")
def home():
    form = SearchForm()
    if form.validate_on_submit():
        get_page(form.url.data)
        print(form.url.data)
    return render_template("home.html", form=form)