示例#1
0
    def _get_hashes(self,
                    video_id,
                    resource_ids,
                    auth_retry=False,
                    player_retry=False):
        """Request the playback hash for the given video resources.

        The video metadata is looked up to choose the authentication
        provider, credentials are obtained through ``self.authenticate`` and
        ``HASH_URL`` is queried. When the response carries no 'hash', the
        request is retried at most once per failure kind (outdated player
        version, rejected credentials).

        Args:
            video_id: id interpolated into ``HASH_URL`` and passed to
                ``self._get_video_info``.
            resource_ids: resource id strings, joined with '|' for the
                request. It is joined twice (request and log), so it must be
                re-iterable.
            auth_retry: True when this call is already the retry issued
                after clearing the stored credentials.
            player_retry: True when this call is already the retry issued
                after refreshing the player version.

        Returns:
            A ``(hash, data)`` tuple, ``data`` being the whole response.

        Raises:
            Exception: the response has no 'hash' and no retry applies. The
                text is the response's 'message' field when it has one.
        """
        playerVersion = self.plugin.get_setting('player_version')

        video_data = self._get_video_info(video_id)
        # Channel 196 always authenticates as 'globo'; every other channel
        # uses the provider configured in the 'play_provider' setting.
        provider = ('globo' if video_data['channel_id'] == 196 else
                    self.plugin.get_setting('play_provider').lower().replace(
                        ' ', '_'))
        credentials = self.authenticate(provider)

        args = (video_id, '|'.join(resource_ids), playerVersion)
        data = scraper.get_page(HASH_URL % args, cookies=credentials)

        self.plugin.log.debug('hash requested: %s' % (HASH_URL % args))
        self.plugin.log.debug('resource ids: %s' % '|'.join(resource_ids))
        self.plugin.log.debug('return: %s' %
                              repr(data).encode('ascii', 'replace'))
        try:
            return (data['hash'], data)
        except ValueError:
            # NOTE(review): a plain dict lookup cannot raise ValueError, so
            # this branch presumably was meant to guard the get_page call
            # above (non-JSON body) - confirm before relying on it.
            msg = 'JSON not returned. Message returned:\n%s' % data
            self.plugin.log.error(msg)
            raise
        except KeyError:
            # Read the error fields defensively: a response lacking them
            # must not raise a second KeyError that hides the real failure.
            status = data.get('http_status_code')
            message = data.get('message')
            self.plugin.log.error('request error: [%s] %s' %
                                  (status, message))

            if message == 'Player not recognized':
                # If a 'Player not recognized' message is received, it is
                # either because the player version is not yet set, or it's
                # outdated. In either case, player version is reset and hash
                # computation retried once
                self.plugin.log.debug('reset player version')
                if not player_retry:
                    playerVersion = scraper.get_player_version()
                    self.plugin.set_setting('player_version', playerVersion)
                    self.plugin.log.debug(
                        'retrying with new player version %s' % playerVersion)
                    return self._get_hashes(video_id, resource_ids, auth_retry,
                                            True)

            if str(status) == '403' and any(credentials.values()):
                # If a 403 is returned (authentication needed) and there are
                # stored credentials, this might be due to session expiration,
                # so the credentials are cleared and the request retried once
                self.plugin.log.debug('cleaning credentials')
                credentials_key = '%s_credentials' % ('globo' if 'globo'
                                                      == provider else 'play')
                self.plugin.set_setting(credentials_key, '')
                if not auth_retry:
                    self.plugin.log.debug('retrying authentication')
                    return self._get_hashes(video_id, resource_ids, True,
                                            player_retry)

            if message is None:
                # Keep the raised error meaningful when no text was supplied.
                message = 'hash not found in response'
            raise Exception(message)
示例#2
0
 def _get_video_info(self, video_id):
     """Return the metadata of the first video listed for ``video_id``.

     The page at ``INFO_URL % video_id`` is fetched through
     ``scraper.get_page`` and the first entry of its 'videos' list is
     returned, after filling in the 'date' and 'duration' keys when the
     response does not provide them.
     """
     response = scraper.get_page(INFO_URL % video_id)
     info = response['videos'][0]

     if 'date' not in info:
         # The metadata response carries no original date, so one is
         # generated. NOTE(review): presumably the current time - confirm
         # against util.time_format.
         info['date'] = util.time_format()

     if 'duration' not in info:
         # Add up the first resource of every child, or of the video itself
         # when it has no children. NOTE(review): the /1000 presumably
         # converts milliseconds to seconds - confirm.
         parts = info.get('children') or [info]
         total = 0
         for part in parts:
             total += part['resources'][0]['duration']/1000
         info['duration'] = total

     return info
示例#3
0
 def _get_video_info(self, video_id):
     """Fetch the info record for ``video_id`` and complete missing keys.

     Returns the first item of the 'videos' list found in the page that
     ``scraper.get_page`` returns for ``INFO_URL % video_id``. A missing
     'date' is set from ``util.time_format()`` and a missing 'duration' is
     computed from the resources of the record or of its children.
     """
     video = scraper.get_page(INFO_URL % video_id)['videos'][0]

     date_missing = 'date' not in video
     if date_missing:
         # The info endpoint does not report the original date.
         video['date'] = util.time_format()

     duration_missing = 'duration' not in video
     if duration_missing:
         # Fall back to the record itself when 'children' is absent or empty.
         sources = video.get('children') or [video]
         durations = [src['resources'][0]['duration']/1000
                      for src in sources]
         video['duration'] = sum(durations)

     return video
示例#4
0
 def build_vocab(self):
     """Populate ``self.documents``, ``self.vocab`` and ``self.idx_map``.

     For every entry of ``self.apps`` the page is fetched with ``get_page``,
     split by ``filter_out_description`` into a document and a name, and
     turned into terms by ``self.process_document``. ``self.documents`` maps
     each name to its terms, and ``self.vocab`` maps each distinct term to
     the list of names whose document contains it. Finally ``self.idx_map``
     gives every vocabulary term an index, in vocabulary iteration order.
     """
     for i, app in enumerate(self.apps):
         doc, name = filter_out_description(get_page(app))
         terms = self.process_document(doc)
         progressbar((i+1)/len(self.apps), name)
         self.documents[name] = terms
         # De-duplicate so each name is listed once per term.
         for term in set(terms):
             # Append in place; rebuilding the list with ``+`` copied the
             # whole posting list on every insertion.
             self.vocab.setdefault(term, []).append(name)

     # Map each term in vocabulary to an index
     self.idx_map = {term: idx for idx, term in enumerate(self.vocab)}
示例#5
0
def send_nike_email():
    """Query the Nike search endpoint and e-mail the products it returns.

    The response body obtained through ``get_page`` is parsed with
    ``json.loads``, so no HTML parsing is involved for this store.
    """
    search_url = 'https://busca.nike.com.br/busca?q=air%20force&origin=autocomplete&common_filter%5B372%5D=3257&sort=5&ajaxSearch=1'
    payload = json.loads(get_page(search_url))

    # Pick the result count and the product list out of the response.
    total_results = payload["totalProducts"]["totalResults"]
    products = payload["productsInfo"]["products"]

    # Render the products into the e-mail body and send it.
    body = fill_email_template(products)
    send_mail(body, total_results, 1)
示例#6
0
def send_authenticFeet_email():
    """Fetch the AuthenticFeet listing page and e-mail the sneakers found.

    The page returned by ``get_page`` is handed to ``af_get_qty_seach`` for
    the result count and to ``af_get_sneakers_data`` for the sneaker list.
    """
    listing_url = 'https://www.authenticfeet.com.br/masculino/tenis/41/air%20force?PS=24&map=c,c,specificationFilter_5,ft'
    page = get_page(listing_url)

    # Extract the result count and the sneaker entries from the page.
    result_count = af_get_qty_seach(page)
    sneakers = af_get_sneakers_data(page)

    # Render the sneakers into the e-mail body and send it.
    body = fill_email_template(sneakers)
    send_mail(body, result_count, 0)
示例#7
0
    def _get_hashes(self, video_id, resource_ids, auth_retry=False, player_retry=False):
        """Fetch the playback hash for a video's resources, retrying once per known failure.

        Picks the authentication provider from the video metadata, authenticates
        through ``self.authenticate`` and queries ``HASH_URL``. If the response has
        no 'hash', a stale player version or rejected credentials each trigger a
        single retry; any other failure is raised.

        Args:
            video_id: id interpolated into ``HASH_URL`` and given to ``self._get_video_info``.
            resource_ids: resource id strings, joined with '|'. Joined twice
                (request and log), so it must be re-iterable.
            auth_retry: True when this call is the retry made after clearing credentials.
            player_retry: True when this call is the retry made after refreshing
                the player version.

        Returns:
            ``(hash, data)`` where ``data`` is the complete response.

        Raises:
            Exception: no 'hash' in the response and no retry left; carries the
                response's 'message' field when one is present.
        """
        playerVersion = self.plugin.get_setting('player_version')

        video_data = self._get_video_info(video_id)
        # Channel 196 is always served by the 'globo' provider; other channels use
        # the provider stored in the 'play_provider' setting.
        provider = ('globo' if video_data['channel_id'] == 196
                    else self.plugin.get_setting('play_provider').lower().replace(' ', '_'))
        credentials = self.authenticate(provider)

        args = (video_id, '|'.join(resource_ids), playerVersion)
        data = scraper.get_page(HASH_URL % args, cookies=credentials)

        self.plugin.log.debug('hash requested: %s' % (HASH_URL % args))
        self.plugin.log.debug('resource ids: %s' % '|'.join(resource_ids))
        self.plugin.log.debug('return: %s' % repr(data).encode('ascii', 'replace'))
        try:
            return (data['hash'], data)
        except ValueError:
            # NOTE(review): indexing a plain dict never raises ValueError; this branch
            # presumably belongs around the get_page call (non-JSON body) - confirm.
            msg = 'JSON not returned. Message returned:\n%s' % data
            self.plugin.log.error(msg)
            raise
        except KeyError:
            # Use .get so an error response without these fields cannot raise a
            # second KeyError that would mask the original failure.
            status = data.get('http_status_code')
            message = data.get('message')
            self.plugin.log.error('request error: [%s] %s' % (status, message))

            if message == 'Player not recognized':
                # If a 'Player not recognized' message is received, it is
                # either because the player version is not yet set, or it's
                # outdated. In either case, player version is reset and hash
                # computation retried once
                self.plugin.log.debug('reset player version')
                if not player_retry:
                    playerVersion = scraper.get_player_version()
                    self.plugin.set_setting('player_version', playerVersion)
                    self.plugin.log.debug('retrying with new player version %s' % playerVersion)
                    return self._get_hashes(video_id, resource_ids, auth_retry, True)

            if str(status) == '403' and any(credentials.values()):
                # If a 403 is returned (authentication needed) and there are
                # stored credentials, this might be due to session expiration,
                # so the credentials are cleared and the request retried once
                self.plugin.log.debug('cleaning credentials')
                credentials_key = '%s_credentials' % ('globo' if 'globo' == provider else 'play')
                self.plugin.set_setting(credentials_key, '')
                if not auth_retry:
                    self.plugin.log.debug('retrying authentication')
                    return self._get_hashes(video_id, resource_ids, True, player_retry)

            if message is None:
                # Give the raised error a usable text when the response had none.
                message = 'hash not found in response'
            raise Exception(message)
def process_page(type, category, country, day, phone):
    """Fetch one page and store it in the per-country JSON file.

    The result of ``get_page`` is saved in ``db/{type}/{country}.json`` under
    the key returned by ``get_category_code`` and then under ``day``. The
    file's current content is loaded first so existing entries are kept; a
    file that does not contain valid JSON is replaced by a fresh structure.
    ``phone`` is accepted but not used here.
    """
    scraped = get_page(type, country, category, day)
    category_key = get_category_code(category, type)
    db_path = f'db/{type}/{country}.json'
    create_file(db_path)

    with open(db_path, 'r+') as handle:
        try:
            stored = json.load(handle)
        except json.JSONDecodeError:
            # Empty or corrupt file: start over with only this category.
            stored = {category_key: {}}
        else:
            print('-------------------------------' + str(stored) +
                  '-------------------------------')
            if category_key not in stored:
                stored[category_key] = {}
        stored[category_key][day] = scraped

    # Reopen for writing so the previous content is fully replaced.
    with open(db_path, 'w+') as handle:
        json.dump(stored, handle)
示例#9
0
def scrape(h_tag, link_tag, text_tag, target_url):
    """Scrape ``target_url`` and record its headings, texts and links.

    The page is fetched and parsed through the ``scraper`` helpers. One
    scrape entry is created with ``db_functions.create_scrape``, followed by
    one result row per heading, per text and per link. The stored scrape and
    its records are printed at the end.
    """
    target = scraper.create_target(target_url)
    soup = scraper.get_soup(scraper.get_page(target))
    headings = scraper.get_heading(soup, h_tag)
    texts = scraper.get_texts(soup, text_tag)

    db_functions.create_scrape(target_url, h_tag, text_tag, link_tag)

    for heading in headings:
        db_functions.create_result(target_url, h_tag, heading, '', link_tag)
    for text in texts:
        db_functions.create_result(target_url, text_tag, '', text, link_tag)
    for link in scraper.get_links(soup, link_tag):
        db_functions.create_result(target_url, link_tag, '', '', link)

    db_functions.print_scrape(target_url)
    db_functions.print_records(target_url)
示例#10
0
文件: test.py 项目: Ekimerton/chegg
from scraper import get_page

# Call get_page twice with the same argument.
# NOTE(review): presumably a manual check of repeat-call behaviour - confirm.
for _ in range(2):
    get_page("kek")
示例#11
0
文件: routes.py 项目: Ekimerton/chegg
def home():
    """Render the home page and process a submitted search form.

    When the form validates on submit, its ``url`` field is passed to
    ``get_page`` and printed. The "home.html" template is rendered with the
    form in every case.
    """
    form = SearchForm()
    submitted = form.validate_on_submit()
    if submitted:
        url = form.url.data
        get_page(url)
        print(url)
    return render_template("home.html", form=form)