Example #1
def scrape_page(url):
    '''
    NOTE: this throws away any links that can't be addons (ie: assumes we're not going any deeper)
    '''
    resp = None
    links = set()
    if url.endswith(('.jpg', '.png', '.gif', '.rar')):
        return set()
    head = time_wrapper(requests.head, (url, ), t=3)
    if head:
        try:
            cl = int(head.headers['Content-Length'])
        except:
            cl = -1
        if cl < 1000000:
            resp = time_wrapper(requests.get, (url, ), t=3)
    if not resp:
        return set()
    netloc = urlparse(url).netloc.split(':')[0]
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    for link in soup.find_all('a', href=True):
        if ".zip" in link['href'] or 'github' in link['href']:
            href = link['href']
            if not href.startswith('http'):
                href = 'http://' + netloc + '/' + href
            if can_be_repo(href):
                links.add(href)
    return links
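A note on Example #1: it leans on a time_wrapper helper that is not shown. Judging from the call sites, it calls the given requests function with a time limit and returns None on failure. A minimal hypothetical sketch (not the original helper):

import requests

def time_wrapper(func, args, t=3):
    # Hypothetical stand-in for the helper assumed above: call func(*args)
    # with a t-second limit and swallow network errors; here the limit is
    # simply forwarded as the requests timeout keyword.
    try:
        return func(*args, timeout=t)
    except requests.RequestException:
        return None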
Example #2
def get_all_uic_links_from_url(base_url, h=None):
    resp = requests.get(base_url, headers=headers)
    base_url = resp.url
    if is_url_end_point(base_url):
        return [], ""
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding

    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    uic_link_list = []
    for link in soup.find_all('a', href=True):
        if is_url_end_point(link['href']):
            continue
        target_url = ''
        o = urlparse(link['href'])
        if "uic.edu" in o.netloc:
            target_url = link['href'].rstrip('/')
        elif not is_absolute(link['href']):
            target_url = (urllib.parse.urljoin(base_url,
                                               link['href'])).rstrip('/')
        target_url = target_url.replace("http:", "https:")

        if target_url != '':
            uic_link_list.append(target_url)
    return list(set(uic_link_list)), (h.handle(resp.text) if h else "")
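Example #2 relies on two helpers that are not shown. is_absolute is the usual urlparse idiom; is_url_end_point presumably filters out direct file downloads. Minimal hypothetical sketches, assuming that intent:

from urllib.parse import urlparse

def is_absolute(url):
    # An absolute URL carries a network location.
    return bool(urlparse(url).netloc)

def is_url_end_point(url):
    # Hypothetical filter: treat direct file downloads as end points that
    # should not be crawled further.
    return url.lower().endswith(('.pdf', '.zip', '.jpg', '.png', '.gif'))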
Example #3
    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
         (markup, encoding, declared encoding,
          has undergone character replacement)
        Each 4-tuple represents a strategy for parsing the document.
        """
        # Instead of using UnicodeDammit to convert the bytestring to
        # Unicode using different encodings, use EncodingDetector to
        # iterate over the encodings, and tell lxml to try to parse
        # the document as each one in turn.
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(
            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)
Example #4
def prepare_complete_links(url):

    http_regex = re.compile(r'http')
    page = requests.get(url)
    http_encoding = page.encoding if 'charset' in page.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(page.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(page.content, 'html.parser', from_encoding=encoding)
    complete_links = []
    for alink in soup.find_all('a', href=True):
        if http_regex.search(alink['href']) is not None:
            complete_links.append(alink['href'])
            print(
                http_regex.search(alink['href']).group() + "---" +
                alink['href'])
        elif 'javascript' not in alink['href'] and len(
                alink['href'].strip()) > 0:
            if alink['href'][:1] == '/':
                temp_link = TWM_DOMAIN + alink['href']
                complete_links.append(temp_link)
                print("need http" + "---" + alink['href'])
            else:
                temp_link = TWM_DOMAIN + "/" + alink['href']
                complete_links.append(temp_link)

    return list(set(complete_links))
Example #5
    def from_warc(warc_record, decode_errors="replace"):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the article
        extractor.
        :return:
        """
        raw_stream = warc_record.raw_stream.read()
        encoding = None
        try:
            encoding = warc_record.http_headers.get_header('Content-Type').split(';')[1].split('=')[1]
        except:
            pass
        if not encoding:
            encoding = EncodingDetector.find_declared_encoding(raw_stream, is_html=True)
        if not encoding:
            # assume utf-8
            encoding = 'utf-8'

        try:
            html = raw_stream.decode(encoding, errors=decode_errors)
        except LookupError:
            # non-existent encoding: fall back to utf-8
            html = raw_stream.decode('utf-8', errors=decode_errors)
        if not html:
            raise EmptyResponseError()
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html, url=url, download_date=download_date)
        return article
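A short usage sketch for the extractor above, assuming the warcio package and a local WARC file (the file name is a placeholder, and the method is assumed to be exposed as NewsPlease.from_warc, as in news-please):

from warcio.archiveiterator import ArchiveIterator

with open('crawl.warc.gz', 'rb') as stream:  # placeholder file name
    for record in ArchiveIterator(stream):
        if record.rec_type != 'response':
            continue
        try:
            article = NewsPlease.from_warc(record)
        except Exception:
            # skip empty or unparsable responses
            continue
        print(article.url, article.title)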
Example #6
def detect_encoding(data, encoding=None, fallback='latin1', is_html=False):
    '''Detect the character encoding of the data.

    Returns:
        str: The name of the codec

    Raises:
        ValueError: The codec could not be detected. This error can only
        occur if fallback is not a "lossless" codec.
    '''
    if encoding:
        encoding = normalize_codec_name(encoding)

    bs4_detector = EncodingDetector(
        data,
        override_encodings=(encoding, ) if encoding else (),
        is_html=is_html)
    candidates = itertools.chain(bs4_detector.encodings, (fallback, ))

    for candidate in candidates:
        if not candidate:
            continue

        candidate = normalize_codec_name(candidate)

        if not candidate:
            continue

        if try_decoding(data, candidate):
            return candidate

    raise ValueError('Unable to detect encoding.')
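detect_encoding above depends on two helpers that are not part of bs4 and come from the surrounding project. Hypothetical sketches of what they might look like:

import codecs

def normalize_codec_name(name):
    # Map an encoding label to Python's canonical codec name, or None if unknown.
    try:
        return codecs.lookup(name).name
    except (LookupError, TypeError):
        return None

def try_decoding(data, encoding):
    # Report whether the bytes decode cleanly under the candidate codec.
    try:
        data.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        return False
    return True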
Example #7
def scrape_politifact_article(story_url):
    resp = requests.get(story_url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return soup.find("div", "article__text").get_text()
Example #8
    def from_warc(warc_record):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the article
        extractor.
        :return:
        """
        raw_stream = warc_record.raw_stream.read()
        encoding = None
        try:
            encoding = warc_record.http_headers.get_header(
                'Content-Type').split(';')[1].split('=')[1]
        except:
            pass
        if not encoding:
            encoding = EncodingDetector.find_declared_encoding(raw_stream,
                                                               is_html=True)
        if not encoding:
            # assume utf-8
            encoding = 'utf-8'

        html = raw_stream.decode(encoding)
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html,
                                       url=url,
                                       download_date=download_date)
        return article
Example #9
def getHTML(url, verb=False):
    '''
    This function takes a url as input and returns the corresponding
    bs4 object
    '''

    from bs4.dammit import EncodingDetector

    try:
        re = session.get(url, headers=headers, timeout=(10, 30))

    except:
        print(r'problem here')
        return (None)

    else:
        if re.status_code == 200:
            # dealing with encoding
            http_encoding = re.encoding if 'charset' in re.headers.get(
                'content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(
                re.content, is_html=True)
            encoding = html_encoding or http_encoding

            # generating BeautifulSoup object
            bsObj = BeautifulSoup(re.content,
                                  'html5lib',
                                  from_encoding=encoding)

            if verb == True:
                print("The title of html is %s" % bsObj.title.getText())
            return (bsObj)
        else:
            return (None)
Example #10
def compile_links(web_address):
    '''
    compile_links accesses a webpage at a given address,
    finds all of the links on that page, and appends certain links
    to a list called links_list.

    compile_links works together with find_diffraction_files to
    get only the relevant links.

    the input is a web address; the matching links are returned as a list
    '''

    html_page = requests.get(web_address)
    http_encoding = html_page.encoding if 'charset' in\
        html_page.headers.get('content-type', '').lower() else None
    html_encoding =\
        EncodingDetector.find_declared_encoding(html_page.content,
                                                is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(html_page.content, from_encoding=encoding,
                         features="html.parser")
    links_list = []

    permutation_attempt = soup(text=re.compile("Now trying variations on your request:"))
    if len(permutation_attempt) != 0:
        return links_list

    for link in soup.find_all(href=find_diffraction_files):
        links_list.append('http://rruff.geo.arizona.edu'+link['href'])

    return links_list
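compile_links passes find_diffraction_files as the href filter; BeautifulSoup calls such a function with each href value and keeps the tags for which it returns True. A minimal hypothetical version (the actual file-name pattern is an assumption):

def find_diffraction_files(href):
    # Keep only links that look like downloadable diffraction data files.
    return href is not None and href.lower().endswith('.txt')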
Example #11
def top4leagues(leagueList,index): 
    rangeOfWork = team_qulfied[index]
    defaultLst = []
    temIndex = sample(range(12),rangeOfWork)
    for i in range(rangeOfWork):
        clubsIndex = randint(0,1)
        try:
            tempClubsLst = []
            url = 'https://www.worldfootball.net'+leagueList[clubsIndex]
            source = requests.get(url, headers=header)
            http_encoding = source.encoding if 'charset' in source.headers.get('content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(source.content, is_html=True)
            encoding = html_encoding or http_encoding
            soup = BeautifulSoup(source.content, 'lxml', from_encoding=encoding)
            find_boxS= soup.find('div',class_="scrollable_tables")
            the_team_table = find_boxS.find('table', {'class':'standard_tabelle'})
            for theTeamAtag in the_team_table.find_all('a',href=True):
                if theTeamAtag.text:
                    tempClubsLst.append(theTeamAtag.text)
            y = temIndex[i]
            teamNames = tempClubsLst[y]
            defaultLst.append(teamNames)
            tempClubsLst.pop()
        except Exception as e:
            print(e)
    return defaultLst
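This example (and Example #16 below) assumes several module-level names that are not reproduced here: the random helpers, a requests header dict, and the site-specific qualification tables. A hypothetical preamble might look like this:

from random import sample, randint, choices
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector

header = {'User-Agent': 'Mozilla/5.0'}  # placeholder header
# team_qulfied and league_qlf_list are project-specific lookup tables and are
# not reproduced here.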
Example #12
    def getSteam(self, q, size):
        querys = q.replace(" ", "+")
        url = ('https://store.steampowered.com/search/?term=' + str(querys) +
               '&category1=998')
        resp = requests.get(url)
        http_encoding = resp.encoding if 'charset' in resp.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                                is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content,
                             from_encoding=encoding,
                             features="lxml")
        print(url)

        SteamLinkList = []

        #find links to apps
        for link in soup.find_all('a', href=re.compile('app')):
            #remove duplicates
            if (link['href'] not in SteamLinkList):
                SteamLinkList.append(link['href'])

        #remove first two irrelevant links
        return SteamLinkList[2:size + 2]
Example #13
def getLinks():
    parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
    for i in range(1,100):
        if os.path.exists('pdfs/' + str(i)):
            print(str(i),'already exists')
            continue
        resp = requests.get("https://quizbowlpackets.com/"+str(i))
        http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)

        links = []

        allLinks = soup.find_all('a', href=True)
        combined = ['pdf' in link['href'] for link in allLinks]
        if not any(combined):
            print(str(i), 'doesn\'t exist')
            continue

        for link in allLinks:
            link = link['href']
            if 'Packet' in link:
                links.append(link)
        print(links)
        with open('pdfs/' + str(i),'wb') as file:
            pickle.dump(links, file)
Example #14
    def get_soup(self, _page=0):
        """ scrape web-site page """

        # get request
        self.__response = self.get_request()
        if self.__verbose:
            _log.debug(f'self.__response={self.__response}')

        # get encoding
        _http_encoding = self.__response.encoding if 'charset' in self.__response.headers.get(
            'content-type', '').lower() else None
        _html_encoding = EncodingDetector.find_declared_encoding(
            self.__response.content, is_html=True)

        # get soup
        self.__soup = None
        try:
            if self.__verbose:
                _log.debug(f'Getting soup from self.__response.text')
            self.__soup = BeautifulSoup(self.__response.text,
                                        features='html5lib',
                                        from_encoding=(_html_encoding
                                                       or _http_encoding))
            if self.__verbose:
                _log.debug(f'Got soup from self.__response.text OK')
        except Exception as e:
            self.__soup = None
            if self.__verbose:
                _log.error(
                    f'Failed to get soup from self.__response.text, error={e}')
Example #15
def desi_crawler(u_r_l):
    web_list = []
    url = u_r_l
    web_list.append(url)
    domain = url

    if "www." not in domain:
        div = domain.replace('//', ' ').replace('.', ' ').split()
        domain = div[1]
    else:
        div = domain.replace('//', ' ').replace('.', ' ').split()
        domain = div[2]

    for url in web_list:
        response = requests.get(url)
        http_encoding = response.encoding if 'charset' in response.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(
            response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, from_encoding=encoding)

        for link in soup.find_all('a', href=True):
            if domain in link['href']:
                if link['href'] not in web_list:
                    web_list.append(link['href'])
Example #16
def otherTeams(sumOfQ):
    remainigTeams = 16- sumOfQ
    finalTeams = []
    if remainigTeams <=0:
        return []
    else:
        randomTeams = sample(range(10),remainigTeams)
        randomTeamsIndex = sample(range(remainigTeams+2),remainigTeams)
        randomTeamSelection = choices(range(len(league_qlf_list)),k=remainigTeams)
        for i in range(0,remainigTeams):
            temp_x = randomTeamSelection[i]
            try:
                tempClubsLst = []
                url = 'https://www.worldfootball.net'+league_qlf_list[temp_x]
                source = requests.get(url, headers=header)
                http_encoding = source.encoding if 'charset' in source.headers.get('content-type', '').lower() else None
                html_encoding = EncodingDetector.find_declared_encoding(source.content, is_html=True)
                encoding = html_encoding or http_encoding
                soup = BeautifulSoup(source.content, 'lxml', from_encoding=encoding)
                find_boxS = soup.find('div', class_="scrollable_tables")
                the_team_table = find_boxS.find('table', {'class': 'standard_tabelle'})
                for theTeamAtag in the_team_table.find_all('a', href=True):
                    if theTeamAtag.text:
                        tempClubsLst.append(theTeamAtag.text)
                y = randomTeamsIndex[i]
                teamNames = tempClubsLst[y]
                finalTeams.append(teamNames)
            except Exception as e:
                print(e)
        return finalTeams
Example #17
    def getIMDB(self, queryi):

        url = ('https://www.imdb.com/search/keyword/?keywords=' + str(queryi) +
               '&ref_=fn_kw_kw_1&mode=detail&page=1&sort=moviemeter,asc')
        resp = requests.get(url)
        http_encoding = resp.encoding if 'charset' in resp.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                                is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content,
                             from_encoding=encoding,
                             features="lxml")
        print(url)

        imdbLinkList = []

        #find links to titles
        for link in soup.find_all('a', href=re.compile('title')):
            #remove irrelevant links
            if "vote" not in link['href'] and "search" not in link[
                    'href'] and "plotsummary" not in link['href']:
                #remove duplicates
                if ('https://www.imdb.com' + link['href'] not in imdbLinkList):
                    imdbLinkList.append('https://www.imdb.com' + link['href'])

        return imdbLinkList
Example #18
def getOneEntry(searchTerm):
    searchTerm = searchTerm.replace('\n', '')
    response = requests.get(
        urlSearchTemplate.format(searchTerm.replace(' ', '%20')))

    if response.ok:
        http_encoding = response.encoding if 'charset' in response.headers.get(
            'content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(
            response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, 'lxml', from_encoding=encoding)

        result = processHtml(soup, searchTerm)

        if ("/tpl" in result[0]):
            result = getOneEntry2(result[1], result[0])

        resultSplited = result.split(',')
        if len(resultSplited) == 3:
            resultSplited = [i.decode('utf-8').strip() for i in resultSplited]
            nome = resultSplited[0]
            status = resultSplited[1]
            nome_aceito = resultSplited[1]
            return nome, status, nome_aceito
        else:
            return '', '', ''

    else:
        return 'Bad Response!'
Example #19
 def doc_encoding(self) -> str:
     http_encoding = self.doc.encoding if "charset" in self.doc.headers.get(
         "Content-Type", "").lower() else None
     html_encoding = EncodingDetector.find_declared_encoding(
         self.doc.content, is_html=True)
     encoding: str = str(html_encoding or http_encoding)
     self.sdoc.encoding = encoding
     return encoding
Example #20
def get_text(html):
    # Detect encoding and extract plain text from page
    encoding = EncodingDetector.find_declared_encoding(html, is_html=True)
    soup = BeautifulSoup(html, "lxml", from_encoding=encoding)
    for script in soup(["script", "style"]):
        script.extract()

    return soup.get_text(" ", strip=True)
Example #21
def get_url_soup(url):
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    http_encoding = url_request.encoding if 'charset' in url_request.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(
        url_request.content, is_html=True)
    encoding = html_encoding or http_encoding
    return BeautifulSoup(url_request.content, 'lxml', from_encoding=encoding)
Example #22
 def get_html_title(self, page, record):
     try:
         encoding = EncodingDetector.find_declared_encoding(page,
                                                            is_html=True)
         soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
         title = soup.title.string.strip()
         return title
     except:
         return ""
Example #23
 def grab_projects(self, resp):
     http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
     html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
     encoding = html_encoding or http_encoding
     soup = BeautifulSoup(resp.content, from_encoding=encoding)
     links = [self.BASE_URL + link['href'] for link in soup.find_all('a', href=True) if
              link['href'].startswith("/projects/")]
     self.add_to_queue(urls=links, website_name=self.NAME)
     return len(links)
Example #24
def get_html(url):
    headers = {"User-Agent": USERAGENT}
    resp = requests.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    if encoding:
        resp.encoding = encoding
    return resp.text
Example #25
def get_soup_for_url(base_url):
    resp = requests.get(base_url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    return soup
Example #26
def get_source_html(url):
    headers = {"User-Agent": 'Chrome'}
    resp = requests.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    webpage = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return webpage
Example #27
def get_soup_html(url, headers=GET_HEADER):
    resp = SESSION.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)

    return soup
Example #28
 def get_html_text_body(self, page, record):
     try:
         encoding = EncodingDetector.find_declared_encoding(page,
                                                            is_html=True)
         soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
         for script in soup(["script", "style"]):
             script.extract()
         return soup.get_text(" ", strip=True)
     except:
         return ""
Example #29
    def prepare_markup(
        self,
        markup,
        user_specified_encoding=None,
        exclude_encodings=None,
        document_declared_encoding=None,
    ):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried 
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8", document_declared_encoding,
                   False)

        try_encodings = [user_specified_encoding, document_declared_encoding]
        detector = EncodingDetector(markup, try_encodings, is_html,
                                    exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding,
                   False)
Example #30
def getSoup(matchUrl):
    res = requests.get(matchUrl)
    res.raise_for_status()

    http_encoding = res.encoding if 'charset' in res.headers.get(
        'content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(res.content,
                                                            is_html=True)
    encoding = html_encoding or http_encoding

    return bs4.BeautifulSoup(res.content, 'lxml', from_encoding=encoding)
Example #31
def get_html_encoding(html):
	return EncodingDetector.find_declared_encoding(html, is_html=True, search_entire_document=False)
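Most of the examples above repeat the same detection idiom: prefer the encoding declared inside the HTML, fall back to the charset from the Content-Type header, and let BeautifulSoup guess otherwise. A consolidated sketch of that pattern (fetch_soup is a name introduced here for illustration):

import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector

def fetch_soup(url, parser='html.parser', timeout=10):
    # Fetch a page and build a soup using the best available encoding hint.
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    http_encoding = (resp.encoding
                     if 'charset' in resp.headers.get('content-type', '').lower()
                     else None)
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    return BeautifulSoup(resp.content, parser, from_encoding=html_encoding or http_encoding)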