Example #1
def book_href(html):
    links = [
        node.attributes.get("href")
        for node in HTMLParser(html).css("div.image_container > a")
    ]
    return links
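A minimal usage sketch for book_href above; the sample markup and the import line are illustrative assumptions (the function only relies on selectolax's HTMLParser):

from selectolax.parser import HTMLParser  # assumed import, as in the other examples on this page

sample_html = '<div class="image_container"><a href="catalogue/book_1/index.html"></a></div>'
print(book_href(sample_html))  # ['catalogue/book_1/index.html']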
Example #2
def test_unwrap_tags():
    html_parser = HTMLParser("<div><a href=" ">Hello</a> <i>world</i>!</div>")
    html_parser.body.unwrap_tags(['i', 'a'])
    assert html_parser.body.html == '<body><div>Hello world!</div></body>'
Example #3
def test_insert_after():
    html_parser = HTMLParser('<div>Get <img src="" alt="Laptop"></div>')
    img = html_parser.css_first('img')
    img.insert_after(img.attributes.get('alt', ''))
    assert html_parser.body.child.html == '<div>Get <img src="" alt="Laptop">Laptop</div>'
Example #4
def test_get_node_id(html, expected):
    html_parser = HTMLParser(html)
    node = html_parser.css_first('div')
    assert node.id == expected
Example #5
def test_text_node_returns_text_when_deep():
    html = '<div>foo bar</div>'
    html_parser = HTMLParser(html)
    node = html_parser.css_first('div').child
    assert node.text(deep=True) == 'foo bar'
Example #6
import requests
import os
from selectolax.parser import HTMLParser
import concurrent.futures

base_url = 'https://pythonbytes.fm'
headers = {"user-agent": "Mozilla/5.0 (Windows NT 11.5; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3629.169 Safari/537.36"}
output_dir = 'MP3s/'

if not os.path.isdir(output_dir):
    os.mkdir(output_dir)

def scrape_podcast(link):
    print('[*] Scraping', base_url + link)
    selectolax = HTMLParser(requests.get(base_url + link, headers=headers).content)
    dl_link = base_url + str(selectolax.css_first('a.btn.btn-default.subscribe-btn.btn-sm').attrs['href'])
    file_name = dl_link.split('/')[-1]
    print('[+] Downloading', file_name)
    with open(output_dir + file_name, 'wb') as file:
        file.write(requests.get(dl_link, headers=headers).content)

if __name__ == '__main__':
    episode_links = [
        node.attrs['href']
        for node in HTMLParser(
            requests.get(base_url + '/episodes/all', headers=headers).content
        ).css('tr > :nth-child(3) > a')
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(scrape_podcast, episode_links)
Example #7
 def parse_html_page(self, page) -> str:
     # We need to render the page with requests_html so that all the JavaScript is executed
     page.html.render()
     tree = HTMLParser(page.html.html)
     return tree.css_first(".posttitle").text()
Example #8
for dir in range(4):
    for num in range(1, dir_nums[dir] + 1):
        print("\rProgress: {:.2f}%".format(files_checked / num_files * 100),
              end='')

        file_name = dir_names[dir] + "/" + dir_names[dir] + "." + str(
            num) + ".html"

        try:
            with open("../input-indexing/" + file_name, 'r') as f:
                content = f.read()
        except IOError:
            continue

        html = HTMLParser(content)
        if html.body is None:
            continue

        for tag in html.css('script'):
            tag.decompose()
        for tag in html.css('style'):
            tag.decompose()

        content = html.body.text(separator='\n')

        text = tokenize_text(content)
        content = preprocess(content)
        content, snippets = index_words(content, text)

        for word in content:
Example #9
 def parser(self):
     return HTMLParser(self.text)
Example #10
async def get_weather(location: str) -> str:
    async with aiohttp.request('GET', domain + weather_request + quote(location), headers={'User-Agent': user_agent}) as resp:
        search_text = await resp.text()
        title = HTMLParser(search_text).css_first('title').text()
        possible_href = str(resp.url)

    if title != 'Яндекс.Погода':
        # if we got rerouted to weather
        weather_text = search_text
        exact_location = ''
        for node in HTMLParser(weather_text).css('span.breadcrumbs__title'):
            exact_location += node.text() + ','
        exact_location = exact_location[:-1]
        href = possible_href
    else:
        # if we got location list as we expected
        node = HTMLParser(search_text).css_first('div.grid__cell')
        if node is None:
            return f'По запросу "{location}" ничего не найдено'
        node = node.css_first('li.place-list__item')
        node = node.css_first('a')
        href = domain + node.attributes['href']
        exact_location = node.text()
        async with aiohttp.request('GET', href, headers={'User-Agent': user_agent}) as resp:
            weather_text = await resp.text()

    # parsing weather
    card = HTMLParser(weather_text).css_first('div.content__main').css_first('div.content__row').css_first('div.card')
    temp_info = card.css_first('div.fact__temp-wrap').css_first('a')
    now_temp = temp_info.css_first('div.fact__temp').css_first('span.temp__value').text()
    now_condition = temp_info.css_first('div.fact__feelings').css_first('div.link__condition').text()
    wind_info = card.css_first('div.fact__props').css_first('dl.fact__wind-speed').css_first('dd.term__value')
    now_wind = wind_info.css_first('span.wind-speed').text() + ' ' + wind_info.css_first('span.fact__unit').text()

    day_info = HTMLParser(weather_text).css_first('div.forecast-briefly').css_first('div.swiper-wrapper')
    # print(day_info.html)
    slide = None
    for day in day_info.css('div.swiper-slide'):
        text: str = day.text()
        if text.find('Сегодня') != -1:
            slide = day.css_first('a')

    day_temp = slide.css_first('div.forecast-briefly__temp_day').css_first('span.temp__value').text()
    night_temp = slide.css_first('div.forecast-briefly__temp_night').css_first('span.temp__value').text()
    condition = slide.css_first('div.forecast-briefly__condition').text()

    return f'Место: {exact_location}' \
           f'\n\nCЕЙЧАС:\nТемпература: {now_temp}\nСостояние: {now_condition}\nВетер: {now_wind}' \
           f'\n\nCЕГОДНЯ:\nТемпература днем: {day_temp}\nТемпература ночью: {night_temp}\nСостояние: {condition}'\
           f'\n\nПолный прогноз: {href}'
Example #11
def cli(url, repositories, search, rows, minstar, token, output_file_name,
        max_repos_retrieved):

    MODE = os.environ.get("GHTOPDEP_ENV")
    REPOS_PER_FILE_SIZE_LIMIT = 3000

    if search and token:
        gh = github3.login(token=token)
        CacheControl(gh.session,
                     cache=FileCache(CACHE_DIR),
                     heuristic=OneDayHeuristic())
    elif search and not token:
        click.echo("Please provide token")
        sys.exit()

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    # spinner = Halo(text="Fetching information about {0}".format(destinations), spinner="dots")
    # spinner.start()

    sess = requests.session()
    retries = Retry(total=15, backoff_factor=15, status_forcelist=[429])
    adapter = CacheControlAdapter(max_retries=retries,
                                  cache=FileCache(CACHE_DIR),
                                  heuristic=OneDayHeuristic())
    sess.mount("http://", adapter)
    sess.mount("https://", adapter)

    page_url = get_page_url(sess, url, destination)

    found_repos = 0
    total_found_repos = 0
    number_of_files_processed = 0

    while True:
        time.sleep(1)
        response = sess.get(page_url)

        print(page_url)

        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = repo_stars_list[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(
                    REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)

                # can be listed same package
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    # print("adding repo ", repo_url)
                    found_repos += 1
                    total_found_repos += 1

                    repos.append({"url": repo_url, "stars": repo_stars_num})

                    if found_repos >= REPOS_PER_FILE_SIZE_LIMIT:
                        sorted_repos = repos
                        repos = []
                        number_of_files_processed += 1
                        found_repos = 0

                        show_result(sorted_repos, total_repos_count,
                                    more_than_zero_count, destinations,
                                    number_of_files_processed,
                                    output_file_name)

                        print("JSON output placed into file!")

                    if total_found_repos > max_repos_retrieved:
                        print(f'Collected {total_found_repos} repos.')
                        sys.exit()

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            # spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    sorted_repos = repos

    if search:
        for repo in repos:
            repo_path = urlparse(repo["url"]).path[1:]
            for s in gh.search_code("{0} repo:{1}".format(search, repo_path)):
                click.echo("{0} with {1} stars".format(s.html_url,
                                                       repo["stars"]))
    elif number_of_files_processed == 0:
        show_result(sorted_repos, total_repos_count, more_than_zero_count,
                    destinations, number_of_files_processed, output_file_name)
Example #12
def product_desc(html):
    desc = HTMLParser(html).css_first("article > p").text()
    return desc
Example #13
def stock_num(html):
    num = HTMLParser(html).css_first("p.availability").text()
    num = re.sub(r"\D", "", num)
    return num
Example #14
def book_rating(html):
    ratings = [
        node.attributes.get("class").strip("star-rating ")
        for node in HTMLParser(html).css("p.star-rating")
    ]
    return ratings
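A quick check of book_rating with illustrative markup in the books.toscrape.com style; the class names below are assumptions:

sample_html = '<p class="star-rating Three"></p><p class="star-rating Five"></p>'
print(book_rating(sample_html))  # ['Three', 'Five']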
Example #15
File: ghtopdep.py Project: linnit/ghtopdep
def cli(url, repositories, rows, minstar, description, token):
    if description and token:
        gh = github3.login(token=token)
        CacheControl(gh.session, cache=FileCache(".ghtopdep_cache"), heuristic=OneDayHeuristic())
        Repo = namedtuple("Repo", ["url", "stars", "description"])
    elif description and not token:
        click.echo("Please provide token")
        sys.exit()
    else:
        Repo = namedtuple("Repo", ["url", "stars"])

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"
    page_url = "{0}/network/dependents?dependent_type={1}".format(url, destination.upper())

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    spinner = Halo(text="Fetching information about {0}".format(destinations), spinner="dots")
    spinner.start()
    sess = requests.session()
    cached_sess = CacheControl(sess, cache=FileCache(".ghtopdep_cache"), heuristic=OneDayHeuristic())
    while True:
        response = cached_sess.get(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = dep.css(STARS_SELECTOR)[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)

                # can be listed same package
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    if description:
                        repo_description = fetch_description(gh, relative_repo_url)
                        repos.append(Repo(repo_url, repo_stars_num, repo_description))
                    else:
                        repos.append(Repo(repo_url, repo_stars_num))

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    sorted_repos = sort_repos(repos, rows)
    show_result(sorted_repos, total_repos_count, more_than_zero_count, destination, destinations)
Example #16
    def fetch_resource_iteratively(self, ingest_type: str, base_url: str,
                                   force_recrawl: bool) -> dict:
        """
        This is copypasta from process_file(), should probably refactor.
        """

        result: Dict[str, Any] = dict(hit=False)
        result["hops"] = [base_url]
        next_url = base_url

        # check against blocklist
        for block in self.base_url_blocklist:
            # NOTE: hack to not skip archive.org content
            if "archive.org" in block:
                continue
            if block in next_url:
                result["status"] = "skip-url-blocklist"
                return result

        try:
            resource = self.find_resource(next_url,
                                          force_recrawl=force_recrawl)
        except SavePageNowError as e:
            result["status"] = "spn2-error"
            result["error_message"] = str(e)[:1600]
            return result
        except PetaboxError as e:
            result["status"] = "petabox-error"
            result["error_message"] = str(e)[:1600]
            return result
        except CdxApiError as e:
            result["status"] = "cdx-error"
            result["error_message"] = str(e)[:1600]
            # add a sleep in cdx-error path as a slow-down
            time.sleep(2.0)
            return result
        except WaybackError as e:
            result["status"] = "wayback-error"
            result["error_message"] = str(e)[:1600]
            return result
        except WaybackContentError as e:
            result["status"] = "wayback-content-error"
            result["error_message"] = str(e)[:1600]
            return result
        except NotImplementedError:
            # result['status'] = 'not-implemented'
            # result['error_message'] = str(e)[:1600]
            # return result
            resource = None

        html_biblio = None
        if resource:
            if resource.terminal_url:
                result["terminal"] = {
                    "terminal_url": resource.terminal_url,
                    "terminal_dt": resource.terminal_dt,
                    "terminal_status_code": resource.terminal_status_code,
                }
                if resource.terminal_url not in result["hops"]:
                    result["hops"].append(resource.terminal_url)

            if not resource.hit:
                result["status"] = resource.status
                return result

            if resource.terminal_url:
                for pattern in self.base_url_blocklist:
                    if pattern in resource.terminal_url:
                        result["status"] = "skip-url-blocklist"
                        return result

            if resource.terminal_url:
                for pattern in self.cookie_blocklist:
                    if pattern in resource.terminal_url:
                        result["status"] = "blocked-cookie"
                        return result

            if not resource.body:
                result["status"] = "null-body"
                return result

            if len(resource.body) > MAX_BODY_SIZE_BYTES:
                result["status"] = "body-too-large"
                return result

            file_meta = gen_file_metadata(resource.body)
            try:
                file_meta, resource = fix_transfer_encoding(
                    file_meta, resource)
            except Exception as e:
                result["status"] = "bad-gzip-encoding"
                result["error_message"] = str(e)
                return result

            if not resource.body or file_meta["size_bytes"] == 0:
                result["status"] = "null-body"
                return result

            # here we split based on ingest type to try and extract a next hop
            html_ish_resource = bool(
                "html" in file_meta["mimetype"] or "xhtml"
                in file_meta["mimetype"]  # matches "application/xhtml+xml"
                or "application/xml" in file_meta["mimetype"]
                or "text/xml" in file_meta["mimetype"])
            html_biblio = None
            html_doc = None
            if html_ish_resource and resource.body:
                try:
                    html_doc = HTMLParser(resource.body)
                    html_biblio = html_extract_biblio(resource.terminal_url,
                                                      html_doc)
                    if html_biblio:
                        if "html_biblio" not in result and html_biblio.title:
                            result["html_biblio"] = json.loads(
                                html_biblio.json(exclude_none=True))
                            # print(f"  setting html_biblio: {result['html_biblio']}", file=sys.stderr)
                except ValueError:
                    pass

            # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
            assert resource
            assert resource.hit is True
            assert resource.terminal_status_code in (200, 226)

            if resource.terminal_url:
                result["terminal"] = {
                    "terminal_url": resource.terminal_url,
                    "terminal_dt": resource.terminal_dt,
                    "terminal_status_code": resource.terminal_status_code,
                    "terminal_sha1hex": file_meta["sha1hex"],
                }

            result["file_meta"] = file_meta
            result["cdx"] = cdx_to_dict(resource.cdx)
            if resource.revisit_cdx:
                result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx)

            if ingest_type == "pdf":
                if file_meta["mimetype"] != "application/pdf":
                    result["status"] = "wrong-mimetype"  # formerly: "other-mimetype"
                    return result
            elif ingest_type == "xml":
                if file_meta["mimetype"] not in (
                        "application/xml",
                        "text/xml",
                        "application/jats+xml",
                ):
                    result["status"] = "wrong-mimetype"
                    return result
            elif ingest_type == "html":
                if file_meta["mimetype"] not in ("text/html",
                                                 "application/xhtml+xml"):
                    result["status"] = "wrong-mimetype"
                    return result
            else:
                # raise NotImplementedError()
                pass

        result["_html_biblio"] = html_biblio
        result["_resource"] = resource
        return result
Example #17
 def multi_parser(self, html, link):
     #soup = BeautifulSoup(html, 'html.parser')
     if "khan" in link:
         selector = "#articleBody > p"
     elif "kmib" in link:
         selector = "#articleBody"
     elif "kookje" in link:
         selector = ".news_article"
     elif "naeil" in link:
         selector = "#contents > p"
     elif "donga" in link:
         selector = ".article_txt"  #동아 좆까
     elif "dt.co.kr" in link:
         selector = ".art_txt"
     elif "mk.co.kr" in link:  #빅카인즈 주소 오류ㅠ
         selector = "#article_body"
     elif "imaeil" in link:
         selector = ".article_area > p"
     elif "moneytoday" in link:
         selector = "#textBody"  #리디렉션 오류
     elif "munhwa" in link:
         selector = "#NewsAdContent"
     elif "sedaily" in link:
         selector = ".view_con"
     elif "segye" in link:
         selector = "#article_txt > article > p"
     elif "asiae." in link:
         selector = "#txt_area > p"
     elif "ajunews." in link:
         selector = "#articleBody"
     elif "etnews." in link:
         selector = "#articleBody > p"
     elif "chosun." in link:
         selector = "#news_body_id"
     elif "joins." in link:
         selector = "#article_body"
     elif "fnnews." in link:
         selector = "#article_content"
     elif "hani." in link:
         selector = "#contents-article .text"
     elif "hankyung." in link:
         selector = "#articletxt"
     elif "hankookilbo." in link:  #BigKinds 주소오류
         selector = "#article_story"
     elif "heraldcorp." in link:
         selector = "#articleText > p"
     elif "kbs." in link:
         selector = "#cont_newstext"
     elif "imbc." in link:
         selector = ".txt"
     elif "obsnews." in link:
         selector = "#CmAdContent"
     elif "sbs." in link:
         selector = ".text_area"
     elif "ytn." in link:
         selector = "#CmAdContent > span"
     elif "naver." in link:
         selector = "#articleBodyContents"
     else:
         self.error_cnt['parse'] += 1
         return "ERR"
     text = ""
     for node in HTMLParser(html).css(selector):
         text += node.text()
     result = re.sub('\xa0', '', text)
     return result.split(".")
Example #18
def test_css_first_default():
    html = "<span></span><div><p class='p3'>text</p><p class='p3'>sd</p></div><p></p>"
    selector = ".s3"
    assert HTMLParser(html).css_first(selector,
                                      default='lorem ipsum') == 'lorem ipsum'
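For contrast, a short sketch showing that css_first only falls back to the default when nothing matches (the selector names here are illustrative):

parser = HTMLParser("<p class='p3'>text</p>")
print(parser.css_first('.p3').text())               # 'text'
print(parser.css_first('.missing', default='n/a'))  # 'n/a'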
Example #19
 def parse_html_page(self, page):
     selector = ".markdown-body > p:nth-child(4)"
     tree = HTMLParser(page.text)
     with suppress(IndexError):
         return tree.css(selector)[0].text()
     return ""
Example #20
def test_malformed_attributes():
    html = '<div> <meta name="description" content="ÐаÑ"Р" /></div>'
    html_parser = HTMLParser(html)

    for tag in html_parser.tags('meta'):
        assert tag
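tags() iterates elements by tag name even when attributes are mangled, as in the test above; a small sketch with well-formed input for comparison (the markup is illustrative):

clean_parser = HTMLParser('<div><meta name="a"><meta name="b"></div>')
assert len(list(clean_parser.tags('meta'))) == 2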
Example #21
 def parse_html_page(self, page) -> str:
     tree = HTMLParser(page.text)
     return tree.css_first("h1").text()
Example #22
def cli(url, repositories, search, table, rows, minstar, report, description,
        token):
    MODE = os.environ.get("GHTOPDEP_ENV")
    BASE_URL = 'https://437w61gcj1.execute-api.us-west-2.amazonaws.com/api'
    if MODE == "development":
        BASE_URL = 'http://127.0.0.1:8080'

    if report:
        try:
            result = requests.get('{}/repos?url={}'.format(BASE_URL, url))
            if result.status_code != 404:
                sorted_repos = sort_repos(result.json()['deps'], rows)
                repos = readable_stars(sorted_repos)
                click.echo(tabulate(repos, headers="keys", tablefmt="github"))
                sys.exit()
        except requests.exceptions.ConnectionError as e:
            click.echo(e)

    if (description or search) and token:
        gh = github3.login(token=token)
        CacheControl(gh.session,
                     cache=FileCache(CACHE_DIR),
                     heuristic=OneDayHeuristic())
    elif (description or search) and not token:
        click.echo("Please provide token")
        sys.exit()

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"
    page_url = "{0}/network/dependents?dependent_type={1}".format(
        url, destination.upper())

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    spinner = Halo(text="Fetching information about {0}".format(destinations),
                   spinner="dots")
    spinner.start()

    sess = requests.session()
    retries = Retry(total=15, backoff_factor=15, status_forcelist=[429])
    adapter = CacheControlAdapter(max_retries=retries,
                                  cache=FileCache(CACHE_DIR),
                                  heuristic=OneDayHeuristic())
    sess.mount("http://", adapter)
    sess.mount("https://", adapter)

    while True:
        response = sess.get(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = repo_stars_list[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(
                    REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)

                # can be listed same package
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    if description:
                        repo_description = fetch_description(
                            gh, relative_repo_url)
                        repos.append({
                            "url": repo_url,
                            "stars": repo_stars_num,
                            "description": repo_description
                        })
                    else:
                        repos.append({
                            "url": repo_url,
                            "stars": repo_stars_num
                        })

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    if report:
        try:
            requests.post('{}/repos'.format(BASE_URL),
                          json={
                              "url": url,
                              "deps": repos
                          })
        except requests.exceptions.ConnectionError as e:
            click.echo(e)

    sorted_repos = sort_repos(repos, rows)

    if search:
        for repo in repos:
            repo_path = urlparse(repo["url"]).path[1:]
            for s in gh.search_code("{0} repo:{1}".format(search, repo_path)):
                click.echo("{0} with {1} stars".format(s.html_url,
                                                       repo["stars"]))
    else:
        show_result(sorted_repos, total_repos_count, more_than_zero_count,
                    destinations, table)
Example #23
def test_html_attribute_works_for_text():
    html = '<div>foo bar</div>'
    html_parser = HTMLParser(html)
    node = html_parser.css_first('div').child
    assert node.html == 'foo bar'
Example #24
def extract_infos(headers, content):
    data = dict()
    headers = {k.lower(): v for k, v in headers.items()}

    # check wp version
    wp_version = re.findall(
        r'wp-(?:emoji-release|embed)\.min\.js.*ver=(.*?)[\"\']', content)
    if wp_version:
        wp_version = wp_version[0]

    cms = 'Default'
    version = 'version'

    dom = HTMLParser(content)
    for tag in dom.tags('meta'):
        attrs = tag.attributes
        if 'name' in attrs:
            if 'generator' == attrs['name'].lower():
                cms = attrs['content']
                version = re.findall(r'\d+\.*\d*\.*\d*', cms)
                if version:
                    version = version[0]
                    cms = re.sub(re.escape(version), '', cms).strip()

    if cms == 'Default':
        if 'x-powered-by' in headers.keys():
            cms = headers.get('x-powered-by')
            if 'x-aspnet-version' in headers.keys():
                version = headers.get('x-aspnet-version')
        elif 'magento' in content.lower():
            cms = 'Magento'
        elif 'shopify' in content.lower():
            cms = 'Shopify'
        elif 'squarespace' in content.lower():
            cms = 'Squarespace'
        elif 'blogger.com' in content.lower():
            cms = 'Blogger'
        elif 'typo3' in content.lower():
            cms = 'TYPO3'
        elif 'opencart' in content.lower():
            cms = 'OpenCart'
        elif 'joomla' in content.lower():
            cms = 'Joomla'
        elif 'prestashop' in content.lower():
            cms = 'Prestashop'
        elif 'wordpress' in content.lower():
            cms = 'Wordpress'
        elif 'drupal' in content.lower():
            cms = 'Drupal'

    data['cms'] = cms
    if wp_version:
        data['version'] = wp_version
    else:
        data['version'] = version

    for key in headers.keys():
        if 'server' == key or 'x-server' == key:
            data['server'] = headers.get(key)
        if key.startswith('x-') and headers.get(key) not in data.values():
            data[key] = headers.get(key)

    plugins = re.findall(r'wp-content/plugins/(.*?)/.*ver=(.*?)[\s\'\"]',
                         content)
    if plugins:
        data = append_info(plugins, data, 'Plugins')
    wp_themes = re.findall(r'/wp-content/themes/(.*)/.*?ver=(.*?)[\s\'\"]',
                           content)
    if wp_themes:
        data = append_info(wp_themes, data, 'Themes')

    drupal_modules = re.findall(r'/modules/.*/(.*?)\.css\?v=(.*?)[\s\"\']',
                                content)
    if drupal_modules:
        data = append_info(drupal_modules, data, 'Plugins')

    drupal_themes = re.findall(r'/themes/.*?/(.*)/css.*?v=(.*?)[\s\'\"]',
                               content)
    if drupal_themes:
        data = append_info(drupal_themes, data, 'Themes')

    return data
Example #25
def test_unwrap():
    html = '<a id="url" href="https://rushter.com/">I linked to <i>rushter.com</i></a>'
    html_parser = HTMLParser(html)
    node = html_parser.css_first('i')
    node.unwrap()
    assert html_parser.body.child.html == '<a id="url" href="https://rushter.com/">I linked to rushter.com</a>'
Example #26
def leopold_sold_out_html(leopold_sold_out) -> HTMLParser:
    yield HTMLParser(leopold_sold_out.read())
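Examples #26 and #28 look like pytest fixtures wrapping file-backed HTML pages; a minimal sketch of how such a fixture could be declared and consumed, assuming the decorator, file path, and assertion (none of which are shown in the original project):

import pytest
from selectolax.parser import HTMLParser

@pytest.fixture
def leopold_sold_out():
    # hypothetical path; the real fixture presumably opens a saved product page
    with open("tests/fixtures/leopold_sold_out.html") as f:
        yield f

@pytest.fixture
def leopold_sold_out_html(leopold_sold_out) -> HTMLParser:
    yield HTMLParser(leopold_sold_out.read())

def test_sold_out_page_parses(leopold_sold_out_html):
    assert leopold_sold_out_html.css_first("title") is not None  # illustrative assertion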
Example #27
def test_replace_with_multiple_nodes():
    html_parser = HTMLParser(
        '<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>')
    img = html_parser.css_first('span')
    img.replace_with(img.attributes.get('alt', ''))
    assert html_parser.body.child.html == '<div>Get Laptop</div>'
Example #28
def leopold_in_stock_html(leopold_in_stock) -> HTMLParser:
    yield HTMLParser(leopold_in_stock.read())
Example #29
def test_attrs_sets_attribute():
    html_parser = HTMLParser('<div id="id"></div>')
    node = html_parser.css_first('div')
    node.attrs['id'] = 'new_id'
    assert node.attributes == {'id': 'new_id'}
Example #30
def in_stock(html):
    stock_status = [node.text() for node in HTMLParser(html).css("p.instock")]
    stock_status = [re.sub(r"\W", "", i) for i in stock_status]
    return stock_status
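A closing usage sketch for in_stock; the markup mirrors the books.toscrape.com availability paragraph and is an assumption, as are the re/HTMLParser imports:

import re
from selectolax.parser import HTMLParser  # assumed imports, as in the other examples

sample_html = '<p class="instock availability">In stock</p>'
print(in_stock(sample_html))  # ['Instock']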