Example #1
def get_paper_keywords(link):
    # Look for the keywords <meta> tag on the paper's page
    paper_page = requests.get(link)
    paper_soup = BeautifulSoup(paper_page.content, 'html.parser')
    meta_tags = paper_soup.head('meta', attrs={'name': 'keywords'})
    if meta_tags:
        return meta_tags[0]['content']
    else:
        # No keywords tag found; log the offending link
        print(link)
        return None
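A minimal usage sketch, relying on the same requests and BeautifulSoup imports as the function itself; the URL below is a placeholder, not a real paper:

import requests
from bs4 import BeautifulSoup

# Placeholder URL, purely for illustration
keywords = get_paper_keywords("https://example.com/some-paper")
if keywords is not None:
    # the keywords meta content is typically comma-separated
    print(keywords.split(','))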
Example #2
def highlight(url):
    # Fetch the target page and parse it along with the snippet templates
    r = requests.get(url)
    html_text = r.text
    soup = BeautifulSoup(html_text, "lxml")
    head_snippet_soup = BeautifulSoup(SNIPPET_HEADER, "lxml")
    body_snippet_soup = BeautifulSoup(SNIPPET_BODY, "lxml")

    head_snippet = removeTags(head_snippet_soup)
    body_snippet = removeTags(body_snippet_soup)

    # Inject the highlight CSS and the head snippet into <head>
    # (head is a live reference, so no reassignment back to soup is needed)
    head = soup.head
    head.insert(1, soup.new_tag('style', type='text/css'))
    head.style.append(highlight_css)
    head.insert(0, head_snippet)

    # Inject the body snippet at the top of <body>
    body = soup.body
    body.insert(0, body_snippet)

    newsoup = Markup(soup)
    html = soup.prettify("utf-8")

    # Write the modified page into the output template
    templates = "/SnippetIQ/templates/output_template.html"
    pwd = os.getcwd()
    filename = pwd + templates

    with open(filename, "wb") as file:
        file.write(html)

    return newsoup
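highlight() leans on several module-level names the snippet does not define: SNIPPET_HEADER, SNIPPET_BODY, highlight_css, removeTags, and Markup. A purely illustrative sketch of stand-ins, not the project's real definitions:

from markupsafe import Markup  # Markup as used above

# Hypothetical snippet templates and CSS, for illustration only
SNIPPET_HEADER = "<script src='/static/snippet.js'></script>"
SNIPPET_BODY = "<div id='snippet-banner'>Highlighted page</div>"
highlight_css = ".highlight { background-color: yellow; }"

def removeTags(snippet_soup):
    # Assumed to strip the <html>/<head>/<body> wrapper that lxml adds
    # around bare fragments, leaving only the snippet markup itself
    for name in ("html", "head", "body"):
        wrapper = getattr(snippet_soup, name)
        if wrapper is not None:
            wrapper.unwrap()
    return snippet_soup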
Example #3
    def _add_instant_tags(self, request, response):
        if hasattr(response, "content") and getattr(settings,
                                                    "WTM_INJECT_TAGS", True):
            strategy = TagStrategy(request)
            content = response.content.decode(response.charset)
            doc = BeautifulSoup(content, "html.parser")
            head = getattr(doc, "head", [])
            body = getattr(doc, "body", [])

            for tag in strategy.result:
                obj = tag.get("object")
                element = tag.get("element")

                if head and obj.tag_location == Tag.TOP_HEAD:
                    head.insert(1, element)
                elif head and obj.tag_location == Tag.BOTTOM_HEAD:
                    head.append(element)
                elif body and obj.tag_location == Tag.TOP_BODY:
                    body.insert(1, element)
                elif body and obj.tag_location == Tag.BOTTOM_BODY:
                    body.append(element)

            doc.head = head
            doc.body = body
            response.content = doc.encode(formatter=None)
            return response

        return response
Example #4
def insert_stats(stats):
    print(f"Adding {len(stats)} statistics to webpage")
    with open(mapfile, "r") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    body = soup.body
    head = soup.head

    # Add a <style> block for the stats overlay
    head.append(soup.new_tag('style', type='text/css'))
    head.style.append(
        '#stats {background-color:#FF6766;\n\tposition:absolute;\n\ttop:5%;\n\tleft:5%;\n\tpadding:10px;\n\tz-index:999;\n\topacity:0.8;\n\tborder-radius:25px;}'
    )

    soup.head.title.string = "Map of Attempted Logins"

    # Build the overlay: a bold heading plus one line per statistic
    stats_div = soup.new_tag('div', id='stats')

    stats_h = soup.new_tag('h1', id='stats_h')
    stats_h['font-weight'] = "bold"
    stats_h.string = "Statistics"

    stats_ul = soup.new_tag('ul')

    for stat in stats:
        new_h = soup.new_tag('h3')
        new_h.string = stat
        stats_ul.append(new_h)

    stats_div.insert(0, stats_h)
    stats_div.insert(1, stats_ul)

    body.insert(0, stats_div)

    # Write the modified page back to disk
    newtxt = soup.prettify()
    with open(mapfile, "w") as f:
        f.write(newtxt)
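insert_stats reads and rewrites the file named by a module-level mapfile; a hedged usage sketch with purely illustrative values:

# Hypothetical path and statistics, for illustration only
mapfile = "attempted_logins_map.html"

insert_stats([
    "Total attempts: 1024",
    "Unique source IPs: 87",
    "Most targeted account: root",
])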
Example #5
def get_content(lon=None, lat=None):
    log.info(f"Pharmacy map creator started [{lon}][{lat}]")

    locator = Nominatim(user_agent='bot')
    radius = 1000

    # read full dataset
    raw = requests.get(
        "http://overpass-api.de/api/interpreter?data=<query type='node'><around lat='"
        + str(lat) + "' lon='" + str(lon) + "' radius='" + str(radius) +
        "'/></query><print/>").text

    data = BeautifulSoup(raw, features='xml')
    pharmacies = data.find_all('tag', {'k': 'amenity', 'v': 'pharmacy'})

    address = pd.Series([
        locator.reverse((pharmacy.parent['lat'], pharmacy.parent['lon']),
                        timeout=10000).address for pharmacy in pharmacies
    ],
                        dtype=str)
    latitude = pd.Series([pharmacy.parent['lat'] for pharmacy in pharmacies],
                         dtype=float)
    longitude = pd.Series([pharmacy.parent['lon'] for pharmacy in pharmacies],
                          dtype=float)

    data = pd.DataFrame({
        'address': address,
        'latitude': latitude,
        'longitude': longitude
    })
    data['distance'] = data.apply(lambda row: geopy.distance.distance(
        (row['latitude'], row['longitude']), (lat, lon)).km,
                                  axis=1)
    # getting area borders
    min_lat = data['latitude'].min()
    max_lat = data['latitude'].max()
    min_lon = data['longitude'].min()
    max_lon = data['longitude'].max()

    data = data.sort_values(by=['distance'])
    message = f"Ближайшие аптеки в радиусе {radius} метров \r\n \r\n"
    for index, point in data.head(min(10, len(data))).iterrows():
        message += f"⚕ [{round(point['distance'], 2)} км] {point['address']} \r\n"

    return map_extension.save_plot(log,
                                   data,
                                   min_lat,
                                   max_lat,
                                   min_lon,
                                   max_lon,
                                   pt_color='#00EB62FF',
                                   pt_size=1000,
                                   user_lat=lat,
                                   user_lon=lon), message
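A hedged call sketch for get_content; the coordinates are placeholders, and the function assumes requests, pandas as pd, geopy (with geopy.distance and Nominatim), BeautifulSoup with an XML parser, a module-level log, and the project's map_extension helper are all available:

# Placeholder coordinates, purely for illustration
plot, message = get_content(lon=27.5615, lat=53.9023)
print(message)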
Example #6
    def _add_instant_tags(self):
        if hasattr(self.response, "content") and getattr(
                settings, "WTM_INJECT_TAGS", True):
            doc = BeautifulSoup(self.response.content, "html.parser")
            head = getattr(doc, "head", [])
            body = getattr(doc, "body", [])

            for tag in self.strategy.result:
                obj = tag.get("object")
                element = tag.get("element")

                if head and obj.tag_location == Tag.TOP_HEAD:
                    head.insert(1, element)
                elif head and obj.tag_location == Tag.BOTTOM_HEAD:
                    head.append(element)
                elif body and obj.tag_location == Tag.TOP_BODY:
                    body.insert(1, element)
                elif body and obj.tag_location == Tag.BOTTOM_BODY:
                    body.append(element)

            doc.head = head
            doc.body = body
            self.response.content = doc.decode()
Example #7
def fetch_html(url):
    urls = []
    title = ''
    desc = ''
    keywords = ''
    body = ''
    status = ''
    server = ''
    content_type = ''
    last_modified = ''
    err = 0
    url = re.compile(r"/$").sub('', url)
    url = re.compile(r"^http://").sub('', url)
    url = "http://" + url
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    status = response.status
    server = response.headers['Server']
    content_type = response.headers['Content-Type']
    last_modified = response.headers['Date']
    print(response.status)  # 200: ('OK', 'Request fulfilled, document follows')
    if response.status != 200:
        status = response.status
        return (urls, body, title, desc, keywords, status, server,
                content_type, last_modified, err)
    print(response.headers)
    # print (response.data)
    soup = BeautifulSoup(response.data, "lxml")
    try:
        title = clean_html(soup.html.head.title.string)
        title = convert_accents(title)
    except Exception:
        title = ''
    try:
        for meta in soup.head('meta'):
            ctxt = str(meta)
            pat = re.compile(r"meta[ ]*name[ ]*=[ ]*[\"]*key").findall(
                ctxt.lower())
            if pat:
                temp = re.compile(r"ontent[ ]*=[ ]*[\"]*").split(ctxt)
                if len(temp) > 1:
                    keywords = temp[1]
                    keywords = re.compile(r"[ ]*[\"]*[ ]*[/]*[>]").sub(
                        ' ', keywords)
                    keywords = clean_html(keywords)
                    keywords = convert_accents(keywords)
                    keywords = keywords.strip()
            pat = re.compile(r"meta[ ]*name[ ]*=[ ]*[\"]*descrip").findall(
                ctxt.lower())
            if pat:
                temp = re.compile(r"ontent[ ]*=[ ]*[\"]*").split(ctxt)
                if len(temp) > 1:
                    desc = temp[1]
                    desc = re.compile(r"[ ]*[\"]*[ ]*[/]*[>]").sub(' ', desc)
                    desc = convert_accents(desc)
                    desc = desc.strip()
    except Exception:
        err = 1
    for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for script in soup.find_all('script'):
        script.extract()
    for link in soup.find_all('a', href=True):
        if len(link['href']) > 9:
            pat = re.compile(r'^http').findall(link['href'])
            if pat:
                href = re.compile(r"/$").sub('', link['href'])
                urls.append(href)
    body = soup.body(text=True)
    body = ' '.join(body)
    body = convert_accents(body)
    body = clean_html(body)
    try:
        body = unicodedata.normalize('NFKD', body).encode('ascii', 'ignore')
    except Exception:
        err = 2
    try:
        title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore')
    except Exception:
        err = 3
    return (urls, body, title, desc, keywords, status, server, content_type,
            last_modified, err)
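A usage sketch for fetch_html, assuming clean_html and convert_accents are defined elsewhere in the module; the domain is a placeholder:

# Placeholder domain; fetch_html prepends http:// itself
(urls, body, title, desc, keywords, status, server,
 content_type, last_modified, err) = fetch_html("example.com")
if err == 0 and status == 200:
    print(title, len(urls), "outbound links")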
Example #8
      error_code = 2
      return (urls, body, title, desc, keywords, error_code, error_reason,
              content_type, last_modified, err)
  err = 0

  if "content-type" in info:
    content_type = str(info["content-type"])
  if "last-modified" in info:
    last_modified = str(info["last-modified"])
  soup = BeautifulSoup(data, "html.parser")
  try:
    title = cleanHTML(soup.html.head.title.string)
    title = convertAccents(title)
  except Exception:
    title = ''
  try:
    for meta in soup.head('meta'):
      ctxt = str(meta)
      pat = re.compile(r"meta[ ]*name[ ]*=[ ]*[\"]*key").findall(ctxt.lower())
      if pat:
        temp = re.compile(r"ontent[ ]*=[ ]*[\"]*").split(ctxt)
        if len(temp) > 1:
          keywords = temp[1]
          keywords = re.compile(r"[ ]*[\"]*[ ]*[/]*[>]").sub(' ', keywords)
          keywords = cleanHTML(keywords)
          keywords = convertAccents(keywords)
          keywords = keywords.strip()
      pat = re.compile(r"meta[ ]*name[ ]*=[ ]*[\"]*descrip").findall(ctxt.lower())
      if pat:
        temp = re.compile(r"ontent[ ]*=[ ]*[\"]*").split(ctxt)
        if len(temp) > 1:
          desc = temp[1]
Example #9
# Import part

import urllib.request
from bs4 import BeautifulSoup

# Page reading and converting

quote_page = 'https://yandex.by/'  # setting the page to read
page = urllib.request.urlopen(quote_page)  # opening the quoted page
data_raw = page.read()  # reading the page
soup_page = BeautifulSoup(data_raw, 'html.parser')  # create a format readable by BeautifulSoup

# Start working with BeautifulSoup
head = soup_page.head()
#tag_found = soup_page.find('h1', attrs={'class': 'name'})
#tag_found = soup_page.find_all('a')
#name = tag_found.strip()  # strip() removes leading and trailing whitespace
print(head)
Example #10
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 16 15:57:17 2019

@author: leona
"""

from bs4 import BeautifulSoup

# Collect the href of every <a> tag across three saved pages
soup = BeautifulSoup(open('C:/Users/leona/Desktop/ADMHMK-3/movies2.html'),
                     "html.parser")
lst_a = soup.select('a')
urls = []
for i in lst_a:
    urls.append(i.get('href'))
urls[0]  # inspect the first collected URL (a no-op outside a REPL)

soup = BeautifulSoup(open('C:/Users/leona/Desktop/ADMHMK-3/movies1.html'),
                     "html.parser")
lst_a = soup.select('a')
for i in lst_a:
    urls.append(i.get('href'))
urls[10000]  # spot-check an entry collected from the second page

soup = BeautifulSoup(open('C:/Users/leona/Desktop/ADMHMK-3/movies3.html'),
                     "html.parser")
lst_a = soup.select('a')
Example #11
from bs4 import BeautifulSoup

with open("./crawl/beautiful/story.html", "r") as f:
    response = f.read()

soup = BeautifulSoup(response, "html.parser")

print(soup.head())
print("**" * 10)
print(soup.title())
print("**" * 10)
print(soup.body())

print(soup.title.string)
print("**" * 10)
print(soup.title.parent)

print("**" * 10)
print("**" * 10)
print(soup.h1)

p1 = soup.p
print("p class name>>{}".format(p1['class']))

p2 = p1.find_next_sibling("p")
print("첫번쨰 p >> {}".format(p2))
print("p text >> {}".format(p2.string))
print("p gettext >>{}".format(p2.get_text()))
print("p class name >> {}".format(p2["class"]))

b = soup.b
print(b)
Example #12
__author__ = 'Martin'

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>

<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, "html.parser")

if __name__ == '__main__':
    print(soup.head())
    for key in soup.find_all('a'):
        print(key.get('class'), key.get("href"))
    print(soup.find_all('a'))
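The same anchors can also be selected with a CSS selector instead of find_all; a small follow-on sketch against the html_doc above:

# Equivalent lookup via a CSS selector: every <a> with class "sister"
for link in soup.select("a.sister"):
    print(link["id"], link["href"])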