def fetch(self) -> None:
    try:
        r = requests.get(self.url, headers=self.headers)
    except requests.exceptions.ConnectionError:
        # mark the record invalid instead of letting the error propagate
        self.is_invalid = True
        return
    r.encoding = "utf-8"
    page = MetadataParser(html=r.text)
    self.title = page.get_metadata("title")
    self.description = page.get_metadata("description")
    self.image = page.get_metadata("image")
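# A slightly more defensive variant of the fetch() above; this is only a sketch,
# assuming the same self.url / self.headers / self.is_invalid attributes and the
# requests + metadata_parser imports. It bounds the request with a timeout and
# catches every requests error, not just connection failures.
def fetch(self) -> None:
    try:
        r = requests.get(self.url, headers=self.headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # covers ConnectionError, Timeout, HTTPError, ...
        self.is_invalid = True
        return
    r.encoding = "utf-8"
    page = MetadataParser(html=r.text)
    self.title = page.get_metadata("title")
    self.description = page.get_metadata("description")
    self.image = page.get_metadata("image")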
def get_or_create_ressource(self, url):
    try:
        ressource = self.get(url=url)
    except Ressource.DoesNotExist:
        ressource = Ressource(url=url)
        # preferred metadata lookup order: OpenGraph, Dublin Core, page, then <meta>
        md_strategy = ['og', 'dc', 'page', 'meta']
        md = MetadataParser(url=url, strategy=md_strategy)
        ressource.title = md.get_metadata('title')
        ressource.excerpt = md.get_metadata('description')
        ressource.image = md.get_metadata('image')
        ressource.save()
    return ressource
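# For context, the helper above reads like a Django custom-manager method on a
# Ressource model (Ressource.DoesNotExist and self.get suggest as much). A minimal
# sketch of the surrounding model; the field types are assumptions, not taken from
# the original.
from django.db import models

class Ressource(models.Model):
    url = models.URLField(unique=True)
    title = models.TextField(blank=True)
    excerpt = models.TextField(blank=True)
    image = models.URLField(blank=True)

# usage, assuming get_or_create_ressource() lives on a custom manager:
# ressource = Ressource.objects.get_or_create_ressource('https://example.com/article')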
def url_matcher(self, msg, match):
    url = match.group(0)
    r = requests.head(url)
    max_size = self.config['DOC_MAX_SIZE']
    max_len = self.config['DOC_MAX_LEN']
    # files that are too big cause trouble. Let's just ignore them.
    if 'content-length' in r.headers and \
            int(r.headers['content-length']) > max_size:
        return
    # ignore anything that is not allowed in configuration
    allowed_content_types = self.config['ALLOWED_CONTENT_TYPES']
    content_type = ''
    if 'content-type' in r.headers:
        content_type = re.sub(r'\s*;.*$', '', r.headers['content-type'])
        content_type = content_type.strip()
    if content_type not in allowed_content_types:
        return
    html = requests.get(url).text
    readable_article = Document(html).summary()
    readable_article = self.text_cleanup(readable_article)
    if len(readable_article) > max_len:
        readable_article = readable_article[:max_len] + '...'
    readable_title = Document(html).title()
    page = MetadataParser(html=html)
    readable_description = page.get_metadata('description')
    if readable_description is None:
        readable_description = ''
    readable_description = self.text_cleanup(readable_description)
    if len(readable_description) > len(readable_article):
        description = readable_description
    else:
        description = readable_article
    if description:
        return "~> {}\n~> {}\n~> {}".format(url, readable_title, description)
    return "~> {}\n~> {}".format(url, readable_title)
from metadata_parser import MetadataParser
import pdb
import pprint

# requires lxml >= 2.3.5 (preferably a 3.x release);
# older versions break on this page: http://www.nasa.gov/externalflash/discovery/index.html

if 0:
    a = MetadataParser(url='http://cnn.com')
    print(a.get_metadata('title'))
    b = MetadataParser(url='http://nyt.com')
    print(b.get_metadata('title'))
    c = MetadataParser(url='http://thedailybeast.com')
    print(c.get_metadata('title'))
    print("\n-------------------------------------------------------\n")
    print(a.metadata)
    print("\n-------------------------------------------------------\n")
    print(b.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.get_metadata('title'))
    print(c.get_metadata('canonical'))
    print(c.get_metadata('url'))
    print(c.absolute_url(c.get_metadata('canonical')))
    print(c.absolute_url(c.get_metadata('url')))
    print(c.get_discrete_url())
def parsearticle(article, pathuuid):
    mainimage = {}
    images = []
    # parse the incoming article message once instead of re-decoding it repeatedly
    data = json.loads(article.decode('utf-8'))
    render_url = ("http://" + os.getenv("RENDER_HOST") + ":3000/render/"
                  + urllib.parse.quote_plus(data["link"]))
    req = requests.get(render_url)
    print(render_url)
    rendered = json.loads(req.text)
    articletext = MetadataParser(html=rendered['html'])
    imgurl = str(articletext.get_metadata('image'))
    if not imgurl.startswith("http"):
        imgurl = 'http:' + imgurl
    imgurlnopost = imgurl.rsplit('?', 1)[0]
    imgname = imgurlnopost.rsplit('/', 1)[-1]
    imgpath = pathuuid + '/' + imgname + str(uuid.uuid4())
    publication = data["publication"]
    category = data["category"]
    title = data["title"]
    articleurl = data["link"]
    geturl = None
    os.mkdir(pathuuid)
    count = 0
    try:
        geturl = urllib.request.urlretrieve(imgurl, imgpath)
    except Exception:
        pass
    # retry: re-render the page and re-resolve the lead image until the download succeeds
    while not geturl:
        count += 1
        req = requests.get(render_url)
        rendered = json.loads(req.text)
        articletext = MetadataParser(html=rendered['html'])
        imgurl = str(articletext.get_metadata('image'))
        imgurlnopost = imgurl.rsplit('?', 1)[0]
        imgname = imgurlnopost.rsplit('/', 1)[-1]
        try:
            geturl = urllib.request.urlretrieve(imgurl, imgpath)
        except Exception:
            if count > 10:
                raise ValueError('Article failed too many times')
    mainimage['imgurl'] = imgurl
    mainimage['imgname'] = imgname
    mainimage['imgpath'] = imgpath
    mainimage['content_type'] = geturl[1]['Content-Type']
    images.append(mainimage)
    images1 = getimages(rendered['html'],
                        rendered['tree']['frameTree']['resources'],
                        images, pathuuid)
    try:
        articletext = fulltext(rendered['html'], language='en')
    except Exception:
        articletext = ""
    thing = {
        'title': title,
        'articletext': articletext,
        'summary': summarize(articletext),
        'assets': images1,
        'publication': publication,
        'category': category,
        'articleurl': articleurl,
        'html': rendered['html'],
    }
    return thing
from metadata_parser import MetadataParser

if 0:
    a = MetadataParser(url='http://cnn.com')
    print(a.get_metadata('title'))
    b = MetadataParser(url='http://nyt.com')
    print(b.get_metadata('title'))
    c = MetadataParser(url='http://thedailybeast.com')
    print(c.get_metadata('title'))
    print("\n-------------------------------------------------------\n")
    print(a.metadata)
    print("\n-------------------------------------------------------\n")
    print(b.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.get_metadata('title'))
    print(c.get_metadata('canonical'))
    print(c.get_metadata('url'))
    print(c.absolute_url(c.get_metadata('canonical')))
    print(c.absolute_url(c.get_metadata('url')))
    print(c.get_discrete_url())

if 0:
    a = MetadataParser(url='http://liqr.co/rsvpnewyork')
    print("title:")
    print(a.get_metadata('title'))
"group by url order by count(*) desc;" cur.execute(sql) urls = cur.fetchall() i = 0 for url in urls: i = i + 1 url = remove_characters(url[0]) try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', 'From': '*****@*****.**' # This is another valid field } page = MetadataParser(url=url, requests_timeout=5, url_headers=headers) title = remove_characters(page.get_metadata('title')) url_resolved = remove_characters(page.get_metadata('url')) image = remove_characters(page.get_metadata('image')) description = remove_characters(page.get_metadata('description')) sql = "insert into url_meta (title, description, url, url_md5, image) " \ "values ('" + title + "', '" + description + "', '" + url_resolved + "', md5('" + url + "'), '" + image + "');" except Exception as e: e = remove_characters(str(e)) sql = "insert into url_meta (title, description, url, url_md5, image) " \ "values ('error', '" + e + "', '" + url + "', md5('" + url + "'), '');" finally: cur.execute(sql) cur.execute("commit;") if i % 100 == 0: print i
from metadata_parser import MetadataParser
from opengraph import OpenGraph
import webpreview

url = 'https://health.usnews.com/wellness/health-buzz/articles/2018-01-05/smelling-your-partners-shirt-could-decrease-your-stress-levels-study-says'

page = MetadataParser(url=url)
print(page.metadata)
print(page.get_metadata('title'))

og = OpenGraph(url=url)
print(og)

wb = webpreview.OpenGraph(url, ['og:title', 'og:description'])
print(wb.title)
print(wb.description)