def scrape_metadata_from_url(url: str) -> Optional[str]:
    """
    Fetch metadata for a given URL and build a link preview from it.

    :param str url: Link to third-party content, for which to create a link preview.

    :returns: Optional[str] - the rendered link preview, or None when the
        page yields no metadata or any fetch/parse error occurs (errors are
        logged, never propagated).
    """
    try:
        # Parse only the document head; demand an HTTP-OK, valid document.
        page = MetadataParser(
            url=url,
            url_headers=headers,
            search_head_only=True,
            only_parse_http_ok=True,
            raise_on_invalid=True,
        )
        parsed = page.parsed_result.metadata
        if parsed:
            return create_link_preview(page, parsed, url)
    except HTTPError as e:
        LOGGER.error(f"Failed to fetch metadata for URL `{url}`: {e}")
    except RequestException as e:
        LOGGER.error(
            f"RequestException error while scraping metadata for URL `{url}`: {e}"
        )
    except InvalidDocument as e:
        LOGGER.error(
            f"InvalidDocument encountered while fetching metadata for URL `{url}`: {e}"
        )
    except Exception as e:
        LOGGER.error(f"Unexpected error while scraping metadata for URL `{url}`: {e}")
def apply_srem(input_raster_file: str, metadata_file: str, output_raster_file: str) -> None:
    """
    Apply SREM surface-reflectance correction to a single band raster.

    :param input_raster_file: path to the TOA reflectance raster (band id is
        derived from the last three characters of the file stem).
    :param metadata_file: path to the scene metadata file.
    :param output_raster_file: destination GeoTIFF path (parent directories
        are created as needed).
    :returns: None - writes the corrected raster to output_raster_file.
    """
    # Fix: os.path.join() around a single os.path.dirname() call was a no-op.
    out_dir = os.path.dirname(output_raster_file)
    # Fix: guard against an empty dirname (output in the current directory);
    # os.makedirs('') raises FileNotFoundError even with exist_ok=True.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    m_parser = MetadataParser(metadata_file)
    band_id = BAND_ID[os.path.splitext(input_raster_file)[0][-3:]].value
    platform = m_parser.get_platform()
    wavelength = WAVELENGTHS[platform][band_id]
    angles = m_parser.get_mean_angles(band_id)

    with rasterio.open(input_raster_file) as src:
        toa_reflectance = src.read(1) / REFLECTANCE_SCALING_FACTOR
        profile = src.profile
    # Zero TOA reflectance marks nodata pixels; remember them for re-masking.
    nodata_mask = (toa_reflectance == 0)

    surface_reflectance = srem.srem(
        toa_reflectance=toa_reflectance,
        wavelength=wavelength,
        **angles)
    scaled_surface_reflectance = \
        surface_reflectance * REFLECTANCE_SCALING_FACTOR
    # crop values less than 1 for defining 1 as minimum value.
    scaled_surface_reflectance[scaled_surface_reflectance < 1] = 1
    scaled_surface_reflectance[nodata_mask] = 0

    profile.update(driver='GTiff', compress='deflate', nodata=0)
    with rasterio.open(output_raster_file, 'w', **profile) as dst:
        dst.write(
            scaled_surface_reflectance.astype(profile['dtype']), indexes=1)
def fetch_metadata(self) -> None:
    """Populate self.title and self.album_art from the song page's OpenGraph tags.

    No-op when metadata was already fetched. Downloads the page (with a
    curl User-Agent), records the final redirect URL as self.share_link,
    and parses the head-only metadata.
    """
    # Already populated - nothing to do.
    if self.album_art or self.title:
        return
    if not self.md:
        req = Request('GET', self.song_link, headers={'User-Agent': 'curl/7.54.0'})
        prepped = req.prepare()
        s = requests.Session()
        r = s.send(prepped)
        if r.status_code == 200:
            # r.url is the post-redirect URL - use it as the share link.
            self.share_link = r.url
            mp = MetadataParser(html=r.text, search_head_only=True)
            self.md = mp.metadata
    # Fix: on a non-200 response self.md remained None and the subscripts
    # below raised TypeError; bail out instead of crashing.
    if not self.md:
        return
    self.title = self.md['og']['title']
    image_link = self.md['og']['image']
    # Upgrade protocol-specified http image links to https.
    if image_link[0:5] == 'http:':
        image_link = 'https:' + image_link[5:]
    self.album_art = image_link
def test_get_mean_angles(S2A_metadata_file, angle_keys):
    """Every band's mean angles expose exactly the expected keys, as floats."""
    parser = MetadataParser(S2A_metadata_file)
    for band in BAND_ID:
        mean_angles = parser.get_mean_angles(band.value)
        assert set(mean_angles.keys()) == angle_keys
        assert all(isinstance(value, float) for value in mean_angles.values())
def fetch(self) -> None:
    """Download self.url and populate title, description and image metadata.

    On connection failure the page is flagged invalid and nothing is parsed.
    """
    try:
        response = requests.get(self.url, headers=self.headers)
    except ConnectionError:
        # NOTE(review): if this is the *builtin* ConnectionError it will not
        # catch requests.exceptions.ConnectionError - confirm the import.
        self.is_invalid = True
        return
    response.encoding = "utf-8"
    parsed = MetadataParser(html=response.text)
    self.title = parsed.get_metadata("title")
    self.description = parsed.get_metadata("description")
    self.image = parsed.get_metadata("image")
def fetch_metadata(html: str) -> Tuple[str, str, str]:
    """
    Extracting page metadata.

    :param html: HTML of the document
    :return: Tuple of title, keywords and descriptions
    """
    parser = MetadataParser(html=html, search_head_only=True)
    page_meta = parser.metadata['page']
    head_meta = parser.metadata['meta']
    title = page_meta.get('title', '')
    keywords = head_meta.get('keywords', '')
    description = head_meta.get('description', '')
    return (title, keywords, description)
def get_metadata_parser(cls, file_path: str) -> MetadataParser:
    """This method returns a metadata parser compatible with the file at
    the specified path.

    The choice is made from the file's first (header) line:
    a "METADATA:2.0" marker selects the v2 parser; everything else falls
    back to the legacy parser.

    :param file_path: path of the metadata file to inspect.
    :returns: a parser instance for the detected format.
    """
    # Fix: removed an unreachable `return None` that followed the
    # unconditional returns below (dead code).
    with open(file_path) as metadata_file:
        header_line = metadata_file.readline()
    if "METADATA:2.0" in header_line:
        # parse this file with MetadataV2 parser
        return MetadataParser(file_path)
    if ";" not in header_line:
        # parse as no version
        return MetadataParserLegacy(file_path)
    # parse as versions 1.0.1 -> 1.1.8
    return MetadataParserLegacy(file_path)
def url_matcher(self, msg, match):
    """Build a short preview snippet ("~> url / title / description") for a
    URL mentioned in a chat message.

    Returns None (no reply) when the document is too large or its
    content-type is not allowed by configuration.

    :param msg: the triggering message (unused here; part of the bot API).
    :param match: regex match whose group(0) is the URL.
    :returns: formatted preview string, or None to stay silent.
    """
    url = match.group(0)
    # HEAD first: cheap size/type screening before downloading the body.
    r = requests.head(url)
    max_size = self.config['DOC_MAX_SIZE']
    max_len = self.config['DOC_MAX_LEN']
    # files that are too big cause trouble. Let's just ignore them.
    if 'content-length' in r.headers and \
            int(r.headers['content-length']) > max_size:
        return
    # ignore anything that is not allowed in configuration
    allowed_content_types = self.config['ALLOWED_CONTENT_TYPES']
    content_type = ''
    if 'content-type' in r.headers:
        # Strip any ";charset=..." suffix from the content type.
        content_type = re.sub(r'\s*\;.*$', '', r.headers['content-type'])
        content_type = content_type.strip()
    if content_type not in allowed_content_types:
        return
    html = requests.get(url).text
    # Fix: parse the document once; the original built Document(html)
    # twice (for summary() and again for title()).
    doc = Document(html)
    readable_article = self.text_cleanup(doc.summary())
    if len(readable_article) > max_len:
        readable_article = readable_article[:max_len] + '...'
    readable_title = doc.title()
    page = MetadataParser(html=html)
    readable_description = page.get_metadata('description')
    if readable_description is None:
        readable_description = ''
    readable_description = self.text_cleanup(readable_description)
    # Prefer whichever text is longer: meta description vs extracted body.
    if len(readable_description) > len(readable_article):
        description = readable_description
    else:
        description = readable_article
    if description:
        return "~> {}\n~> {}\n~> {}".format(url, readable_title, description)
    else:
        return "~> {}\n~> {}".format(url, readable_title)
def fetch_president_articles():
    """Scrape page metadata for every candidate article that has no
    information record yet, and upsert the results.

    Returns a dict with counts of created and updated records.
    """
    from metadata_parser import MetadataParser
    created = 0
    updated = 0
    pending = PresidentCandidateArticle.objects.filter(
        information__isnull=True)
    for article in pending:
        page = MetadataParser(url=article.url)
        canonical = page.get_url_canonical()
        image_url = page.get_metadata_link('image')
        defaults = {
            'title': first_or_none(page.get_metadatas('title')),
            'description': first_or_none(page.get_metadatas('description')),
            'site': first_or_none(page.get_metadatas('site_name')),
            # Fall back to the stored URL when no canonical URL is declared.
            'url': canonical if canonical else article.url,
        }
        information_obj, is_created = PresidentCandidateArticleInformation.objects.update_or_create(
            article=article, defaults=defaults)
        save_image_from_url(information_obj.image, image_url)
        if is_created:
            created += 1
        else:
            updated += 1
    return {'created': created, 'updated': updated}
# Demo / debugging script for the metadata_parser library.
from metadata_parser import MetadataParser
import pdb
import pprint

# hey use lxml >= 2.3.5 ; use 3.x though!
# otherwise this site will break ! http://www.nasa.gov/externalflash/discovery/index.html

# The `if 0:` guard keeps this demo from executing; flip to 1 to run it.
if 0:
    # Fetch three news sites and print their titles.
    a = MetadataParser(url='http://cnn.com')
    print(a.get_metadata('title'))
    b = MetadataParser(url='http://nyt.com')
    print(b.get_metadata('title'))
    c = MetadataParser(url='http://thedailybeast.com')
    print(c.get_metadata('title'))
    print("\n-------------------------------------------------------\n")
    # Dump each site's full metadata dict.
    print(a.metadata)
    print("\n-------------------------------------------------------\n")
    print(b.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.metadata)
    print("\n-------------------------------------------------------\n")
    # Exercise the URL-resolution helpers on one of the parsed pages.
    print(c.get_metadata('title'))
    print(c.get_metadata('canonical'))
    print(c.get_metadata('url'))
    print(c.absolute_url(c.get_metadata('canonical')))
    print(c.absolute_url(c.get_metadata('url')))
    print(c.get_discrete_url())
def parsearticle(article, pathuuid):
    """Render an article through the render service, download its lead
    image, and assemble a content dict (text, summary, assets, source info).

    :param article: raw bytes of a JSON payload with at least the keys
        `link`, `publication`, `category` and `title`.
    :param pathuuid: directory created here to hold downloaded images.
    :returns: dict describing the article (title, text, summary, assets,
        publication, category, URL, rendered HTML).
    :raises ValueError: when the lead image cannot be fetched after
        repeated attempts.
    """
    # Fix: decode the payload once; the original re-ran
    # json.loads(article.decode('utf-8')) for every single field.
    payload = json.loads(article.decode('utf-8'))
    link = payload["link"]
    render_url = ("http://" + os.getenv("RENDER_HOST") + ":3000/render/"
                  + urllib.parse.quote_plus(link))

    mainimage = {}
    images = []
    req = requests.get(render_url)
    print(render_url)
    articletext = MetadataParser(html=json.loads(req.text)['html'])
    imgurl = str(articletext.get_metadata('image'))
    # Protocol-relative image URLs need an explicit scheme for urlretrieve.
    if not imgurl.startswith("http"):
        imgurl = 'http:' + imgurl
    imgurlnopost = imgurl.rsplit('?', 1)[0]
    imgname = imgurlnopost.rsplit('/', 1)[-1]
    imgpath = pathuuid + '/' + imgname + str(uuid.uuid4())
    publication = payload["publication"]
    category = payload["category"]
    title = payload["title"]
    articleurl = link
    geturl = None
    os.mkdir(pathuuid)
    count = 0
    try:
        geturl = urllib.request.urlretrieve(imgurl, imgpath)
    except Exception:
        pass
    while not geturl:
        # Fix: count every retry attempt. The original incremented `count`
        # only after a *successful* urlretrieve, so a permanently broken
        # image URL looped forever instead of raising after 10 failures.
        count += 1
        if count > 10:
            raise ValueError('Article failed too many times')
        # Re-render the page and re-resolve the image URL before retrying.
        req = requests.get(render_url)
        articletext = MetadataParser(html=json.loads(req.text)['html'])
        imgurl = str(articletext.get_metadata('image'))
        imgurlnopost = imgurl.rsplit('?', 1)[0]
        imgname = imgurlnopost.rsplit('/', 1)[-1]
        try:
            geturl = urllib.request.urlretrieve(imgurl, imgpath)
        except Exception:
            pass
    mainimage['imgurl'] = imgurl
    mainimage['imgname'] = imgname
    mainimage['imgpath'] = imgpath
    mainimage['content_type'] = geturl[1]['Content-Type']
    images.append(mainimage)
    rendered = json.loads(req.text)
    images1 = getimages(
        rendered['html'],
        rendered['tree']['frameTree']['resources'],
        images, pathuuid)
    try:
        articletext = fulltext(rendered['html'], language='en')
    except Exception:
        # Best effort: fall back to empty text when extraction fails.
        articletext = ""
    thing = {}
    thing['title'] = title
    thing['articletext'] = articletext
    thing['summary'] = summarize(articletext)
    thing['assets'] = images1
    thing['publication'] = publication
    thing['category'] = category
    thing['articleurl'] = articleurl
    thing['html'] = rendered['html']
    return thing
# Comparison demo of three metadata-extraction libraries on the same page.
# NOTE: Python 2 syntax (`print` statements) - will not run under Python 3.
from metadata_parser import MetadataParser
from opengraph import OpenGraph
import webpreview

url = 'https://health.usnews.com/wellness/health-buzz/articles/2018-01-05/smelling-your-partners-shirt-could-decrease-your-stress-levels-study-says'

# metadata_parser: fetch the page, dump the full metadata dict and title.
page = MetadataParser(url=url)
print page.metadata
print page.get_metadata('title')

# opengraph: parse the page's OpenGraph tags.
og = OpenGraph(url=url)
print og

# webpreview: request specific og: properties.
wb = webpreview.OpenGraph(url, ['og:title', 'og:description'])
print wb.title
print wb.description
def social_card_image(page_url):
    """Return the OpenGraph image link for *page_url*'s social card."""
    page = MetadataParser(url=page_url, search_head_only=True)
    return page.get_metadata_link('image', strategy=['og'])
def __init__(self, html):
    """Parse metadata out of *html* and store a flattened copy.

    :param html: raw HTML document to extract metadata from.
    """
    self.meta = MetadataParser(html=html).metadata
    try:
        # Fix: json.load expects a file object; json.loads parses a string.
        # The original json.load(json.dumps(...)) always raised, so the
        # flattening branch never actually ran.
        self.data = json.loads(json.dumps(FlatterDict(self.meta)))
    except Exception:
        # Best effort: fall back to the unflattened metadata dict.
        self.data = self.meta
def _test_get_platform(metadata_file, expected_platform):
    """The parser reports the platform encoded in the metadata file."""
    parser = MetadataParser(metadata_file)
    assert parser.get_platform() == expected_platform