Example #1
def scrape_metadata_from_url(url: str) -> Optional[str]:
    """
    Fetch metadata for a given URL.

    :param str url: Link to third-party content, for which to create a link preview.

    :returns: Optional[str]
    """
    try:
        # Parse page metadata as dict
        page = MetadataParser(
            url=url,
            url_headers=headers,
            search_head_only=True,
            only_parse_http_ok=True,
            raise_on_invalid=True,
        )
        page_meta = page.parsed_result.metadata
        if page_meta:
            return create_link_preview(page, page_meta, url)
    except HTTPError as e:
        LOGGER.error(f"Failed to fetch metadata for URL `{url}`: {e}")
    except RequestException as e:
        LOGGER.error(
            f"RequestException error while scraping metadata for URL `{url}`: {e}"
        )
    except InvalidDocument as e:
        LOGGER.error(
            f"InvalidDocument encountered while fetching metadata for URL `{url}`: {e}"
        )
    except Exception as e:
        LOGGER.error(
            f"Unexpected error while scraping metadata for URL `{url}`: {e}"
        )
Example #2
def apply_srem(input_raster_file: str, metadata_file: str,
               output_raster_file: str) -> None:
    out_dir = os.path.dirname(output_raster_file)
    os.makedirs(out_dir, exist_ok=True)
    m_parser = MetadataParser(metadata_file)
    band_id = BAND_ID[os.path.splitext(input_raster_file)[0][-3:]].value
    platform = m_parser.get_platform()
    wavelength = WAVELENGTHS[platform][band_id]
    angles = m_parser.get_mean_angles(band_id)

    with rasterio.open(input_raster_file) as src:
        toa_reflectance = src.read(1) / REFLECTANCE_SCALING_FACTOR
        profile = src.profile
        nodata_mask = (toa_reflectance == 0)

    surface_reflectance = srem.srem(toa_reflectance=toa_reflectance,
                                    wavelength=wavelength,
                                    **angles)
    scaled_surface_reflectance = \
        surface_reflectance * REFLECTANCE_SCALING_FACTOR
    # Clamp values below 1 so that 1 becomes the minimum valid value.
    scaled_surface_reflectance[scaled_surface_reflectance < 1] = 1
    scaled_surface_reflectance[nodata_mask] = 0

    profile.update(driver='GTiff', compress='deflate', nodata=0)
    with rasterio.open(output_raster_file, 'w', **profile) as dst:
        dst.write(scaled_surface_reflectance.astype(profile['dtype']),
                  indexes=1)
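apply_srem also depends on module-level constants that the snippet omits. A sketch of plausible definitions, assuming Sentinel-2 L1C conventions (the 10000 scaling factor and the centre wavelengths match Sentinel-2A documentation, but the exact enum layout is a guess):

from enum import Enum

REFLECTANCE_SCALING_FACTOR = 10000  # Sentinel-2 L1C quantification value

class BAND_ID(Enum):
    # Maps the trailing band token of a filename (e.g. "B04") to a band id.
    # Illustrative subset only.
    B02 = '2'
    B03 = '3'
    B04 = '4'

WAVELENGTHS = {
    # Centre wavelengths in micrometres, per platform and band id.
    'Sentinel-2A': {'2': 0.4924, '3': 0.5598, '4': 0.6646},
}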
Example #3
    def fetch_metadata(self) -> None:

        if self.album_art or self.title:
            return

        if not self.md:
            req = Request('GET',
                          self.song_link,
                          headers={'User-Agent': 'curl/7.54.0'})
            prepped = req.prepare()
            s = requests.Session()
            r = s.send(prepped)

            if r.status_code == 200:
                self.share_link = r.url

                mp = MetadataParser(html=r.text, search_head_only=True)
                self.md = mp.metadata
                self.title = self.md['og']['title']
                image_link = self.md['og']['image']

                if image_link.startswith('http:'):
                    image_link = 'https:' + image_link[5:]

                self.album_art = image_link
Example #4
def test_get_mean_angles(S2A_metadata_file, angle_keys):
    m_parser = MetadataParser(S2A_metadata_file)
    for band_id in BAND_ID:
        angles = m_parser.get_mean_angles(band_id.value)
        assert set(angles.keys()) == angle_keys
        for angle in angles.values():
            assert isinstance(angle, float)
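The test assumes two pytest fixtures it does not show. A hypothetical conftest sketch; the file path is made up, and the angle names are assumed to match the keyword arguments srem.srem() expects:

import pytest

@pytest.fixture
def S2A_metadata_file():
    # Sample Sentinel-2A tile metadata checked into the test data (assumed path).
    return 'tests/data/S2A_MTD_TL.xml'

@pytest.fixture
def angle_keys():
    # Assumed keyword names for the mean solar/viewing geometry.
    return {'solar_zenith_angle', 'solar_azimuth_angle',
            'view_zenith_angle', 'view_azimuth_angle'}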
Example #5
    def fetch(self) -> None:
        try:
            r = requests.get(self.url, headers=self.headers)
        except requests.exceptions.ConnectionError:
            # Catching the builtin ConnectionError would miss requests' own
            # connection failures, which do not subclass it.
            self.is_invalid = True
            return

        r.encoding = "utf-8"
        page = MetadataParser(html=r.text)

        self.title = page.get_metadata("title")
        self.description = page.get_metadata("description")
        self.image = page.get_metadata("image")
Example #6
def fetch_metadata(html: str) -> Tuple[str, str, str]:
    """
    Extracting page metadata.
    :param html: HTML of the document
    :return: Tuple of title, keywords and descriptions
    """
    parser = MetadataParser(html=html, search_head_only=True)

    return (
        parser.metadata['page'].get('title', ''),
        parser.metadata['meta'].get('keywords', ''),
        parser.metadata['meta'].get('description', '')
    )
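A quick usage sketch with made-up HTML; the expected outputs assume single title/keywords/description tags, which metadata_parser stores under the 'page' and 'meta' groups:

html = """
<html><head>
  <title>Example page</title>
  <meta name="keywords" content="metadata,parsing">
  <meta name="description" content="A tiny demo document.">
</head><body></body></html>
"""
title, keywords, description = fetch_metadata(html)
print(title)        # expected: Example page
print(keywords)     # expected: metadata,parsing
print(description)  # expected: A tiny demo document.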
Example #7
    def get_metadata_parser(cls, file_path: str) -> MetadataParser:
        """This method returns a metadata parser compatible with the file at the specified path"""
        with open(file_path) as metadata_file:
            header_line = metadata_file.readline()
            if "METADATA:2.0" in header_line:
                # parse this file with MetadataV2 parser
                return MetadataParser(file_path)
            if ";" not in header_line:
                # parse as no version
                return MetadataParserLegacy(file_path)

            # parse as versions 1.0.1 -> 1.1.8
            return MetadataParserLegacy(file_path)
Example #8
    def url_matcher(self, msg, match):
        url = match.group(0)
        r = requests.head(url)
        max_size = self.config['DOC_MAX_SIZE']
        max_len = self.config['DOC_MAX_LEN']

        # files that are too big cause trouble. Let's just ignore them.
        if 'content-length' in r.headers and \
           int(r.headers['content-length']) > max_size:
            return

        # ignore anything that is not allowed in configuration
        allowed_content_types = self.config['ALLOWED_CONTENT_TYPES']
        content_type = ''
        if 'content-type' in r.headers:
            # Strip parameters, e.g. "text/html; charset=utf-8" -> "text/html".
            content_type = r.headers['content-type'].split(';')[0].strip()

        if content_type not in allowed_content_types:
            return

        html = requests.get(url).text
        readable_article = Document(html).summary()
        readable_article = self.text_cleanup(readable_article)

        if len(readable_article) > max_len:
            readable_article = readable_article[:max_len] + '...'

        readable_title = Document(html).title()

        page = MetadataParser(html=html)
        readable_description = page.get_metadata('description')

        if readable_description is None:
            readable_description = ''

        readable_description = self.text_cleanup(readable_description)

        # Prefer whichever text block is longer.
        if len(readable_description) > len(readable_article):
            description = readable_description
        else:
            description = readable_article

        if description:
            return "~> {}\n~> {}\n~> {}".format(url, readable_title,
                                                description)
        else:
            return "~> {}\n~> {}".format(url, readable_title)
Example #9
def fetch_president_articles():
    from metadata_parser import MetadataParser

    created = 0
    updated = 0

    articles_to_fetch = PresidentCandidateArticle.objects.filter(
        information__isnull=True)

    for article in articles_to_fetch:
        page = MetadataParser(url=article.url)

        title = first_or_none(page.get_metadatas('title'))
        description = first_or_none(page.get_metadatas('description'))
        site = first_or_none(page.get_metadatas('site_name'))
        url = page.get_url_canonical()
        url = url if url else article.url
        image_url = page.get_metadata_link('image')

        information_obj, is_created = PresidentCandidateArticleInformation.objects.update_or_create(
            article=article,
            defaults={
                'title': title,
                'description': description,
                'site': site,
                'url': url
            })

        save_image_from_url(information_obj.image, image_url)

        if is_created:
            created += 1
        else:
            updated += 1

    return {'created': created, 'updated': updated}
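first_or_none is not defined in the snippet; a one-line sketch of the helper it implies (get_metadatas returns a list of matches, or None when nothing was found):

def first_or_none(items):
    # First element of a possibly-empty (or None) sequence.
    return items[0] if items else None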
Example #10
from metadata_parser import MetadataParser

# Requires lxml >= 2.3.5 (3.x recommended); otherwise this site breaks:
# http://www.nasa.gov/externalflash/discovery/index.html

if 0:
    a = MetadataParser(url='http://cnn.com')
    print(a.get_metadata('title'))

    b = MetadataParser(url='http://nyt.com')
    print(b.get_metadata('title'))

    c = MetadataParser(url='http://thedailybeast.com')
    print(c.get_metadata('title'))

    print("\n-------------------------------------------------------\n")
    print(a.metadata)
    print("\n-------------------------------------------------------\n")
    print(b.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.metadata)
    print("\n-------------------------------------------------------\n")

    print(c.get_metadata('title'))
    print(c.get_metadata('canonical'))
    print(c.get_metadata('url'))
    print(c.absolute_url(c.get_metadata('canonical')))
    print(c.absolute_url(c.get_metadata('url')))
    print(c.get_discrete_url())
Example #11
def parsearticle(article, pathuuid):
    mainimage = {}
    images = []
    req = requests.get(
        "http://" + os.getenv("RENDER_HOST") + ":3000/render/" +
        urllib.parse.quote_plus(json.loads(article.decode('utf-8'))["link"]))
    print("http://" + os.getenv("RENDER_HOST") + ":3000/render/" +
          urllib.parse.quote_plus(json.loads(article.decode('utf-8'))["link"]))
    articletext = MetadataParser(html=json.loads(req.text)['html'])
    imgurl = str(articletext.get_metadata('image'))
    if not imgurl.startswith("http"):
        imgurl = 'http:' + imgurl
    imgurlnopost = imgurl.rsplit('?', 1)[0]
    imgname = imgurlnopost.rsplit('/', 1)[-1]
    imgpath = pathuuid + '/' + imgname + str(uuid.uuid4())
    publication = json.loads(article.decode('utf-8'))["publication"]
    category = json.loads(article.decode('utf-8'))["category"]
    title = json.loads(article.decode('utf-8'))["title"]
    articleurl = json.loads(article.decode('utf-8'))["link"]
    geturl = None
    os.mkdir(pathuuid)
    count = 0
    try:
        geturl = urllib.request.urlretrieve(imgurl, imgpath)
    except Exception:
        # Fall through to the retry loop below.
        pass
    while not geturl:
        count += 1
        if count > 10:
            raise ValueError('Article failed too many times')
        req = requests.get("http://" + os.getenv("RENDER_HOST") +
                           ":3000/render/" + urllib.parse.quote_plus(
                               json.loads(article.decode('utf-8'))["link"]))
        articletext = MetadataParser(html=json.loads(req.text)['html'])
        imgurl = str(articletext.get_metadata('image'))
        imgurlnopost = imgurl.rsplit('?', 1)[0]
        imgname = imgurlnopost.rsplit('/', 1)[-1]
        try:
            geturl = urllib.request.urlretrieve(imgurl, imgpath)
        except Exception:
            # Re-render and retry; the attempt counter above bounds the loop.
            continue
    mainimage['imgurl'] = imgurl
    mainimage['imgname'] = imgname
    mainimage['imgpath'] = imgpath
    mainimage['content_type'] = geturl[1]['Content-Type']
    images.append(mainimage)
    images1 = getimages(
        json.loads(req.text)['html'],
        json.loads(req.text)['tree']['frameTree']['resources'], images,
        pathuuid)
    try:
        articletext = fulltext(json.loads(req.text)['html'], language='en')
    except Exception:
        articletext = ""
    thing = {}
    thing['title'] = json.loads(article.decode('utf-8'))["title"]
    thing['articletext'] = articletext
    thing['summary'] = summarize(articletext)
    thing['assets'] = images1
    thing['publication'] = publication
    thing['category'] = category
    thing['articleurl'] = articleurl
    thing['html'] = json.loads(req.text)['html']

    return thing
Example #12
from metadata_parser import MetadataParser
from opengraph import OpenGraph
import webpreview

url = 'https://health.usnews.com/wellness/health-buzz/articles/2018-01-05/smelling-your-partners-shirt-could-decrease-your-stress-levels-study-says'
page = MetadataParser(url=url)
print(page.metadata)
print(page.get_metadata('title'))

og = OpenGraph(url=url)
print(og)

wb = webpreview.OpenGraph(url, ['og:title', 'og:description'])
print(wb.title)
print(wb.description)
Example #13
def social_card_image(page_url):
    parser = MetadataParser(url=page_url, search_head_only=True)

    link = parser.get_metadata_link('image', strategy=['og'])

    return link
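A usage sketch with a hypothetical URL; strategy=['og'] restricts the lookup to OpenGraph tags, and get_metadata_link resolves the value to an absolute URL (or returns None):

card = social_card_image('https://example.com/post')
if card:
    print(card)  # the page's og:image URL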
Example #14
    def __init__(self, html):
        self.meta = MetadataParser(html=html).metadata
        try:
            # Flatten the nested metadata via a JSON round trip; dict() makes
            # the FlatterDict mapping serializable for json.dumps.
            self.data = json.loads(json.dumps(dict(FlatterDict(self.meta))))
        except (TypeError, ValueError):
            self.data = self.meta
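For reference, FlatterDict (assumed to come from the flatdict package) collapses nesting into colon-delimited keys, which is what makes the JSON round trip above produce a flat dict:

from flatdict import FlatterDict

meta = {'og': {'title': 'Example', 'image': 'https://example.com/a.png'}}
print(dict(FlatterDict(meta)))
# {'og:title': 'Example', 'og:image': 'https://example.com/a.png'}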
Example #15
def _test_get_platform(metadata_file, expected_platform):
    m_parser = MetadataParser(metadata_file)
    platform = m_parser.get_platform()
    assert platform == expected_platform
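The leading underscore keeps this helper out of pytest's collection; a hypothetical parametrized wrapper that would drive it (sample file paths and platform strings are assumptions):

import pytest

@pytest.mark.parametrize('metadata_file, expected_platform', [
    ('tests/data/S2A_MTD_TL.xml', 'Sentinel-2A'),
    ('tests/data/S2B_MTD_TL.xml', 'Sentinel-2B'),
])
def test_get_platform(metadata_file, expected_platform):
    _test_get_platform(metadata_file, expected_platform)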