def apply_srem(input_raster_file: str, metadata_file: str, output_raster_file: str) -> None:
    """Apply the SREM surface-reflectance correction to one raster band.

    Reads TOA reflectance from ``input_raster_file``, looks up the band's
    wavelength and mean acquisition angles from the sensor metadata, runs
    ``srem.srem`` and writes the scaled surface reflectance to
    ``output_raster_file`` as a deflate-compressed GeoTIFF.

    :param input_raster_file: single-band raster whose filename ends in the
        band identifier (last 3 chars of the stem, e.g. ``B04``).
    :param metadata_file: sensor metadata file understood by MetadataParser.
    :param output_raster_file: destination path; parent dirs are created.
    """
    out_dir = os.path.join(os.path.dirname(output_raster_file))
    os.makedirs(out_dir, exist_ok=True)
    m_parser = MetadataParser(metadata_file)
    # Band id is encoded in the last three characters of the file stem.
    band_id = BAND_ID[os.path.splitext(input_raster_file)[0][-3:]].value
    platform = m_parser.get_platform()
    wavelength = WAVELENGTHS[platform][band_id]
    angles = m_parser.get_mean_angles(band_id)
    with rasterio.open(input_raster_file) as src:
        # Undo the integer scaling applied to the stored reflectance.
        toa_reflectance = src.read(1) / REFLECTANCE_SCALING_FACTOR
        profile = src.profile
    # Zero TOA reflectance marks nodata pixels — remembered so they can be
    # re-zeroed after the correction (which would otherwise map them to 1).
    nodata_mask = (toa_reflectance == 0)
    surface_reflectance = srem.srem(
        toa_reflectance=toa_reflectance, wavelength=wavelength, **angles)
    scaled_surface_reflectance = \
        surface_reflectance * REFLECTANCE_SCALING_FACTOR
    # crop values less than 1 for defining 1 as minimum value.
    scaled_surface_reflectance[scaled_surface_reflectance < 1] = 1
    scaled_surface_reflectance[nodata_mask] = 0
    profile.update(driver='GTiff', compress='deflate', nodata=0)
    with rasterio.open(output_raster_file, 'w', **profile) as dst:
        dst.write(scaled_surface_reflectance.astype(profile['dtype']), indexes=1)
def test_get_mean_angles(S2A_metadata_file, angle_keys):
    """Every band reports mean angles under the expected keys, as floats."""
    parser = MetadataParser(S2A_metadata_file)
    for band in BAND_ID:
        mean_angles = parser.get_mean_angles(band.value)
        # Exactly the expected angle names — no extras, none missing.
        assert set(mean_angles) == angle_keys
        # Each angle value is a plain float.
        assert all(isinstance(value, float) for value in mean_angles.values())
def fetch(self) -> None:
    """Download ``self.url`` and populate title/description/image metadata.

    On any request failure the object is flagged invalid
    (``self.is_invalid = True``) and the method returns early.
    """
    try:
        # NOTE: requests' ConnectionError is NOT a subclass of the builtin
        # ConnectionError, so the original `except ConnectionError` never
        # caught it.  Catch the library's base exception instead, which also
        # covers timeouts, DNS failures and invalid URLs.
        r = requests.get(self.url, headers=self.headers, timeout=10)
    except requests.RequestException:
        self.is_invalid = True
        return
    r.encoding = "utf-8"
    page = MetadataParser(html=r.text)
    self.title = page.get_metadata("title")
    self.description = page.get_metadata("description")
    self.image = page.get_metadata("image")
def url_matcher(self, msg, match):
    """Build a '~>'-prefixed preview (url, title, best description) for a URL
    found in a chat message, or return None when the target should be skipped.

    :param msg: the triggering chat message (unused here beyond the match).
    :param match: regex match whose group(0) is the URL.
    :returns: formatted preview string, or None for oversized/disallowed
        content types.
    """
    url = match.group(0)
    # HEAD first so size and content type can be checked without a download.
    r = requests.head(url)
    max_size = self.config['DOC_MAX_SIZE']
    max_len = self.config['DOC_MAX_LEN']
    # files that are too big cause trouble. Let's just ignore them.
    if 'content-length' in r.headers and \
            int(r.headers['content-length']) > max_size:
        return
    # ignore anything that is not allowed in configuration
    allowed_content_types = self.config['ALLOWED_CONTENT_TYPES']
    content_type = ''
    if 'content-type' in r.headers:
        # Drop media-type parameters (e.g. '; charset=utf-8') before matching.
        content_type = re.sub(r'\s*\;.*$', '', r.headers['content-type'])
        content_type = content_type.strip()
    if content_type not in allowed_content_types:
        return
    # NOTE(review): the body is fetched with a second request here; the HEAD
    # above was only a pre-flight check.
    html = requests.get(url).text
    readable_article = Document(html).summary()
    readable_article = self.text_cleanup(readable_article)
    if len(readable_article) > max_len:
        readable_article = readable_article[:max_len] + '...'
    readable_title = Document(html).title()
    page = MetadataParser(html=html)
    readable_description = page.get_metadata('description')
    if readable_description is None:
        readable_description = ''
    readable_description = self.text_cleanup(readable_description)
    # Prefer whichever text is longer: meta description vs extracted article.
    description = ''
    if len(readable_description) > len(readable_article):
        description = readable_description
    else:
        description = readable_article
    if description:
        return "~> {}\n~> {}\n~> {}".format(url, readable_title, description)
    else:
        return "~> {}\n~> {}".format(url, readable_title)
def __init__(self, fpga_hostname = 'localhost', rffe_hostname = 'localhost', debug = False):
    """Remember the FPGA/RFFE host names and build a metadata parser.

    :param fpga_hostname: host running the FPGA control server.
    :param rffe_hostname: host running the RF front-end control server.
    :param debug: when True, downstream code prints commands instead of running them.
    """
    # Import lazily so the module loads even without metadata_parser installed.
    from metadata_parser import MetadataParser

    self.fpga_hostname = fpga_hostname
    self.rffe_hostname = rffe_hostname
    self.debug = debug
    self.metadata_parser = MetadataParser()
def create_link_preview(page: MetadataParser, page_meta: dict, url: str) -> Optional[str]:
    """
    Create a preview bookmark card from a URL.

    :param MetadataParser page: Page object create from URL to be parsed.
    :param dict page_meta: Page metadata parsed from the head of the target URL.
    :param str url: URL of the linked third-party post/article.

    :returns: Optional[str]
    """
    try:
        title, description, page_type = parse_scraped_metadata(page_meta)
        image = page.get_metadata_link(
            "image", allow_encoded_uri=True, require_public_global=True
        )
        # A card needs both a title and a description; otherwise return None.
        if title is None or description is None:
            return None
        pieces = [f"\n\n<b>{title}</b>\n{description}\n{url}"]
        if page_type:
            pieces.append(f"\n{page_type.title()}")
        if image:
            pieces.append(f"\n{image}")
        return "".join(pieces)
    except Exception as e:
        LOGGER.error(
            f"Unexpected error while generating link preview card: {e}")
def scrape_metadata_from_url(url: str) -> Optional[str]:
    """
    Fetch metadata for a given URL.

    :param str url: Link to third-party content, for which to create a link preview.

    :returns: Optional[str]
    """
    try:
        # Parse page metadata as dict
        page = MetadataParser(
            url=url,
            url_headers=headers,
            search_head_only=True,
            only_parse_http_ok=True,
            raise_on_invalid=True,
        )
        page_meta = page.parsed_result.metadata
        # Nothing parsed — no preview can be built.
        if not page_meta:
            return None
        return create_link_preview(page, page_meta, url)
    except HTTPError as e:
        LOGGER.error(f"Failed to fetch metadata for URL `{url}`: {e}")
    except RequestException as e:
        LOGGER.error(
            f"RequestException error while scraping metadata for URL `{url}`: {e}"
        )
    except InvalidDocument as e:
        LOGGER.error(
            f"InvalidDocument encountered while fetching metadata for URL `{url}`: {e}"
        )
    except Exception as e:
        LOGGER.error(
            f"Unexpected error while scraping metadata for URL `{url}`: {e}")
def fetch_metadata(self) -> None:
    """Populate share_link, title and album_art from the song page's metadata.

    No-op when title or album art is already known; fetches the page only
    when no metadata dict has been cached yet.
    """
    # Already populated — nothing to do.
    if self.album_art or self.title:
        return
    if not self.md:
        session = requests.Session()
        request = Request('GET', self.song_link,
                          headers={'User-Agent': 'curl/7.54.0'})
        response = session.send(request.prepare())
        if response.status_code == 200:
            # Remember the final (possibly redirected) URL for sharing.
            self.share_link = response.url
            parser = MetadataParser(html=response.text, search_head_only=True)
            self.md = parser.metadata
    self.title = self.md['og']['title']
    image_link = self.md['og']['image']
    # Upgrade plain-http image links to https.
    if image_link.startswith('http:'):
        image_link = 'https:' + image_link[5:]
    self.album_art = image_link
def get_or_create_ressource(self, url):
    """Return the Ressource for *url*, creating it from page metadata if absent.

    :param url: address of the resource to look up or scrape.
    :returns: an existing or freshly saved Ressource instance.
    """
    try:
        # Return the existing record when the URL is already known.
        # (The original raised a bare `Exception` right after this lookup,
        # which was NOT caught by the DoesNotExist handler below and so
        # broke every lookup of an already-stored URL.)
        return self.get(url=url)
    except Ressource.DoesNotExist:
        ressource = Ressource(url=url)
        # Scrape title/description/image from the page's metadata.
        md = MetadataParser(url=url)
        ressource.title = md.get_metadata('title')
        ressource.excerpt = md.get_metadata('description')
        ressource.image = md.get_metadata('image')
        ressource.save()
        return ressource
def get_first_appearance(page: "metadata_parser.MetadataParser", prop: str) -> str:
    """Return the first value recorded for *prop* on the page, or None.

    ``get_metadatas`` returns a list of values, or None when the property is
    absent.  The original used ``type(x) is list`` (use isinstance) and would
    raise IndexError on an empty list; an empty list now also yields None.

    :param page: parsed page object exposing ``get_metadatas``.
    :param prop: metadata property name to look up.
    :returns: first value for the property, or None when absent/empty.
    """
    prop_data = page.get_metadatas(prop)
    if isinstance(prop_data, list) and prop_data:
        return prop_data[0]
    return None
def fetch_metadata(html: str) -> Tuple[str, str, str]:
    """
    Extracting page metadata.
    :param html: HTML of the document
    :return: Tuple of title, keywords and descriptions
    """
    # Only the <head> is needed for title/keywords/description.
    parsed = MetadataParser(html=html, search_head_only=True).metadata
    page_section = parsed['page']
    meta_section = parsed['meta']
    return (
        page_section.get('title', ''),
        meta_section.get('keywords', ''),
        meta_section.get('description', ''),
    )
def get_metadata_parser(cls, file_path: str) -> MetadataParser:
    """Return a metadata parser suited to the format of the given file.

    Sniffs the first line of the file: an explicit ``METADATA:2.0`` header
    selects the current-format parser; everything else (unversioned files as
    well as versions 1.0.1 -> 1.1.8) is handled by the legacy parser.  The
    original had two duplicated legacy branches plus an unreachable
    ``return None`` — collapsed here with identical behavior.

    :param file_path: path of the metadata file to inspect.
    :returns: a MetadataParser or MetadataParserLegacy instance.
    """
    with open(file_path) as metadata_file:
        header_line = metadata_file.readline()
    if "METADATA:2.0" in header_line:
        # parse this file with MetadataV2 parser
        return MetadataParser(file_path)
    # No version marker, or versions 1.0.1 -> 1.1.8: use the legacy parser.
    return MetadataParserLegacy(file_path)
def fetch_president_articles():
    """Scrape OpenGraph/meta information for candidate articles lacking it.

    Iterates over PresidentCandidateArticle rows with no attached information
    record, scrapes each article URL, and creates or updates the matching
    PresidentCandidateArticleInformation row plus its preview image.

    :returns: dict with 'created' and 'updated' counters.
    """
    from metadata_parser import MetadataParser
    created = 0
    updated = 0
    # Only articles that do not yet have an information record.
    articles_to_fetch = PresidentCandidateArticle.objects.filter(
        information__isnull=True)
    for article in articles_to_fetch:
        page = MetadataParser(url=article.url)
        title = first_or_none(page.get_metadatas('title'))
        description = first_or_none(page.get_metadatas('description'))
        site = first_or_none(page.get_metadatas('site_name'))
        # Prefer the canonical URL declared by the page, falling back to the
        # URL stored on the article.
        url = page.get_url_canonical()
        url = url if url else article.url
        image_url = page.get_metadata_link('image')
        information_obj, is_created = PresidentCandidateArticleInformation.objects.update_or_create(
            article=article,
            defaults={
                'title': title,
                'description': description,
                'site': site,
                'url': url
            })
        # Download the preview image into the information record's image field.
        save_image_from_url(information_obj.image, image_url)
        if is_created:
            created += 1
        else:
            updated += 1
    return {'created': created, 'updated': updated}
# Demo script comparing three metadata-extraction libraries on one article URL.
# NOTE(review): Python 2 print statements — this script predates Python 3.
from metadata_parser import MetadataParser
from opengraph import OpenGraph
import webpreview

url = 'https://health.usnews.com/wellness/health-buzz/articles/2018-01-05/smelling-your-partners-shirt-could-decrease-your-stress-levels-study-says'

# metadata_parser: full metadata dict plus per-field lookups.
page = MetadataParser(url=url)
print page.metadata
print page.get_metadata('title')

# opengraph: prints the parsed OpenGraph object directly.
og = OpenGraph(url=url)
print og

# webpreview: request only the OpenGraph fields of interest.
wb = webpreview.OpenGraph(url, ['og:title', 'og:description'])
print wb.title
print wb.description
# Scratch/demo script for metadata_parser (Python 2 print syntax).
# Both sections are disabled with `if 0:` — flip to 1 to run them manually.
from metadata_parser import MetadataParser

if 0:
    a= MetadataParser(url='http://cnn.com')
    print a.get_metadata('title')
    b= MetadataParser(url='http://nyt.com')
    print b.get_metadata('title')
    c= MetadataParser(url='http://thedailybeast.com')
    print c.get_metadata('title')
    print "\n-------------------------------------------------------\n"
    print a.metadata
    print "\n-------------------------------------------------------\n"
    print b.metadata
    print "\n-------------------------------------------------------\n"
    print c.metadata
    print "\n-------------------------------------------------------\n"
    # Different ways of resolving the page's own / canonical URL.
    print c.get_metadata('title')
    print c.get_metadata('canonical')
    print c.get_metadata('url')
    print c.absolute_url(c.get_metadata('canonical'))
    print c.absolute_url(c.get_metadata('url'))
    print c.get_discrete_url()

if 0:
    a= MetadataParser(url='http://liqr.co/rsvpnewyork')
    print "title:"
    print a.get_metadata('title')
# Scratch/demo script for metadata_parser (Python 3 prints).
# The whole section is disabled with `if 0:` — flip to 1 to run it manually.
from metadata_parser import MetadataParser
import pdb
import pprint

# hey use lxml >= 2.3.5 ; use 3.x though!
# otherwise this site will break ! http://www.nasa.gov/externalflash/discovery/index.html

if 0:
    a = MetadataParser(url='http://cnn.com')
    print(a.get_metadata('title'))
    b = MetadataParser(url='http://nyt.com')
    print(b.get_metadata('title'))
    c = MetadataParser(url='http://thedailybeast.com')
    print(c.get_metadata('title'))
    print("\n-------------------------------------------------------\n")
    print(a.metadata)
    print("\n-------------------------------------------------------\n")
    print(b.metadata)
    print("\n-------------------------------------------------------\n")
    print(c.metadata)
    print("\n-------------------------------------------------------\n")
    # Different ways of resolving the page's own / canonical URL.
    print(c.get_metadata('title'))
    print(c.get_metadata('canonical'))
    print(c.get_metadata('url'))
    print(c.absolute_url(c.get_metadata('canonical')))
    print(c.absolute_url(c.get_metadata('url')))
    print(c.get_discrete_url())
from metadata_parser import MetadataParser # hey use lxml >= 2.3.5 ; use 3.x though! # otherwise this site will break ! http://www.nasa.gov/externalflash/discovery/index.html if 0: a = MetadataParser(url='http://cnn.com') print(a.get_metadata('title')) b = MetadataParser(url='http://nyt.com') print(b.get_metadata('title')) c = MetadataParser(url='http://thedailybeast.com') print(c.get_metadata('title')) print("\n-------------------------------------------------------\n") print(a.metadata) print("\n-------------------------------------------------------\n") print(b.metadata) print("\n-------------------------------------------------------\n") print(c.metadata) print("\n-------------------------------------------------------\n") print(c.get_metadata('title')) print(c.get_metadata('canonical')) print(c.get_metadata('url')) print(c.absolute_url(c.get_metadata('canonical'))) print(c.absolute_url(c.get_metadata('url'))) print(c.get_discrete_url()) if 0:
def __init__(self, html):
    """Parse page metadata from *html*; keep a flattened copy when possible.

    :param html: raw HTML document to parse.

    Attributes set:
    - ``self.meta`` — nested metadata dict from MetadataParser.
    - ``self.data`` — JSON round-trip of the flattened metadata, or the raw
      ``self.meta`` dict when flattening/serialization fails.
    """
    self.meta = MetadataParser(html=html).metadata
    try:
        # Round-trip through JSON to turn the flattened mapping into a plain
        # dict.  The original called json.load — which expects a file object,
        # not a string — so this branch always raised and self.data silently
        # fell back to self.meta.
        self.data = json.loads(json.dumps(FlatterDict(self.meta)))
    except (TypeError, ValueError):
        # FlatterDict contents may not be JSON-serializable; use raw metadata.
        self.data = self.meta
def social_card_image(page_url):
    """Return the OpenGraph image URL advertised by the page at *page_url*."""
    # Only the document head is needed for social-card metadata.
    page = MetadataParser(url=page_url, search_head_only=True)
    # Restrict the lookup to OpenGraph ('og:*') tags.
    return page.get_metadata_link('image', strategy=['og'])
def parsearticle(article, pathuuid):
    """Render an article through the RENDER_HOST service, download its main
    image and asset images, and return a dict describing the article.

    :param article: JSON-encoded bytes with at least 'link', 'publication',
        'category' and 'title' keys.
    :param pathuuid: directory (created here) where images are stored.
    :returns: dict with title, text, summary, assets, publication, category,
        article URL and rendered HTML.
    :raises ValueError: when the main image download keeps failing.
    """
    mainimage = {}
    images = []
    # Ask the headless-render service for the fully rendered page.
    req = requests.get(
        "http://" + os.getenv("RENDER_HOST") + ":3000/render/" +
        urllib.parse.quote_plus(json.loads(article.decode('utf-8'))["link"]))
    print("http://" + os.getenv("RENDER_HOST") + ":3000/render/" +
          urllib.parse.quote_plus(json.loads(article.decode('utf-8'))["link"]))
    articletext = MetadataParser(html=json.loads(req.text)['html'])
    imgurl = str(articletext.get_metadata('image'))
    # Protocol-relative image URLs get a scheme prepended.
    if not imgurl.startswith("http"):
        imgurl = 'http:' + imgurl
    # Strip the query string, keep the basename, and salt with a uuid to
    # avoid collisions on disk.
    imgurlnopost = imgurl.rsplit('?', 1)[0]
    imgname = imgurlnopost.rsplit('/', 1)[-1]
    imgpath = pathuuid + '/' + imgname + str(uuid.uuid4())
    publication = json.loads(article.decode('utf-8'))["publication"]
    category = json.loads(article.decode('utf-8'))["category"]
    title = json.loads(article.decode('utf-8'))["title"]
    articleurl = json.loads(article.decode('utf-8'))["link"]
    geturl = None
    os.mkdir(pathuuid)
    count = 0
    # First download attempt; failures are retried below.
    try:
        geturl = urllib.request.urlretrieve(imgurl, imgpath)
    except:
        pass
    # Retry loop: re-render the page and re-resolve the image until the
    # download succeeds.
    # NOTE(review): count is incremented only after a *successful* retrieve,
    # so the `count > 10` guard never fires on repeated failures — this loop
    # can spin forever if the image stays unreachable. TODO confirm intent.
    while not geturl:
        req = requests.get("http://" + os.getenv("RENDER_HOST") + ":3000/render/" + urllib.parse.quote_plus(
            json.loads(article.decode('utf-8'))["link"]))
        articletext = MetadataParser(html=json.loads(req.text)['html'])
        imgurl = str(articletext.get_metadata('image'))
        imgurlnopost = imgurl.rsplit('?', 1)[0]
        imgname = imgurlnopost.rsplit('/', 1)[-1]
        try:
            geturl = urllib.request.urlretrieve(imgurl, imgpath)
            count += 1
        except:
            if count > 10:
                raise ValueError('Article failed too many times')
            pass
    # Describe the main image; urlretrieve returns (path, headers).
    mainimage['imgurl'] = imgurl
    mainimage['imgname'] = imgname
    mainimage['imgpath'] = imgpath
    mainimage['content_type'] = geturl[1]['Content-Type']
    images.append(mainimage)
    # Collect the remaining page assets.
    images1 = getimages(
        json.loads(req.text)['html'],
        json.loads(req.text)['tree']['frameTree']['resources'],
        images, pathuuid)
    # Best-effort full-text extraction; empty string on failure.
    try:
        articletext = fulltext(json.loads(req.text)['html'], language='en')
    except:
        articletext = ""
    thing = {}
    thing['title'] = json.loads(article.decode('utf-8'))["title"]
    thing['articletext'] = articletext
    thing['summary'] = summarize(articletext)
    thing['assets'] = images1
    thing['publication'] = publication
    thing['category'] = category
    thing['articleurl'] = articleurl
    thing['html'] = json.loads(req.text)['html']
    return thing
"(select distinct url_md5 from url_meta) " \ "group by url order by count(*) desc;" cur.execute(sql) urls = cur.fetchall() i = 0 for url in urls: i = i + 1 url = remove_characters(url[0]) try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', 'From': '*****@*****.**' # This is another valid field } page = MetadataParser(url=url, requests_timeout=5, url_headers=headers) title = remove_characters(page.get_metadata('title')) url_resolved = remove_characters(page.get_metadata('url')) image = remove_characters(page.get_metadata('image')) description = remove_characters(page.get_metadata('description')) sql = "insert into url_meta (title, description, url, url_md5, image) " \ "values ('" + title + "', '" + description + "', '" + url_resolved + "', md5('" + url + "'), '" + image + "');" except Exception as e: e = remove_characters(str(e)) sql = "insert into url_meta (title, description, url, url_md5, image) " \ "values ('error', '" + e + "', '" + url + "', md5('" + url + "'), '');" finally: cur.execute(sql) cur.execute("commit;") if i % 100 == 0: print i
class BPMExperiment():
    """Drives a beam-position-monitor acquisition through the `fcs_client`
    command-line tool: configures the FPGA and RF front-end from a metadata
    file, runs an acquisition, and writes the data plus a signed metadata
    sidecar file.  With ``debug=True`` commands are printed, not executed."""

    def __init__(self, fpga_hostname = 'localhost', rffe_hostname = 'localhost', debug = False):
        # Hosts for the FPGA and RF front-end control servers.
        self.fpga_hostname = fpga_hostname
        self.rffe_hostname = rffe_hostname
        self.debug = debug
        from metadata_parser import MetadataParser
        self.metadata_parser = MetadataParser()

    def load_from_metadata(self, input_metadata_filename):
        # Parse metadata file into a dictionary
        self.metadata_parser.parse(input_metadata_filename)
        self.metadata = self.metadata_parser.options

    def get_metadata_lines(self):
        """Return the loaded metadata as 'key = value\\n' text lines."""
        experiment_parameters = list(self.metadata.keys())
        lines = []
        for key in experiment_parameters:
            lines.append(key + ' = ' + self.metadata[key] + '\n')
        return lines

    def run(self, data_filename, datapath):
        """Run one acquisition on *datapath* ('adc', 'tbt' or 'fofb') and
        write the data to *data_filename* plus a '.metadata' sidecar.

        NOTE(review): an unrecognized *datapath* leaves the acquisition
        variables unset and fails later with NameError — TODO confirm
        callers only pass the three known values.
        """
        # Per-datapath acquisition settings (decimation, channel, #points).
        if datapath == 'adc':
            data_rate_decim_factor = '1'
            acq_channel = '0'
            acq_npts = '100000'
        elif datapath == 'tbt':
            data_rate_decim_factor = self.metadata['adc_clock_sampling_harmonic'].split()[0] # FIXME: data_rate_decim_factor should be ideally read from FPGA
            acq_channel = '1'
            acq_npts = '100000'
        elif datapath == 'fofb':
            data_rate_decim_factor = '1000' # FIXME: data_rate_decim_factor should be ideally read from FPGA
            acq_channel = '3'
            acq_npts = '1000000'

        deswitching_phase_offset = str(int(self.metadata['dsp_deswitching_phase'].split()[0]) - int(self.metadata['rffe_switching_phase'].split()[0]))

        import subprocess

        # Run FPGA configuration commands
        command_argument_list = ['fcs_client']
        command_argument_list.extend(['--setdivclk', self.metadata['rffe_switching_frequency_ratio'].split()[0]])
        command_argument_list.extend(['--setkx', self.metadata['bpm_Kx'].split()[0]])
        command_argument_list.extend(['--setky', self.metadata['bpm_Ky'].split()[0]])
        command_argument_list.extend(['--setphaseclk', deswitching_phase_offset])
        command_argument_list.extend(['--setsw' + self.metadata['rffe_switching'].split()[0]])
        command_argument_list.extend(['--setwdw' + self.metadata['dsp_sausaging'].split()[0]])
        command_argument_list.extend(['--setsamples', acq_npts])
        command_argument_list.extend(['--setchan', acq_channel])
        command_argument_list.extend(['--setfpgahostname', self.fpga_hostname])
        if not self.debug:
            subprocess.call(command_argument_list)
        else:
            print(command_argument_list)

        # Run RFFE configuration commands
        command_argument_list = ['fcs_client']
        command_argument_list.extend(['--setfesw' + self.metadata['rffe_switching'].split()[0]])
        # One attenuator setting per comma-separated metadata entry.
        att_items = self.metadata['rffe_attenuators'].split(',')
        i = 1
        for item in att_items:
            # NOTE(review): str.strip() returns a new string; this result is
            # discarded, so leading/trailing whitespace survives (the later
            # .split()[0] masks it) — TODO confirm intent.
            item.strip()
            command_argument_list.extend(['--setfeatt' + str(i), item.split()[0]])
            i = i+1
        command_argument_list.extend(['--setrffehostname', self.rffe_hostname])
        if not self.debug:
            subprocess.call(command_argument_list)
        else:
            print(command_argument_list)

        # TODO: Check if everything was properly set

        # Timestamp the start of data acquisition
        # FIXME: timestamp should ideally come together with data.
        from time import time
        t = time()

        # Run acquisition
        command_argument_list = ['fcs_client']
        command_argument_list.append('--startacq')
        command_argument_list.extend(['--setfpgahostname', self.fpga_hostname])
        if not self.debug:
            p = subprocess.call(command_argument_list)
        else:
            print(command_argument_list)

        # The script execution is blocked here until data acquisition has completed

        # Get the result of data acquisition and write it to data file
        command_argument_list = ['fcs_client']
        command_argument_list.extend(['--getcurve', acq_channel])
        command_argument_list.extend(['--setfpgahostname', self.fpga_hostname])

        # Ensure file path exists
        import os
        path = os.path.dirname(data_filename)
        try:
            os.makedirs(path)
        except OSError as exception:
            # Ignore "already exists"; re-raise anything else.
            if not os.path.isdir(path):
                raise

        # 'x' mode: refuse to overwrite an existing data file.
        f = open(data_filename, 'x')
        if not self.debug:
            p = subprocess.call(command_argument_list, stdout=f)
        else:
            # Debug mode writes a small fixed sample instead of real data.
            f.writelines(['10 11 -9 80\n54 5 6 98\n']);
            print(command_argument_list)
        f.close()

        # Compute data file signature with the hash named in the metadata.
        f = open(data_filename, 'r')
        text = f.read()
        f.close()
        import hashlib
        if self.metadata['data_signature_method'].split()[0] == 'md5':
            md = hashlib.md5()
        elif self.metadata['data_signature_method'].split()[0] == 'sha-1':
            md = hashlib.sha1()
        elif self.metadata['data_signature_method'].split()[0] == 'sha-256':
            md = hashlib.sha256()
        md.update(text.encode(f.encoding))
        filesignature = md.hexdigest()

        # Format date and hour as an standard UTC timestamp (ISO 8601)
        from time import strftime, gmtime
        from math import floor
        ns = int(floor((t * 1e9) % 1e9))
        timestamp_start = '%s.%09dZ' % (strftime('%Y-%m-%dT%H:%M:%S', gmtime(t)), ns)

        # Throw away absolute path of data filename
        data_filename_basename = os.path.basename(data_filename)

        # Build metadata file based on template metadata file and post-processed metadata
        config_base_metadata_lines = self.get_metadata_lines()
        config_automatic_lines = [];
        config_automatic_lines.append('data_original_filename = ' + data_filename_basename + '\n')
        config_automatic_lines.append('data_signature = ' + filesignature + '\n')
        config_automatic_lines.append('dsp_data_rate_decim_factor = ' + data_rate_decim_factor + '\n')
        config_automatic_lines.append('timestamp_start = ' + timestamp_start + '\n')
        #config_automatic_lines.append('adc_board_temperature = ' + '0' + ' C\n') #TODO: implement ADC temperature read on FPGA
        #config_automatic_lines.append('rffe_board_temperature = ' + '0' + ' C\n') #TODO: implement RFFE temperature read on FPGA

        config_fromfile_lines = []
        config_fromfile_lines.extend(config_base_metadata_lines)
        config_fromfile_lines.extend(config_automatic_lines)

        # Metadata file is placed in the same path and with the same filename as the data file, but with .metadata extension
        from os.path import basename
        output_metadata_filename = os.path.splitext(data_filename)[0] + '.metadata'
        f = open(output_metadata_filename, 'x')
        f.writelines(sorted(config_fromfile_lines))
        f.close()
def _test_get_platform(metadata_file, expected_platform):
    """Helper: assert the parser reports *expected_platform* for *metadata_file*."""
    parser = MetadataParser(metadata_file)
    assert parser.get_platform() == expected_platform