def test__fetch__only_http_protocols(self, document):
    with requests_mock.Mocker() as m:
        m.head("http://foo.bar", headers={'Content-Type': 'text/html; charset=utf-8'})
        m.get("http://foo.bar", text=document)
        assert OpenGraph(url="http://foo.bar")._data != {}

        m.head("https://foo.bar", headers={'Content-Type': 'text/html; charset=utf-8'})
        m.get("https://foo.bar", text=document)
        assert OpenGraph(url="https://foo.bar")._data != {}

        m.get("mailto:[email protected]", text=document)
        assert OpenGraph(url="mailto:[email protected]")._data == {}

        m.get("ftp://foo.bar", text=document)
        assert OpenGraph(url="ftp://foo.bar")._data == {}
def __fetch_open_graph_details(self):
    try:
        og = OpenGraph(url=self.url)
        if og.is_valid():
            _json = json.loads(og.to_json())
            self.title = self.title or _json['title']
            self.description = self.description or _json['description']
            self.image_url = self.image_url or _json['image']
    except Exception as e:
        print(e.__doc__)
        print(e.message)
    return
def test_uses_right_parser(self, mock_bs, document):
    try:
        OpenGraph(html=document)
    except AttributeError:
        pass
    mock_bs.assert_called_once_with(document, "html.parser")

    mock_bs.reset_mock()
    try:
        OpenGraph(html=document, parser="spam")
    except AttributeError:
        pass
    mock_bs.assert_called_once_with(document, "spam")
def __init__(self, node_list, options=[]):
    Drawable.__init__(self, options)
    Named.__init__(self)
    self.__node_list = node_list
    cycle_graph = nx.Graph()
    if node_list:
        for n in range(1, len(node_list)):
            cycle_graph.add_edge(node_list[n - 1], node_list[n])
    self.style().set_target_type('fill')
    self.__frame = OpenGraph(cycle_graph, options)
    self.__frame.style().set_target_type('frame')
    return
def og_scrape_worker(canonical_url):
    # Get open graph data
    try:
        og_data = OpenGraph(url="http://%s" % canonical_url)
        if og_data.is_valid():
            return add_data_to_url_in_db(canonical_url, og_data)
        else:
            update_to_error_url_status_in_db(canonical_url)
    except Exception as e:
        print "Error occurred on scrap worker thread: %s" % e
        update_to_error_url_status_in_db(canonical_url)
def __fetch_open_graph_details(self):
    try:
        og = OpenGraph(url=self.url)
        if og.is_valid():
            _json = json.loads(og.to_json())
            self.name = _json['title']
            img = Image(user=self.user, large_url=_json['image'], added_time=timezone.now())
            img.save()
            self.image = img
    except Exception as e:
        print(e.__doc__)
        print(e.message)
    return
def __init__(self, url):
    og = OpenGraph(url)
    file_name, headers = urllib.request.urlretrieve(og.image)

    # call image magick to update file in place
    # cmd = 'mogrify -colorspace Gray -ordered-dither h4x4a -resize 512 %s' % file_name
    cmd = 'mogrify -colorspace Gray -resize 512 -unsharp 0x1 %s' % file_name
    os.system(cmd)

    # make an attempt to shorten the title and make it printable
    title = ascii(
        fix_text(
            og.title.split('Instagram post by ').pop().split(
                'Instagram: ').pop())).replace('\\u2022', '-').replace('\'', '')

    p = printer.File("/dev/usb/lp0")
    p.text(title)
    p.text("\n")
    time.sleep(2)
    p.image(file_name)
    time.sleep(2)
    p.text("\n")
    p.text(og.url)
    p.cut()

    print(title)
    print(file_name)
    urllib.request.urlcleanup()
def form_valid(self, form):
    og = OpenGraph(form.instance.url)
    form.instance.og_title = og.title if 'title' in og else ''
    form.instance.og_description = og.description if 'description' in og else ''
    form.instance.og_image = og.image if 'image' in og else ''
    form.instance.og_type = '' if 'type' not in og else og.type
    form.instance.user = self.request.user
    return super().form_valid(form)
def get_url_data(url):
    try:
        raw = urlopen(url)
        if raw is None:
            return None
        html = raw.read()
        data = OpenGraph(html=html, scrape=True)
        if data.is_valid():
            return dict(data)
        else:
            return None
    except (HTTPError, URLError, AttributeError):
        return None
def test_loading_from_url(self):
    url = 'http://foo.bar.com/'
    responses.add(responses.GET, url, body=self.test_document,
                  status=200, content_type='text/html')
    og = OpenGraph(url=url)
    self.assertEqual(og.title, 'Test title')
def index():
    """Handle a REST GET request."""
    if request.method == 'GET' and request.args.get('url'):
        url = request.args.get('url')
        res = json.dumps(OpenGraph(url, HEADERS), ensure_ascii=False)
    else:
        res = 'Incorrect request. Try: url=https://...'
    return render_template('index.html', data=res)
def __init__(self, node_list):
    Drawable.__init__(self)
    Named.__init__(self)
    self.__node_list = node_list
    cycle_graph = nx.Graph()
    if node_list:
        for n in range(1, len(node_list)):
            cycle_graph.add_edge(node_list[n - 1], node_list[n])
    self.style().set_target_type("fill")
    self.__frame = OpenGraph(cycle_graph)
    self.__frame.style().set_target_type("frame")
    return
def _save_opengraph_data_for_url(url):
    """
    Given a url, save its open graph data if it has it.

    If the Open Graph data doesn't exist then move on.
    """
    with record('tasks.reddit._save_opengraph_data_for_url'):
        og = OpenGraph(url=url)
        item = models.Item(title=og.title,
                           link=url,
                           description=og.description,
                           image=getattr(og, 'image', None),
                           source=models.Item.SOURCES['reddit'])
        item.save()
def form_valid(self, form):
    if 'http' not in form.instance.url:
        form.instance.url = 'https://' + form.instance.url
    else:
        form.instance.url = form.instance.url
    og = OpenGraph(form.instance.url)
    form.instance.date = date.today()
    form.instance.og_title = og.title if 'title' in og else ''
    form.instance.og_description = og.description if 'description' in og else ''
    form.instance.og_image = og.image if 'image' in og else ''
    form.instance.og_type = '' if 'type' not in og else og.type
    form.instance.user = self.request.user
    return super().form_valid(form)
def test_loading_from_url(self):
    def http_callback(request):
        # Ugly, but using thread locals in order to capture the request
        # headers in the callback, to assert that it's being set correctly
        data.headers = request.headers
        return (200, {'content-type': 'text/html'}, self.test_document)

    url = 'http://foo.bar.com/'
    useragent = 'python-opengraph/0.0'
    responses.add_callback(responses.GET, url, callback=http_callback,
                           content_type='text/html')
    og = OpenGraph(url=url, useragent=useragent)
    headers = data.headers
    self.assertEqual(og.title, 'Test title')
    self.assertEqual(headers['user-agent'], useragent)
def _save_opengraph_data_for_urls(urls):
    """
    Given a set of urls, try to save each one's open graph data if it has it.

    If the Open Graph data doesn't exist then move on. Stop after the first
    one has been successfully parsed and saved.
    """
    with record('tasks.reddit._save_opengraph_data_for_url'):
        for url in urls:
            try:
                og = OpenGraph(url=url)
                item = models.Item(title=og.title,
                                   link=url,
                                   description=og.description,
                                   image=getattr(og, 'image', None),
                                   source=models.Item.SOURCES['reddit'])
                item.save()
                break
            except Exception:
                pass
def fetch_og_preview(content, urls): """Fetch first opengraph entry for a list of urls.""" for url in urls: # See first if recently cached already if OpenGraphCache.objects.filter(url=url, modified__gte=now() - datetime.timedelta(days=7)).exists(): opengraph = OpenGraphCache.objects.get(url=url) Content.objects.filter(id=content.id).update(opengraph=opengraph) return opengraph # OpenGraph is kinda broken - make sure we destroy any old data before fetching OpenGraph.__data__ = {} try: og = OpenGraph(url=url) except (requests.exceptions.ConnectionError, AttributeError): continue if not og or ("title" not in og and "site_name" not in og and "description" not in og and "image" not in og): continue try: title = og.title if "title" in og else og.site_name if "site_name" in og else "" description = og.description if "description" in og else "" image = og.image if "image" in og and not content.is_nsfw else "" try: with transaction.atomic(): opengraph = OpenGraphCache.objects.create( url=url, title=truncate_letters(safe_text(title), 250), description=safe_text(description), image=safe_text(image), ) except DataError: continue except IntegrityError: # Some other process got ahead of us opengraph = OpenGraphCache.objects.get(url=url) Content.objects.filter(id=content.id).update(opengraph=opengraph) return opengraph Content.objects.filter(id=content.id).update(opengraph=opengraph) return opengraph return False
def add_articles():
    Article.objects.delete(date__gte=(datetime.datetime.now() - datetime.timedelta(days=2)))
    idk = FeedModel.objects.all()
    for bar in idk:
        print(bar.url)
        foo = feedparser.parse(bar.url)
        for post in foo.entries:
            time.sleep(10)
            parsed_url = urlcanon.parse_url(post.link)
            og = OpenGraph(url=post.link)
            try:
                category = model.predict([post.title])
                Article.objects.add_article(post.title, post.description, parsed_url,
                                            og.image, bar.title, category)
                logger.info("Article Added")
            except:
                logger.info("Did Not Work")
                continue
def fetch_og_preview(content, urls): """Fetch first opengraph entry for a list of urls.""" for url in urls: # See first if recently cached already if OpenGraphCache.objects.filter(url=url, modified__gte=now() - datetime.timedelta(days=7)).exists(): opengraph = OpenGraphCache.objects.get(url=url) Content.objects.filter(id=content.id).update(opengraph=opengraph) return opengraph try: og = OpenGraph(url=url, parser="lxml") except AttributeError: continue if not og or ("title" not in og and "site_name" not in og and "description" not in og and "image" not in og): continue try: title = og.title if "title" in og else og.site_name if "site_name" in og else "" description = og.description if "description" in og else "" image = og.image if "image" in og else "" try: with transaction.atomic(): opengraph = OpenGraphCache.objects.create( url=url, title=truncatechars(safe_text(title), 120), description=truncatechars(safe_text(description), 500), image=safe_text(image), ) except DataError: continue except IntegrityError: # Some other process got ahead of us opengraph = OpenGraphCache.objects.get(url=url) Content.objects.filter(id=content.id).update(opengraph=opengraph) return opengraph Content.objects.filter(id=content.id).update(opengraph=opengraph) return opengraph return False
def parse(self, url: str) -> dict:
    og = OpenGraph(url=url)
    self.json_markup = og.__data__
    return self.og_str_markup
def test_str_repr(self):
    og = OpenGraph(html=self.test_document)
    text_of_data = og.__data__.__str__()
    self.assertEqual(str(og), text_of_data)
def test_contains(self):
    og = OpenGraph(html=self.test_document)
    self.assertIn('title', og)
def test_get_attr(self):
    og = OpenGraph(html=self.test_document)
    self.assertEqual(og.title, 'Test title')
    with self.assertRaises(AttributeError):
        og.attribute_does_not_exist
from metadata_parser import MetadataParser
from opengraph import OpenGraph
import webpreview

url = 'https://health.usnews.com/wellness/health-buzz/articles/2018-01-05/smelling-your-partners-shirt-could-decrease-your-stress-levels-study-says'

page = MetadataParser(url=url)
print page.metadata
print page.get_metadata('title')

og = OpenGraph(url=url)
print og

wb = webpreview.OpenGraph(url, ['og:title', 'og:description'])
print wb.title
print wb.description
def test_str_repr(self, document):
    og = OpenGraph(html=document)
    text_of_data = og._data.__str__()
    assert str(og) == text_of_data
def test_get_attr(self, document):
    og = OpenGraph(html=document)
    assert og.title == "Test title"
    with pytest.raises(AttributeError):
        # noinspection PyStatementEffect
        og.attribute_does_not_exist
class Polygon(Drawable, Named, Scalable):
    def __init__(self, node_list):
        Drawable.__init__(self)
        Named.__init__(self)
        self.__node_list = node_list
        cycle_graph = nx.Graph()
        if node_list:
            for n in range(1, len(node_list)):
                cycle_graph.add_edge(node_list[n - 1], node_list[n])
        self.style().set_target_type("fill")
        self.__frame = OpenGraph(cycle_graph)
        self.__frame.style().set_target_type("frame")
        return

    def reduce_nodes(self):
        self.__node_list = reduce_path(self.__node_list)

    def contains(self, obj):
        if type(obj) == Polygon:
            # check if all nodes are inside
            for p in obj.nodes():
                if not self.contains(p):
                    return False
            # check for edge intersections
            outer_edges = [Edge(self.nodes()[i], self.nodes()[i + 1])
                           for i in range(0, len(self.nodes()) - 1)]
            inner_edges = [Edge(obj.nodes()[i], obj.nodes()[i + 1])
                           for i in range(0, len(obj.nodes()) - 1)]
            for inner_edge in inner_edges:
                for outer_edge in outer_edges:
                    if inner_edge.intersects(outer_edge):
                        return False
            return True
        else:
            # ray-casting point-in-polygon test for a single point
            n = len(self.__node_list)
            inside = False
            x, y = obj
            p1x, p1y = self.__node_list[0]
            for i in range(n + 1):
                p2x, p2y = self.__node_list[i % n]
                if y > min(p1y, p2y):
                    if y <= max(p1y, p2y):
                        if x <= max(p1x, p2x):
                            xinters = (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
                            if p1x == p2x or x <= xinters:
                                inside = not inside
                p1x, p1y = p2x, p2y
            return inside

    def nodes(self):
        return self.__node_list

    def edges(self):
        return [(self.__node_list[i], self.__node_list[i + 1])
                for i in range(0, len(self.__node_list) - 1)]

    def has_edge(self, start, end):
        i = 0
        while True:
            try:
                i = self.__node_list.index(start, i + 1)
                if self.__node_list[i - 1] == end or (
                        i < len(self.__node_list) - 1 and self.__node_list[i + 1] == end):
                    return True
            except ValueError:
                return False

    def max(self):
        return (max([n[0] for n in self.__node_list]),
                max([n[1] for n in self.__node_list]))

    def min(self):
        return (min([n[0] for n in self.__node_list]),
                min([n[1] for n in self.__node_list]))

    def scale(self, scale):
        for n in range(0, len(self.__node_list)):
            node = self.__node_list[n]
            self.__node_list[n] = node * scale
        self.__frame.scale(scale)

    def frame(self):
        return self.__frame
def run(self, raw_data, objects):
    graphs = []
    new_objects = []
    for overlay in self.__sub_overlays:
        g = overlay.substitutes(raw_data)
        graphs.append(g)
    graph = graphs.pop(0)
    for h in graphs:
        graph = nx.compose(graph, h)
    components = nx.connected_component_subgraphs(graph)
    closed_polygons = []
    polygons = []
    for component in components:
        minimum_cycles = planar_cycles(component)
        collected_options = itertools.chain(
            *nx.get_node_attributes(component, 'options').values())
        options = list(set(collected_options))
        # the polygons are the same as the minimum cycles
        closed_polygons += minimum_cycles
        path_graph = nx.Graph()
        path_graph.add_nodes_from(component.nodes())
        for polygon in minimum_cycles:
            polygons.append(Polygon(polygon, options))
            path_graph.add_cycle(polygon)
        remaining_graph = nx.difference(component, path_graph)
        for n in remaining_graph.nodes():
            if remaining_graph.degree(n) == 0:
                remaining_graph.remove_node(n)
        if len(remaining_graph.edges()) > 0:
            remaining_components = nx.connected_component_subgraphs(remaining_graph)
            for c in remaining_components:
                new_objects.append(OpenGraph(c, options))
    new_objects = new_objects + polygons

    # directed graph encoding relative z-order: an edge u -> v means u is drawn below v
    z_order_graph = nx.DiGraph()
    z_order_graph.add_nodes_from(new_objects)
    for i in range(0, len(polygons)):
        polygon1 = polygons[i]
        for j in range(i + 1, len(polygons)):
            polygon2 = polygons[j]
            if polygon1 != polygon2:
                if polygon1.contains(polygon2):
                    z_order_graph.add_edge(polygon1, polygon2)
                elif polygon2.contains(polygon1):
                    z_order_graph.add_edge(polygon2, polygon1)
    for obj in new_objects:
        for edge in obj.edges():
            if 'below' in graph[edge[0]][edge[1]]:
                below = graph[edge[0]][edge[1]]['below']
                if below != None:
                    for obj_above in new_objects:
                        if obj != obj_above:
                            if obj_above.has_edge(below.start(), below.end()):
                                z_order_graph.add_edge(obj, obj_above)
            if 'above' in graph[edge[0]][edge[1]]:
                above = graph[edge[0]][edge[1]]['above']
                if above != None:
                    for obj_below in new_objects:
                        if obj != obj_below:
                            if obj_below.has_edge(above.start(), above.end()):
                                z_order_graph.add_edge(obj_below, obj)
            if 'z_order' in graph[edge[0]][edge[1]]:
                z_order = graph[edge[0]][edge[1]]['z_order']
                if z_order != None:
                    for other_obj in new_objects:
                        if obj != other_obj:
                            if (isinstance(other_obj, Polygon) and other_obj.frame().intersects(edge)) or \
                                    (isinstance(other_obj, OpenGraph) and other_obj.intersects(edge)):
                                if z_order == 'above':
                                    z_order_graph.add_edge(other_obj, obj)
                                elif z_order == 'below':
                                    z_order_graph.add_edge(obj, other_obj)
                                else:
                                    raise ValueError, "Wrong value for z_order."
    cycle_gen = nx.simple_cycles(z_order_graph)
    try:
        cycles = list(cycle_gen)
        for cycle in cycles:
            cycle_edges = [(cycle[i], cycle[i + 1]) for i in range(0, len(cycle) - 1)]
            for edge in cycle_edges:
                z_order_graph.remove_edge(edge[0], edge[1])
        if cycles:
            warnings.warn(
                "The diagram contains objects that have an ambiguous z-order. "
                "Shaape estimates their z-order.",
                RuntimeWarning)
    except:
        pass
    current_z_order = 0
    while z_order_graph.nodes():
        nodes_without_predecessors = [
            node for node in z_order_graph.nodes()
            if not z_order_graph.predecessors(node)]
        for node in nodes_without_predecessors:
            node.set_z_order(current_z_order)
            current_z_order = current_z_order + 1
        z_order_graph.remove_nodes_from(nodes_without_predecessors)
    for o in new_objects:
        if type(o) == Polygon or type(o) == OpenGraph:
            o.reduce_nodes()
    objects = objects + new_objects
    objects.append(graph)
    self._objects = objects
    self._parsed_data = raw_data
    return
def test_returns_none_on_get_exception(self, mock_logger, mock_get):
    assert OpenGraph(url=URL)._data == {}
    assert mock_logger.called
def make_post():
    if (session.get('exists') == True):
        if request.method == 'POST':
            media = request.files['media_post']
            file_name = media.filename
            content_post = request.form['content_post']
            location_post = request.form['location_post']
            link_post = request.form['link_post']
            post_type = None
            image_prediction = None

            if (file_name != ''):
                post_type = "media"
                if (file_name.split(".")[1] == "mp4"):
                    post_type = "video"

            # If the post contains image as well as a link, it is a media post with a link in the body
            if (file_name != '' and link_post != ''):
                post_type = "media"
                content_post += '<br><a href=#>' + link_post + '</a>'
                if (file_name.split(".")[1] == "mp4"):
                    post_type = "video"

            # A pure link body
            if (file_name == '' and link_post != ''):
                post_type = 'link'

            if (file_name == "" and link_post == ""):
                post_type = "text"

            # Save the media if file type is media
            file_path = ''
            if (post_type == "media" or post_type == "video"):
                directory = "static/" + app.config['UPLOAD_FOLDER'] + session.get('username').lower()
                if not os.path.exists(directory):
                    os.mkdir(directory)
                file_path = directory + "/" + file_name
                media.save(file_path)
                image_prediction = prediction_models.predict_image(image_model, file_path)

            description = ''
            title = ''
            image = ''
            if (post_type == "link"):
                og = OpenGraph(url=link_post)
                description = og.description
                title = og.title
                image = og.image

            post_content_prediction = prediction_models.predict_text(text_model, content_post, device)

            content = {
                'posttype': post_type,
                'medialink': file_path,
                'postcontent': content_post,
                'postlocation': location_post,
                'postlink': link_post,
                'image_prediction': image_prediction,
                'text_prediction': post_content_prediction,
                'link_details': {
                    'description': description,
                    'image': image,
                    'title': title,
                }
            }
            inserted = db.create_post(session.get('username').lower(), content)
            print("Post ID:" + str(inserted) + " inserted")
            return redirect(url_for('feed_home'))
    else:
        return redirect(url_for('login'))
def _parse_open_graph(article):
    og = OpenGraph(html=article.html)
    if not og.is_valid():
        return
    if og["type"] != "article":
        raise NotImplementedError("Cannot parse a OG type: %s" % og["type"])
    og.setdefault(None)
    article.title = article.title or og.get("title")
    article.summary = article.summary or og.get("description")
    article.images = article.images or [og.get("image")]
    article.meta_lang = article.meta_lang or og.get("locale")
    article.keywords = article.keywords or og.get("tag")
    article.categories = article.categories or [og.get("category")]
    article.authors = article.authors or [og.get("author")]
    article.pub_date = article.pub_date or og.get("modified_date")
def extract():
    doc = request.get_data(as_text=True)
    if doc is None or len(doc) == 0:
        return json.dumps({})
    og = OpenGraph(html=doc)
    return Response(json.dumps(og.__data__), mimetype='application/json')
def test_contains(self, document):
    og = OpenGraph(html=document)
    assert "title" in og
class Polygon(Drawable, Named, Scalable):
    def __init__(self, node_list, options=[]):
        Drawable.__init__(self, options)
        Named.__init__(self)
        self.__node_list = node_list
        cycle_graph = nx.Graph()
        if node_list:
            for n in range(1, len(node_list)):
                cycle_graph.add_edge(node_list[n - 1], node_list[n])
        self.style().set_target_type('fill')
        self.__frame = OpenGraph(cycle_graph, options)
        self.__frame.style().set_target_type('frame')
        return

    def reduce_nodes(self):
        self.__node_list = reduce_path(self.__node_list)

    def contains(self, obj):
        if type(obj) == Polygon:
            # check if all nodes are inside
            for p in obj.nodes():
                if not self.contains(p):
                    return False
            # check for edge intersections
            outer_edges = [
                Edge(self.nodes()[i], self.nodes()[i + 1])
                for i in range(0, len(self.nodes()) - 1)]
            inner_edges = [
                Edge(obj.nodes()[i], obj.nodes()[i + 1])
                for i in range(0, len(obj.nodes()) - 1)]
            for inner_edge in inner_edges:
                for outer_edge in outer_edges:
                    if inner_edge.intersects(outer_edge):
                        return False
            return True
        else:
            # ray-casting point-in-polygon test for a single point
            n = len(self.__node_list)
            inside = False
            x, y = obj
            p1x, p1y = self.__node_list[0]
            for i in range(n + 1):
                p2x, p2y = self.__node_list[i % n]
                if y > min(p1y, p2y):
                    if y <= max(p1y, p2y):
                        if x <= max(p1x, p2x):
                            xinters = (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
                            if p1x == p2x or x <= xinters:
                                inside = not inside
                p1x, p1y = p2x, p2y
            return inside

    def nodes(self):
        return self.__node_list

    def edges(self):
        return [(self.__node_list[i], self.__node_list[i + 1])
                for i in range(0, len(self.__node_list) - 1)]

    def has_edge(self, start, end):
        i = 0
        while True:
            try:
                i = self.__node_list.index(start, i + 1)
                if self.__node_list[i - 1] == end or (
                        i < len(self.__node_list) - 1 and self.__node_list[i + 1] == end):
                    return True
            except ValueError:
                return False

    def max(self):
        return (max([n[0] for n in self.__node_list]),
                max([n[1] for n in self.__node_list]))

    def min(self):
        return (min([n[0] for n in self.__node_list]),
                min([n[1] for n in self.__node_list]))

    def scale(self, scale):
        for n in range(0, len(self.__node_list)):
            node = self.__node_list[n]
            self.__node_list[n] = node * scale
        self.__frame.scale(scale)

    def frame(self):
        return self.__frame
def test_uses_timeout(self, mock_get):
    OpenGraph(url=URL)
    mock_get.assert_called_once_with(URL, headers=DEFAULT_HEADERS, timeout=10)

    mock_get.reset_mock()
    OpenGraph(url=URL, timeout=123)
    mock_get.assert_called_once_with(URL, headers=DEFAULT_HEADERS, timeout=123)