forked from jvanasco/metadata_parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
demo.py
73 lines (60 loc) · 2.38 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from metadata_parser import MetadataParser
# NOTE: use lxml >= 2.3.5 (preferably a 3.x release);
# otherwise this site will break: http://www.nasa.gov/externalflash/discovery/index.html
if 0:
    # Disabled demo (change 0 to 1 to run): build a parser for each of three
    # news sites, print each title, then dump the collected metadata.
    separator = "\n-------------------------------------------------------\n"
    parsers = []
    for site in ('http://cnn.com', 'http://nyt.com', 'http://thedailybeast.com'):
        parser = MetadataParser(url=site)
        print(parser.get_metadata('title'))
        parsers.append(parser)

    # Full metadata of every parser, each preceded by a separator line.
    for parser in parsers:
        print(separator)
        print(parser.metadata)
    print(separator)

    # Closer look at the last parser created (thedailybeast.com): individual
    # fields, their absolute-URL forms, and the discrete URL.
    last = parsers[-1]
    for field in ('title', 'canonical', 'url'):
        print(last.get_metadata(field))
    for field in ('canonical', 'url'):
        print(last.absolute_url(last.get_metadata(field)))
    print(last.get_discrete_url())
if 0:
    # Disabled demo (change 0 to 1 to run): print labelled URL-related fields
    # for a liqr.co link.
    parser = MetadataParser(url='http://liqr.co/rsvpnewyork')
    # Each entry is (label, thunk). The thunk is only called after its label
    # has been printed, so output order matches a plain print-by-print script.
    report = (
        ("title:", lambda: parser.get_metadata('title')),
        ("canonical:", lambda: parser.get_metadata('canonical')),
        ("url:", lambda: parser.get_metadata('url')),
        ("absolute_url-canonical:",
         lambda: parser.absolute_url(parser.get_metadata('canonical'))),
        ("absolute_url-url:",
         lambda: parser.absolute_url(parser.get_metadata('url'))),
        ("get_discrete_url:", lambda: parser.get_discrete_url()),
    )
    for label, compute in report:
        print(label)
        print(compute())
if 0:
    # Disabled demo (change 0 to 1 to run): dump every attribute stored on the
    # parser instance built for a TED talk page.
    ted_parser = MetadataParser(
        url='http://www.ted.com/talks/drew_curtis_how_i_beat_a_patent_troll.html'
    )
    print(vars(ted_parser))
if 0:
    # Disabled demo (change 0 to 1 to run): parse HTML read from the local
    # file broken.html (passed via html=) instead of fetching a URL.
    # The context manager closes the file as soon as it has been read; a bare
    # open(...).read() would leave the handle open until garbage collection.
    with open('broken.html', 'r') as html_file:
        broken_html = html_file.read()
    # Live-URL alternative to the local file:
    # a= MetadataParser(url="http://brewskeeball.com/rosenblog")
    a = MetadataParser(html=broken_html)
    print(a.get_metadata('title'))
if 0:
    # Disabled demo (change 0 to 1 to run): build a parser for each URL in
    # turn and dump the attributes stored on the instance.
    sample_urls = (
        'http://www.cnn.com',
        'http://www.cnn.com/',
        'http://www.michaeleisen.org/blog/?p=358',
        'http://www.nasa.gov/externalflash/discovery/index.html',
        'http://hw.libsyn.com/p/d/d/6/dd6b0db2d4858640/ARIYNBF_107_JamesGunn.mp3?sid=78edb823ad1b62ff6f329d68bbb2cc6a&l_sid=35168&l_eid=&l_mid=2952818&expiration=1334720066&hwt=7acfe1754c8dedc4f134b473894c9208',
    )
    for sample_url in sample_urls:
        parser = MetadataParser(url=sample_url)
        print(vars(parser))
if 0:
    # Disabled demo (change 0 to 1 to run): dump the attributes stored on the
    # parser instance built for a SoundCloud profile URL.
    soundcloud_parser = MetadataParser(url='http://soundcloud.com/electricyouthmusic')
    print(vars(soundcloud_parser))