-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions.py
105 lines (82 loc) · 3.28 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import codecs
import json
import urllib.request
import urllib.response
import urllib.parse
from lxml import html
# Relative directory that write_lists_to_json and read_lists_from_json
# prepend to the file names they are given.
output_dir = 'output'
def read_film_list(page_index):
    """Download one page of the Taste of Cinema film-list index and parse it.

    The page URL is built from ``page_index``, fetched with
    ``get_url_content``, decoded as UTF-8 and fed to a ``TocListParser``.

    :param page_index: number of the listing page; converted with ``str``.
    :return: the value returned by ``TocListParser.get_film_list()``.
    """
    page_url = ''.join((
        'http://www.tasteofcinema.com/category/lists/film-lists/page/',
        str(page_index),
        '/',
    ))
    page_bytes = get_url_content(page_url)
    print("Parsing: ", page_url)

    # Function-scope import, kept where the original code had it.
    from toc_parser.list_parser import TocListParser

    list_parser = TocListParser()
    list_parser.feed(page_bytes.decode('utf-8'))
    list_parser.close()
    return list_parser.get_film_list()
def get_url_content(website):
    """Fetch ``website`` and return the raw, undecoded response body.

    The request is sent with a UTF-8 ``Accept-Charset`` header and a desktop
    Chrome ``User-Agent`` string.

    :param website: URL to open.
    :return: the response body as ``bytes``.
    :raises: any exception raised by ``urllib.request.urlopen`` or by reading
        the response propagates unchanged.
    """
    headers = {
        'Accept-Charset': 'utf-8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36',
    }
    request = urllib.request.Request(website, headers=headers)
    # The context manager closes the response even when read() raises, so a
    # failed download no longer leaks the underlying connection.
    with urllib.request.urlopen(request) as response:
        return response.read()
def write_lists_to_json(film_list, file_name):
    """Serialize ``film_list`` as JSON into ``output_dir/file_name``.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are written as-is rather than escaped.

    :param film_list: JSON-serializable object to write.
    :param file_name: name of the file inside ``output_dir``.
    """
    # Function-scope import: the module's import block does not provide os.
    import os

    # Create the output directory on first use; without this a fresh checkout
    # fails with FileNotFoundError before anything is written.
    os.makedirs(output_dir, exist_ok=True)

    file_path = output_dir + '/' + file_name
    with codecs.open(file_path, "w", 'utf-8') as output_file:
        json.dump(film_list, output_file, ensure_ascii=False, indent=4)
def read_lists_from_json(file_name):
    """Load and return the JSON document stored at ``output_dir/file_name``.

    :param file_name: name of the file inside ``output_dir``.
    :return: the object produced by ``json.load`` for that file, which is
        opened as UTF-8.
    """
    source_path = '/'.join((output_dir, file_name))
    with codecs.open(source_path, "r", 'utf-8') as source:
        return json.load(source)
def remove_bad_results(movies):
    """Drop the "Subscribe via " marker and everything after it.

    :param movies: list of scraped text entries.
    :return: a new list holding the entries before the first
        ``"Subscribe via "`` item. When the marker is absent, a copy of the
        whole list is returned instead of raising ``ValueError``.
    """
    try:
        cutoff = movies.index("Subscribe via ")
    except ValueError:
        # No marker among the entries: there is nothing to trim.
        return list(movies)
    return movies[:cutoff]
def get_movies(list_url):
    """Scrape the movie titles and pagination links from one list page.

    The page is fetched with ``get_url_content`` and parsed with
    ``html.fromstring``. Titles are the text of every ``span`` carrying the
    inline style ``font-family: Helvetica; font-size: 20px;``, trimmed by
    ``remove_bad_results``. Pagination links are the ``href`` values of the
    anchors under ``p.pages > span``.

    :param list_url: URL of the list page.
    :return: a ``(movies, other_pages)`` tuple.
    """
    document = html.fromstring(get_url_content(list_url))
    span_texts = document.xpath('//span[@style="font-family: Helvetica; font-size: 20px;"]/text()')
    titles = remove_bad_results(span_texts)
    print("Parsing... " + list_url)
    page_links = document.xpath('//p[@class="pages"]/span/a/@href')
    return titles, page_links
# def get_imdb_link_google(movie):
# imdb_prefix = 'http://www.imdb'
# google_search_page = 'http://www.google.ro/search?q='
# url_encoded_movie = urllib.parse.quote_plus(movie + " imdb")
# url = google_search_page + url_encoded_movie
# content = get_url_content(url)
# tree = html.fromstring(content)
# text = tree.xpath('.//ol[@id="rso"]/div[1]/div[1]//cite/text()')
# imdb_link = imdb_prefix + text[1]
# print("Search for:", movie, " link:", imdb_link)
# return imdb_link
def get_imdb_link_fom_omdb(movie, imdb_movies, notfound):
    """Look up ``movie`` via ``get_movie_json`` and record the outcome.

    For each entry in ``movie['years']`` the title is looked up with that
    year (``'N/A'`` is treated as an empty year). If that lookup fails and a
    year was given, the lookup is retried once without a year.

    :param movie: mapping with a ``'title'`` and an iterable ``'years'``.
    :param imdb_movies: dict updated in place; successful replies are stored
        under their ``'imdbID'``.
    :param notfound: list updated in place; ``"<title>; <year>"`` is appended
        for every lookup that ultimately fails.
    :return: ``None``; nothing is done when the title is empty.
    """
    title = movie['title']
    if not title:
        return

    for listed_year in movie['years']:
        year = '' if listed_year == 'N/A' else listed_year

        reply = get_movie_json(title, year)
        if reply['Response'] != 'True' and year:
            # The year-specific lookup failed: retry on the title alone.
            reply = get_movie_json(title, '')

        if reply['Response'] == 'True':
            imdb_movies[reply['imdbID']] = reply
        else:
            notfound.append(title + "; " + year)
def get_movie_json(title, year):
    """Query omdbapi.com for ``title`` and return the decoded JSON reply.

    :param title: movie title, sent as the ``t`` query parameter.
    :param year: year, sent as the ``y`` query parameter; may be an empty
        string. Non-string values such as ``int`` are accepted.
    :return: the reply parsed with ``json.loads``; its ``'Response'`` entry is
        printed together with the request URL.
    """
    # urlencode escapes every value (the original left ``year`` unescaped)
    # and yields the same "t=...&y=...&r=json" string for plain inputs.
    query = urllib.parse.urlencode([("t", title), ("y", year), ("r", "json")])
    url = "http://www.omdbapi.com/?" + query
    content = get_url_content(url)
    json_load = json.loads(content.decode())
    print(url, json_load['Response'])
    return json_load