def getMovieCodeByAPI(urlOpener, mTitle, mYear):
    """Query the IMDb by-title API and return the entry that best matches.

    Every ``ImdbEntity`` element of the XML response becomes a candidate with
    its code (``id`` attribute), title (first child node), year (digits found
    in the first four characters of its ``Description``) and the similarity
    ratio between its title and ``mTitle``.

    Args:
        urlOpener: object whose ``open(url)`` returns a readable response.
        mTitle: title to search for.
        mYear: expected release year (string or int); must be accepted by
            ``int()`` whenever at least one candidate is found.

    Returns:
        A ``(code, title, year)`` tuple:
        * the candidate with the highest ratio among those at most one year
          away from ``mYear``, if that ratio is above 0.5;
        * otherwise the first candidate, if its year equals ``mYear``;
        * ``("ce0000000", "Connection error", 0)`` if the URL cannot be
          opened;
        * ``(None, None, None)`` when nothing acceptable is found.
    """
    urlAdr = IMDBbyTitleAPI + urllib.urlencode({"q": mTitle})
    try:
        url = urlOpener.open(urlAdr)
    except Exception:
        print("UrlOpener error: Unable to open: " + urlAdr)
        return "ce0000000", "Connection error", 0

    document = minidom.parseString(url.read())
    findList = []
    for movie in document.getElementsByTagName("ImdbEntity"):
        title_node = movie.firstChild
        # A missing or non-text first child yields an empty title instead of
        # an IndexError / a crash inside the matcher.
        movieFoundTitle = (title_node.nodeValue if title_node is not None else None) or ""
        movieFoundCode = movie.getAttribute("id")
        for description in movie.getElementsByTagName("Description"):
            text_node = description.firstChild
            prefix = ((text_node.nodeValue if text_node is not None else None) or "")[:4]
            # Keep only the digits; the "0" prefix makes a description without
            # a leading year parse as year 0 instead of failing in int().
            digits = "".join(ch for ch in prefix if ch.isdigit())
            movieFoundYear = int("0" + digits)
            titleRatio = sDiff(None, mTitle, movieFoundTitle).ratio()
            findList.append({
                "code": movieFoundCode,
                "title": movieFoundTitle,
                "year": movieFoundYear,
                "ratio": titleRatio,
            })

    # Best match: highest ratio among candidates within one year of mYear.
    best = None
    best_ratio = 0
    for candidate in findList:
        if abs(int(candidate["year"]) - int(mYear)) > 1:
            continue
        if candidate["ratio"] > best_ratio:
            best = candidate
            best_ratio = candidate["ratio"]
    if best is not None and best_ratio > 0.5:
        return best["code"], best["title"], best["year"]

    # Optimistic fallback: trust the first API result when its year is exact.
    if findList and str(findList[0]["year"]) == str(mYear):
        first = findList[0]
        return first["code"], first["title"], first["year"]

    # No result.
    return None, None, None
def checkTitle(htmlCode, title, year):
    """Find the suggestion that matches ``title`` and ``year`` in a response.

    ``htmlCode`` is expected to look like ``callback({...})``: the text between
    the first "(" and the last ")" is evaluated as a literal, and its "d" list
    holds one dictionary per suggestion ("id", "l" = title, "y" = year).

    Args:
        htmlCode: raw response text.
        title: title to look for.
        year: expected year; must be accepted by ``int()`` when at least one
            complete suggestion is present.

    Returns:
        ``(1, id, title, year)`` for the first suggestion whose year equals
        ``year`` and whose title similarity ratio is above 0.8; otherwise
        ``(0, "tt0000000", "Unknow Title", "0000")``.
    """
    no_match = (0, "tt0000000", "Unknow Title", "0000")
    print("FIRE!")

    # Strip the callback wrapper. Without a wrapper the text is used as is,
    # rather than blindly dropping its last character.
    start = htmlCode.find("(")
    end = htmlCode.rfind(")")
    if start != -1 and end > start:
        payload = htmlCode[start + 1:end]
    else:
        payload = htmlCode

    try:
        data = ast.literal_eval(payload)
    except Exception:
        # literal_eval raises several unrelated exception types on bad input;
        # all of them mean "cannot parse" here.
        # NOTE(review): JSON-only tokens (true/false/null) also end up here.
        print("EVAL Error: ", payload)
        return no_match

    if not isinstance(data, dict):
        return no_match

    # "d" list contains a dictionary for every suggestion; it may be absent.
    for a in data.get("d", []):
        print(a.get("id"), a.get("l"), a.get("y"))
        # Suggestions without a year (or other fields) cannot be matched;
        # they are skipped instead of raising KeyError.
        if "y" not in a or "l" not in a or "id" not in a:
            continue
        ratio = sDiff(None, title, a["l"]).ratio()
        if a["y"] == int(year) and ratio > 0.8:
            print("Found: ", a["id"], a["l"], a["y"])
            return 1, a["id"], a["l"], a["y"]
    return no_match
def checkTitle(htmlCode, title, year):
    """Search a suggestion response for an entry matching title and year.

    The response is expected to have the shape ``callback({...})``. The text
    between the first "(" and the last ")" is evaluated as a literal; its "d"
    list holds one dictionary per suggestion with the keys "id", "l" (title)
    and "y" (year).

    Args:
        htmlCode: raw response text.
        title: title to look for.
        year: expected year; must be accepted by ``int()`` when at least one
            complete suggestion is present.

    Returns:
        ``(1, id, title, year)`` for the first suggestion whose year equals
        ``year`` and whose title similarity ratio is above 0.8; otherwise
        ``(0, "tt0000000", "Unknow Title", "0000")``.
    """
    not_found = (0, "tt0000000", "Unknow Title", "0000")
    print("FIRE!")

    # Remove the callback wrapper; when there is none keep the whole text
    # instead of cutting off its last character.
    open_pos = htmlCode.find("(")
    close_pos = htmlCode.rfind(")")
    if open_pos != -1 and close_pos > open_pos:
        payload = htmlCode[open_pos + 1:close_pos]
    else:
        payload = htmlCode

    try:
        suggestions = ast.literal_eval(payload)
    except Exception:
        # literal_eval signals malformed input with several exception types;
        # every one of them is treated as "cannot parse".
        # NOTE(review): JSON-only tokens (true/false/null) also end up here.
        print("EVAL Error: ", payload)
        return not_found

    if not isinstance(suggestions, dict):
        return not_found

    # The "d" list carries the suggestions; it may be missing entirely.
    for a in suggestions.get("d", []):
        print(a.get("id"), a.get("l"), a.get("y"))
        # Entries lacking a year, title or id are skipped rather than
        # raising KeyError.
        if not ("y" in a and "l" in a and "id" in a):
            continue
        ratio = sDiff(None, title, a["l"]).ratio()
        if a["y"] == int(year) and ratio > 0.8:
            print("Found: ", a["id"], a["l"], a["y"])
            return 1, a["id"], a["l"], a["y"]
    return not_found
def getMovieCode(urlOpener, mTitle, mYear):
    """Look a movie up on the IMDb "find" page and return the best match.

    When the request is redirected straight to a title page, code, title and
    year are scraped from that page. Otherwise every search result, and every
    ``aka "..."`` alternative title listed with it, becomes a candidate scored
    by the similarity between its title and ``mTitle``.

    Args:
        urlOpener: object whose ``open(url)`` returns a response offering
            ``geturl()`` and ``read()``.
        mTitle: title to search for.
        mYear: expected release year; must be accepted by ``int()`` whenever
            at least one candidate is found.

    Returns:
        A ``(code, title, year)`` tuple:
        * ``("ee0000000", "Connection error", 0)`` if the URL cannot be opened;
        * the scraped values for a direct hit (title ``"Not Found"`` and year
          ``"0"`` when the page cannot be parsed);
        * the candidate with the highest ratio among those at most one year
          away from ``mYear``, if that ratio is above 0.5;
        * ``("bm0000000", "Bad matches", 0)`` otherwise.
    """
    urlAdr = IMDBakas + "find?" + urllib.urlencode({"q": mTitle, "s": "all"})
    try:
        url = urlOpener.open(urlAdr)
    except Exception:
        return "ee0000000", "Connection error", 0

    urlAdrRed = url.geturl()
    urlHTML = url.read()

    # A redirect to /title/tt<digits> means IMDb found exactly one title.
    # Matching the digits (instead of a fixed-width slice) keeps ids longer
    # than seven digits intact.
    code_match = re.search(r"/title/(tt\d+)", urlAdrRed)
    if code_match:
        mCode = code_match.group(1)
        page_match = re.search(
            r'itemprop="name">([\W\w]*?)<span class="nobr">[\n\w\W]*?'
            r'\((<a href="\/year\/\d+\/">)?(\d+)(<\/a>)?\)<\/span>',
            urlHTML,
        )
        if page_match is None:
            return mCode, "Not Found", "0"
        movieTitle = htmlFilter(page_match.group(1).replace("\n", ""))
        # Group 2 is the optional <a> tag around the year; the digits
        # themselves are group 3.
        movieYear = page_match.group(3)
        return mCode, movieTitle, movieYear

    result_pattern = re.compile(
        r"link=\/title\/tt(\d+)\/'"
        r';">([\w\W]+?)<\/a>\s\((\d+)[\w\W]*?\)([\w\W]+?)<\/td>'
    )
    aka_pattern = re.compile(r'aka "([\w\W]+?)"')

    findList = []
    for result in result_pattern.finditer(urlHTML):
        movieCode = "tt" + str(result.group(1))
        movieYear = result.group(3)
        # Results with images leave markup before the title; keep only what
        # follows the last ';">'.
        movieTitle = htmlFilter(result.group(2)).rpartition(';">')[2]
        findList.append({
            "code": movieCode,
            "title": movieTitle,
            "year": movieYear,
            "ratio": sDiff(None, mTitle, movieTitle).ratio(),
        })

        # Alternative (AKA) titles of this result share its code and year.
        for akaResult in aka_pattern.finditer(result.group(4)):
            akaTitle = htmlFilter(akaResult.group(1))
            findList.append({
                "code": movieCode,
                "title": akaTitle,
                "year": movieYear,
                "ratio": sDiff(None, mTitle, akaTitle).ratio(),
            })

    # Best match: highest ratio among candidates within one year of mYear.
    best = None
    best_ratio = 0
    for candidate in findList:
        if abs(int(candidate["year"]) - int(mYear)) > 1:
            continue
        if candidate["ratio"] > best_ratio:
            best = candidate
            best_ratio = candidate["ratio"]

    if best is not None and best_ratio > 0.5:
        return best["code"], best["title"], best["year"]
    return "bm0000000", "Bad matches", 0
def getMovieCodeByAPI(urlOpener, mTitle, mYear):
    """Ask the IMDb by-title API for ``mTitle`` and pick the best entry.

    Each ``ImdbEntity`` element of the XML answer is turned into a candidate:
    code from its ``id`` attribute, title from its first child node, year from
    the digits in the first four characters of its ``Description`` and a
    similarity ratio between its title and ``mTitle``.

    Args:
        urlOpener: object whose ``open(url)`` returns a readable response.
        mTitle: title to search for.
        mYear: expected release year (string or int); must be accepted by
            ``int()`` whenever at least one candidate is found.

    Returns:
        A ``(code, title, year)`` tuple:
        * the candidate with the highest ratio among those at most one year
          away from ``mYear``, if that ratio is above 0.5;
        * otherwise the first candidate, if its year equals ``mYear``;
        * ``("ce0000000", "Connection error", 0)`` if the URL cannot be
          opened;
        * ``(None, None, None)`` when nothing acceptable is found.
    """
    urlAdr = IMDBbyTitleAPI + urllib.urlencode({'q': mTitle})
    try:
        url = urlOpener.open(urlAdr)
    except Exception:
        print("UrlOpener error: Unable to open: " + urlAdr)
        return "ce0000000", "Connection error", 0

    IMDBfoundAPI = minidom.parseString(url.read())
    findList = []
    for movie in IMDBfoundAPI.getElementsByTagName("ImdbEntity"):
        first_child = movie.firstChild
        # An entity without a leading text node gets an empty title rather
        # than raising IndexError or breaking the matcher.
        movieFoundTitle = (first_child.nodeValue if first_child is not None else None) or ""
        movieFoundCode = movie.getAttribute("id")
        for a in movie.getElementsByTagName("Description"):
            year_node = a.firstChild
            leading = ((year_node.nodeValue if year_node is not None else None) or "")[:4]
            # Only digits survive; the "0" prefix turns "no digits at all"
            # into year 0 instead of an int() failure.
            movieFoundYear = int("0" + "".join(ch for ch in leading if ch.isdigit()))
            findList.append({
                'code': movieFoundCode,
                'title': movieFoundTitle,
                'year': movieFoundYear,
                'ratio': sDiff(None, mTitle, movieFoundTitle).ratio(),
            })

    # Get the best match: highest ratio within one year of the wanted year.
    winner = None
    winner_ratio = 0
    for movie in findList:
        if abs(int(movie['year']) - int(mYear)) > 1:
            continue
        if movie['ratio'] > winner_ratio:
            winner = movie
            winner_ratio = movie['ratio']
    if winner is not None and winner_ratio > 0.5:
        return winner['code'], winner['title'], winner['year']

    # Optimistic fallback: accept the first API result when its year is exact.
    if findList and str(findList[0]['year']) == str(mYear):
        return findList[0]['code'], findList[0]['title'], findList[0]['year']

    # No result.
    return None, None, None
def getMovieCode(urlOpener, mTitle, mYear):
    """Search the IMDb "find" page for ``mTitle`` and return the best match.

    A redirect to a title page is treated as a direct hit and scraped for
    title and year. Otherwise each ``result_text`` entry of the result page,
    plus each ``aka "..."`` alternative title, becomes a candidate scored by
    the similarity between its title and ``mTitle``.

    Args:
        urlOpener: object whose ``open(url)`` returns a response offering
            ``geturl()`` and ``read()``.
        mTitle: title to search for.
        mYear: expected release year; must be accepted by ``int()`` whenever
            at least one candidate is found.

    Returns:
        A ``(code, title, year)`` tuple:
        * ``("ce0000000", "Connection error", 0)`` if the URL cannot be opened;
        * the scraped values for a direct hit (title ``"Not Found"`` and year
          ``"0"`` when the page cannot be parsed);
        * the candidate with the highest ratio among those at most one year
          away from ``mYear``, if that ratio is above 0.5;
        * ``("bm0000000", "Bad matches", 0)`` otherwise.
    """
    urlAdr = IMDBakas + "find?" + urllib.urlencode({'q': mTitle, 's': 'all'})
    try:
        url = urlOpener.open(urlAdr)
    except Exception:
        return "ce0000000", "Connection error", 0

    urlAdrRed = url.geturl()
    urlHTML = url.read()

    # Direct hit: the final URL already points at /title/tt<digits>. Capturing
    # all digits keeps ids longer than seven digits intact.
    code_match = re.search(r"/title/(tt\d+)", urlAdrRed)
    if code_match:
        mCode = code_match.group(1)
        page_match = re.search(
            r'itemprop="name">([\W\w]*?)<span class="nobr">[\n\w\W]*?'
            r'\((<a href="\/year\/\d+\/">)?(\d+)(<\/a>)?\)<\/span>',
            urlHTML,
        )
        if page_match is None:
            return mCode, "Not Found", "0"
        movieTitle = htmlFilter(page_match.group(1).replace("\n", ""))
        # The year digits are group 3; group 2 is only the optional <a> tag.
        movieYear = page_match.group(3)
        return mCode, movieTitle, movieYear

    findList = []

    result_pattern = re.compile(
        r'"result_text"> <a href="\/title\/(tt\d+).*?>(.+?)<\/a>.*?\((\d+)\)',
        re.MULTILINE | re.DOTALL)
    for result in result_pattern.finditer(urlHTML):
        # Results with images leave markup before the title; keep only what
        # follows the last ';">'.
        movieTitle = htmlFilter(result.group(2)).rpartition(';">')[2]
        findList.append({
            'code': str(result.group(1)),
            'title': movieTitle,
            'year': result.group(3),
            'ratio': sDiff(None, mTitle, movieTitle).ratio(),
        })

    # Find AKA titles: scanned over the whole page, each carrying the code
    # and year of the result it belongs to.
    aka_pattern = re.compile(
        r'"result_text"> <a href="\/title\/(tt\d+).*?\((\d+)\).*?aka.*?"(.*?)"',
        re.MULTILINE | re.DOTALL)
    for akaResult in aka_pattern.finditer(urlHTML):
        akaTitle = htmlFilter(akaResult.group(3))
        findList.append({
            'code': akaResult.group(1),
            'title': akaTitle,
            'year': akaResult.group(2),
            'ratio': sDiff(None, mTitle, akaTitle).ratio(),
        })

    # Get the best match: highest ratio within one year of the wanted year.
    best = None
    best_ratio = 0
    for movie in findList:
        if abs(int(movie['year']) - int(mYear)) > 1:
            continue
        if movie['ratio'] > best_ratio:
            best = movie
            best_ratio = movie['ratio']

    if best is not None and best_ratio > 0.5:
        return best['code'], best['title'], best['year']
    return "bm0000000", "Bad matches", 0
def getMovieCode(self, mTitle, mYear):
    """Search the IMDb "find" page for ``mTitle`` and return the best match.

    The request is attempted up to ``MAX_RETRY`` times. A redirect to a title
    page is treated as a direct hit and scraped for title and year. Otherwise
    each ``result_text`` entry of the result page, plus each ``aka "..."``
    alternative title, becomes a candidate scored by the similarity between
    its title and ``mTitle``; an alternative title that contains ``mTitle``
    is scored at least 0.6.

    Args:
        mTitle: title to search for (must offer ``encode``).
        mYear: expected release year; must be accepted by ``int()`` whenever
            at least one candidate is found.

    Returns:
        A ``self.ImdbFoundMovie`` instance: the direct hit, or the candidate
        with the highest ratio among those at most one year away from
        ``mYear`` if that ratio is above 0.5, or one built with
        ``RESULT_BAD_MATCH`` when the connection fails or nothing qualifies.
    """
    # Retry the request; ``webResponse`` stays None only if every attempt
    # failed (no magic sentinel that could collide with MAX_RETRY).
    webResponse = None
    for _ in range(MAX_RETRY):
        try:
            webResponse = self.webSession.get(
                self.IMDBakas + "find",
                params={"ref_": "nv_sr_fn", "q": mTitle.encode("utf-8"), "s": "all"},
            )
            break
        except Exception:
            continue
    if webResponse is None:
        print("ERROR FOUND: Connection failed at imdb.getMovieCode() - " + mTitle + "(" + str(mYear) + ")")
        return self.ImdbFoundMovie(result=self.ImdbFoundMovie.RESULT_BAD_MATCH)

    urlAdrRed = webResponse.url
    urlHTML = webResponse.text

    # Direct hit: the final URL already points at /title/tt<digits>. Capturing
    # all digits keeps ids longer than seven digits intact.
    code_match = re.search(r"/title/(tt\d+)", urlAdrRed)
    if code_match:
        mCode = code_match.group(1)
        page_match = re.search(
            r'itemprop="name">([\W\w]*?)<\/span>[\n\w\W]*?<span class="nobr">'
            r'[\n\w\W]*?\(<a href="[\w\W]+?>(\d+)<\/a>',
            urlHTML,
        )
        if page_match is None:
            movieTitle = "Not Found"
            movieYear = "0"
        else:
            movieTitle = page_match.group(1).replace("\n", "")
            movieYear = page_match.group(2).replace("\n", "")
        return self.ImdbFoundMovie(mCode, movieTitle, movieYear)

    findList = []

    result_pattern = re.compile(
        r'"result_text"> <a href="\/title\/(tt\d+).*?>(.+?)<\/a>.*?\((\d+)\)',
        re.MULTILINE | re.DOTALL
    )
    for result in result_pattern.finditer(urlHTML):
        # Results with images leave markup before the title; keep only what
        # follows the last ';">'.
        movieTitle = result.group(2).rpartition(';">')[2]
        findList.append(self.ImdbFoundMovie(
            code=str(result.group(1)),
            title=movieTitle,
            year=result.group(3),
            ratio=sDiff(None, mTitle, movieTitle).ratio(),
        ))

    # Find AKA titles: scanned over the whole page, each carrying the code
    # and year of the result it belongs to.
    aka_pattern = re.compile(
        r'"result_text"> <a href="\/title\/(tt\d+).*?\((\d+)\).*?aka.*?"(.*?)"',
        re.MULTILINE | re.DOTALL
    )
    for akaResult in aka_pattern.finditer(urlHTML):
        akaTitle = akaResult.group(3)
        titleRatio = sDiff(None, mTitle, akaTitle).ratio()
        if mTitle in akaTitle:
            # An AKA containing the searched title is an acceptable match
            # even when the overall similarity is low.
            titleRatio = max(titleRatio, 0.6)
        findList.append(self.ImdbFoundMovie(
            code=akaResult.group(1),
            title=akaTitle,
            year=akaResult.group(2),
            ratio=titleRatio,
        ))

    # Get the best match: highest ratio within one year of the wanted year.
    best = None
    best_ratio = 0
    for movie in findList:
        if abs(int(movie.get_year()) - int(mYear)) > 1:
            continue
        if movie.get_ratio() > best_ratio:
            best = movie
            best_ratio = movie.get_ratio()

    if best is not None and best_ratio > 0.5:
        return best
    return self.ImdbFoundMovie(result=self.ImdbFoundMovie.RESULT_BAD_MATCH)
def getMovieCodeByAPI(self, mTitle, mYear):
    """Query the IMDb by-title API and return the entry that best matches.

    The request is attempted up to ``MAX_RETRY`` times. The response text is
    passed through BeautifulSoup's ``get_text()`` before being parsed as XML.
    Each ``ImdbEntity`` element becomes a candidate: code from its ``id``
    attribute, title from its first child node, year from the digits in the
    first four characters of its ``Description`` and a similarity ratio
    between its title and ``mTitle``.

    Args:
        mTitle: title to search for (must offer ``encode``).
        mYear: expected release year (string or int); must be accepted by
            ``int()`` whenever at least one candidate is found.

    Returns:
        A ``self.ImdbFoundMovie`` instance: the candidate with the highest
        ratio among those at most one year away from ``mYear`` if that ratio
        is above 0.5; otherwise the first candidate if its year equals
        ``mYear``; otherwise one built with ``RESULT_BAD_MATCH`` (also used
        when the connection fails).
    """
    # Retry the request; ``webResponse`` stays None only if every attempt
    # failed (no magic sentinel that could collide with MAX_RETRY).
    webResponse = None
    for _ in range(MAX_RETRY):
        try:
            webResponse = self.webSession.get(self.IMDBbyTitleAPI, params={"q": mTitle.encode("utf-8")})
            break
        except Exception:
            continue
    if webResponse is None:
        print("ERROR FOUND: Connection failed at imdb.getMovieCodeByAPI() - " + mTitle + " (" + str(mYear) + ")")
        return self.ImdbFoundMovie(result=self.ImdbFoundMovie.RESULT_BAD_MATCH)

    # NOTE(review): the original author found that parsing only works after
    # this round trip through BeautifulSoup and did not know why; presumably
    # the payload arrives entity-escaped - confirm against a live response.
    soup = BeautifulSoup(webResponse.text)
    urlHTML = soup.get_text()

    IMDBfoundAPI = minidom.parseString(urlHTML)
    findList = []
    for movie in IMDBfoundAPI.getElementsByTagName("ImdbEntity"):
        first_child = movie.firstChild
        # An entity without a leading text node gets an empty title rather
        # than raising IndexError or breaking the matcher.
        movieFoundTitle = (first_child.nodeValue if first_child is not None else None) or ""
        movieFoundCode = movie.getAttribute("id")
        for a in movie.getElementsByTagName("Description"):
            # (The former debug dump iterated the element itself, which
            # minidom does not support, so it was dropped.)
            year_node = a.firstChild
            leading = ((year_node.nodeValue if year_node is not None else None) or "")[:4]
            # Only digits survive; the "0" prefix turns "no digits at all"
            # into year 0 instead of an int() failure.
            movieFoundYear = int("0" + "".join(ch for ch in leading if ch.isdigit()))
            findList.append(
                self.ImdbFoundMovie(
                    code=movieFoundCode,
                    title=movieFoundTitle,
                    year=movieFoundYear,
                    ratio=sDiff(None, mTitle, movieFoundTitle).ratio(),
                )
            )

    # Get the best match: highest ratio within one year of the wanted year.
    best = None
    best_ratio = 0
    for movie in findList:
        if abs(int(movie.get_year()) - int(mYear)) > 1:
            continue
        if movie.get_ratio() > best_ratio:
            best = movie
            best_ratio = movie.get_ratio()
    if best is not None and best_ratio > 0.5:
        return best

    # Optimistic fallback: accept the first API result when its year is exact.
    if findList and str(findList[0].get_year()) == str(mYear):
        return findList[0]

    # No result.
    return self.ImdbFoundMovie(result=self.ImdbFoundMovie.RESULT_BAD_MATCH)
def getMovieCodeByAPI(self, mTitle, mYear):
    """Query the IMDb by-title API and return the entry that best matches.

    The URL is opened through ``self.webSession.open`` up to ``MAX_RETRY``
    times. Each ``ImdbEntity`` element of the XML response becomes a
    candidate: code from its ``id`` attribute, title from its first child
    node, year from the digits in the first four characters of its
    ``Description`` and a similarity ratio between its title and ``mTitle``.

    Args:
        mTitle: title to search for (must offer ``encode``).
        mYear: expected release year (string or int); must be accepted by
            ``int()`` whenever at least one candidate is found.

    Returns:
        A ``self.ImdbFoundMovie`` instance: the candidate with the highest
        ratio among those at most one year away from ``mYear`` if that ratio
        is above 0.5; otherwise the first candidate if its year equals
        ``mYear``; otherwise one built with ``RESULT_BAD_MATCH`` (also used
        when the connection fails).
    """
    urlAdr = self.IMDBbyTitleAPI + urllib.urlencode({'q': mTitle.encode('utf-8')})

    # Retry the request; ``webResponse`` stays None only if every attempt
    # failed (no magic sentinel that could collide with MAX_RETRY).
    webResponse = None
    for _ in range(MAX_RETRY):
        try:
            webResponse = self.webSession.open(urlAdr)
            break
        except Exception:
            continue
    if webResponse is None:
        print("ERROR FOUND: Connection failed at imdb.getMovieCodeByAPI() - " + mTitle + " (" + str(mYear) + ")")
        return self.ImdbFoundMovie(result=self.ImdbFoundMovie.RESULT_BAD_MATCH)

    IMDBfoundAPI = minidom.parseString(webResponse.read())
    findList = []
    for movie in IMDBfoundAPI.getElementsByTagName("ImdbEntity"):
        first_child = movie.firstChild
        # An entity without a leading text node gets an empty title rather
        # than raising IndexError or breaking the matcher.
        movieFoundTitle = (first_child.nodeValue if first_child is not None else None) or ""
        movieFoundCode = movie.getAttribute("id")
        for a in movie.getElementsByTagName("Description"):
            year_node = a.firstChild
            leading = ((year_node.nodeValue if year_node is not None else None) or "")[:4]
            # Only digits survive; the "0" prefix turns "no digits at all"
            # into year 0 instead of an int() failure.
            movieFoundYear = int("0" + "".join(ch for ch in leading if ch.isdigit()))
            findList.append(self.ImdbFoundMovie(
                code=movieFoundCode,
                title=movieFoundTitle,
                year=movieFoundYear,
                ratio=sDiff(None, mTitle, movieFoundTitle).ratio(),
            ))

    # Get the best match: highest ratio within one year of the wanted year.
    best = None
    best_ratio = 0
    for movie in findList:
        if abs(int(movie.get_year()) - int(mYear)) > 1:
            continue
        if movie.get_ratio() > best_ratio:
            best = movie
            best_ratio = movie.get_ratio()
    if best is not None and best_ratio > 0.5:
        return best

    # Optimistic fallback: accept the first API result when its year is exact.
    if findList and str(findList[0].get_year()) == str(mYear):
        return findList[0]

    # No result.
    return self.ImdbFoundMovie(result=self.ImdbFoundMovie.RESULT_BAD_MATCH)
def get_title_ratio(self):
    """Return the similarity ratio between the searched and the found title.

    Compares ``self.__search_title`` with ``self.__title`` through ``sDiff``
    and returns the resulting ``ratio()``. Any exception raised while doing
    so propagates to the caller unchanged.
    """
    matcher = sDiff(None, self.__search_title, self.__title)
    return matcher.ratio()