def scan_company_names(name_list, name1, results=0, ro_thresold=None):
    """Scan a list of company names, searching for best matches against
    the given name.

    Notice that this function takes a list of strings, and not a list
    of dictionaries.

    name_list is iterated as (key, name) pairs.  name1 is the name to
    match.  If results is greater than zero, at most that many matches
    are returned.  ro_thresold is the minimum ratio a candidate must
    reach to be kept; when None, 0.6 is used.

    Return a list of (ratio, (key, analyze_company_name(name))) tuples,
    sorted from the highest ratio to the lowest; for each key only the
    best-scoring name is kept.
    """
    if ro_thresold is not None:
        threshold = ro_thresold
    else:
        threshold = 0.6
    sm1 = SequenceMatcher()
    sm1.set_seq1(name1.lower())
    resd = {}
    withoutCountry = not name1.endswith(']')
    for i, n in name_list:
        # XXX: on Symbian, here we get a str; not sure this is the
        #      right place to fix it.
        # 'bytes' is an alias of 'str' on Python 2.6+, so this is the
        # same check as before there, and it no longer depends on the
        # 'unicode' builtin, which does not exist on Python 3.
        if isinstance(n, bytes):
            n = n.decode('latin1', 'ignore')
        o_name = n
        var = 0.0
        if withoutCountry and n.endswith(']'):
            # The searched name carries no trailing [...] part: compare
            # against the candidate without it, with a small penalty.
            cidx = n.rfind('[')
            if cidx != -1:
                n = n[:cidx].rstrip()
                var = -0.05
        # Distance with the company name.
        ratio = ratcliff(name1, n, sm1) + var
        if ratio >= threshold:
            # Keep only the best match for a given key; the analyzed
            # name is built from the original (unstripped) string.
            if i not in resd or ratio > resd[i][0]:
                resd[i] = (ratio, (i, analyze_company_name(o_name)))
    # Keys are unique, so no two entries compare equal and this gives
    # the same order as the former sort() followed by reverse().
    res = sorted(resd.values(), reverse=True)
    if results > 0:
        res = res[:results]
    return res
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """Parser for the results of a company search.

    Every selected table cell is turned into a 2-tuple made of
    analyze_imdbid(link) and analyze_company_name(name + notes,
    stripNotes=True).
    """
    _BaseParser = DOMBasicCompanyParser
    _notDirectHitTitle = '<title>find - imdb'
    _linkPrefix = '/company/co'

    def _titleBuilder(self, x):
        # Thin wrapper around build_company_name.
        return build_company_name(x)

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'notes': "./text()[1]"
            },
            postprocess=lambda x: (
                analyze_imdbid(x.get('link')),
                analyze_company_name(
                    # 'notes' may be missing or None: fall back to ''.
                    x.get('name') + (x.get('notes') or ''),
                    stripNotes=True
                )
            )
        )
    ]

    extractors = [
        Extractor(
            label='search',
            # Select the parent cell of every link to a company page.
            path="//td[@class='result_text']/a[starts-with(@href, '/company/co')]/..",
            attrs=_attrs
        )
    ]
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(
            label='name',
            # note the extra trailing space in class
            path="//h1/span[@class='display-title ']",
            attrs=Attribute(
                key='name',
                path="./text()",
                postprocess=lambda x: analyze_company_name(x, stripNotes=True)
            )
        ),
        Extractor(
            label='filmography',
            group="//b/a[@name]",
            group_key="./text()",
            group_key_normalize=lambda x: x.lower(),
            path="../following-sibling::ol[1]/li",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'link': "./a[1]/@href",
                    'title': "./a[1]/text()",
                    'year': "./text()[1]"
                },
                postprocess=lambda x: build_movie(
                    # 'year' can be missing or None when the list item
                    # has no text node; guard it the same way 'link'
                    # is guarded, instead of failing on None.strip().
                    '%s %s' % (x.get('title'), (x.get('year') or '').strip()),
                    movieID=analyze_imdbid(x.get('link') or ''),
                    _parsingCompany=True
                )
            )
        )
    ]

    # Insert a closing </p> in front of every '<b><a name=' sequence.
    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1')
    ]

    def postprocess_data(self, data):
        """Rename the keys of data and return it.

        In every key, 'company' is replaced with 'companies', 'other'
        with 'miscellaneous' and 'distributor' with 'distributors';
        the dictionary is modified in place.
        """
        # Iterate over a copy of the keys: the dictionary is modified
        # inside the loop.
        for key in list(data.keys()):
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data[key]
                del data[key]
        return data
class DOMBasicCompanyParser(DOMBasicMovieParser):
    """Simply get the name of a company and the imdbID.

    It's used by the DOMHTMLSearchCompanyParser class to return a result
    for a direct match (when a search on IMDb results in a single
    company, the web server sends directly the company page).
    """

    def _titleFunct(self, x):
        # A false value (e.g. None) is analyzed as the empty string.
        return analyze_company_name(x or '')
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.
    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    rules = [
        Rule(
            key='name',
            extractor=Path(
                # Note the trailing space inside the class value.
                '//h1/span[@class="display-title "]/text()',
                transform=lambda x: analyze_company_name(x, stripNotes=True)
            )
        ),
        Rule(
            key='filmography',
            extractor=Rules(
                foreach='//b/a[@name]',
                rules=[
                    Rule(
                        key=Path('./text()', transform=str.lower),
                        extractor=Rules(
                            foreach='../following-sibling::ol[1]/li',
                            rules=[
                                Rule(key='link', extractor=Path('./a[1]/@href')),
                                Rule(key='title', extractor=Path('./a[1]/text()')),
                                Rule(key='year', extractor=Path('./text()[1]'))
                            ],
                            transform=lambda x: build_movie(
                                # 'year' can be missing or None when the
                                # list item has no text node; guard it
                                # the same way 'link' is guarded,
                                # instead of failing on None.strip().
                                '%s %s' % (x.get('title'),
                                           (x.get('year') or '').strip()),
                                movieID=analyze_imdbid(x.get('link') or ''),
                                _parsingCompany=True
                            )
                        )
                    )
                ]
            )
        )
    ]

    # Insert a closing </p> in front of every '<b><a name=' sequence.
    preprocessors = [(re.compile('(<b><a name=)', re.I), r'</p>\1')]

    def postprocess_data(self, data):
        """Flatten and rename the keys of data, then return it.

        If data['name'] is a dictionary, its items are merged into data
        in place of the 'name' key.  After that, in every key 'company'
        is replaced with 'companies', 'other' with 'miscellaneous' and
        'distributor' with 'distributors'.  The dictionary is modified
        in place.
        """
        for key in ['name']:
            if (key in data) and isinstance(data[key], dict):
                subdata = data[key]
                del data[key]
                data.update(subdata)
        # Iterate over a copy of the keys: the dictionary is modified
        # inside the loop.
        for key in list(data.keys()):
            new_key = key.replace('company', 'companies')
            new_key = new_key.replace('other', 'miscellaneous')
            new_key = new_key.replace('distributor', 'distributors')
            if new_key != key:
                data[new_key] = data[key]
                del data[key]
        return data
class DOMBasicKeywordParser(DOMBasicMovieParser):
    """Simply get the name of a keyword.

    It's used by the DOMHTMLSearchKeywordParser class to return a result
    for a direct match (when a search on IMDb results in a single
    keyword, the web server sends directly the keyword page).
    """
    # XXX: it's still to be tested!
    # I'm not even sure there can be a direct hit, searching for keywords.

    def _titleFunct(self, x):
        # NOTE(review): a keyword is passed to analyze_company_name here;
        # confirm that this is intended and not a copy/paste leftover.
        return analyze_company_name(x or '')
class DOMBasicCompanyParser(DOMBasicMovieParser):
    """Simply get the name of a company and the imdbID.

    It's used by the DOMHTMLSearchCompanyParser class to return a result
    for a direct match (when a search on IMDb results in a single
    company, the web server sends directly the company page).
    """
    _titleAttrPath = ".//text()"
    _linkPath = "//a[starts-with(@href, 'http://pro.imdb.com/company/')]"

    def _titleFunct(self, x):
        # A false value (e.g. None) is analyzed as the empty string.
        return analyze_company_name(x or u'')
def start_a(self, attrs):
    """Handle an opening <a> tag.

    When the href (lowercased) contains '/company/co' and starts with
    'http://pro.', and both an ID matched by self.re_imdbID and a
    non-empty self._name are available, the parser is reset and
    self._result is set to [(companyID, analyze_company_name(name))].
    In every other case nothing is changed.
    """
    link = self.get_attr_value(attrs, 'href')
    if not link:
        return
    link = link.lower()
    # FIXME: not every company page has a link to IMDbPro, so
    #        _many_ companyIDs can't be retrieved at all. :-/
    if '/company/co' not in link or not link.startswith('http://pro.'):
        return
    found_ids = self.re_imdbID.findall(link)
    if not (found_ids and self._name):
        return
    company_name = self._name.strip()
    # The last match in the link is used as the ID.
    company_id = str(found_ids[-1])
    result = [(company_id, analyze_company_name(company_name))]
    # The result is built before reset() and stored after it.
    self.reset()
    self._result = result
def get_company_main(self, companyID):
    """Return the main information about the given company.

    The name is looked up with getCompanyName and analyzed with
    analyze_company_name; if getCompanyFilmography returns something,
    it's merged into the resulting dictionary.

    Return a dictionary of the form {'data': info}.
    Raise IMDbDataAccessError if no name is found for companyID.
    """
    # Both lookups read the same two companies files.
    companies_index = '%scompanies.index' % self.__db
    companies_data = '%scompanies.data' % self.__db
    name = getCompanyName(companyID, companies_index, companies_data)
    if not name:
        # Call form of raise: valid on both Python 2 and Python 3
        # (the "raise Class, message" form is Python 2 only).
        raise IMDbDataAccessError(
            'unable to get companyID "%s"' % companyID)
    res = analyze_company_name(name)
    filmography = getCompanyFilmography(companyID,
                                        companies_index,
                                        companies_data,
                                        '%stitles.index' % self.__db,
                                        '%stitles.key' % self.__db)
    if filmography:
        res.update(filmography)
    return {'data': res}
def set_name(self, name):
    """Set the name of the company.

    If the stripped name ends with ')' and contains a '(', everything
    from the first '(' onwards is treated as notes.  When self.notes is
    already set, the whole stripped name is analyzed and the notes are
    left untouched; otherwise only the part before the notes is
    analyzed and the extracted notes (if any) are stored in self.notes.
    """
    # Company diverges a bit from other classes, being able
    # to directly handle its "notes".  AND THAT'S PROBABLY A BAD IDEA!
    full_name = name.strip()
    bare_name = full_name
    extracted_notes = ''
    if full_name.endswith(')'):
        head, paren, tail = full_name.partition('(')
        if paren:
            extracted_notes = paren + tail
            bare_name = head.rstrip()
    if self.notes:
        # Notes are already known: don't split them out of the name.
        bare_name = full_name
    self.data.update(analyze_company_name(bare_name))
    if extracted_notes and not self.notes:
        self.notes = extracted_notes
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """A parser for the company search page.

    Every td of class "result_text" is turned into a 2-tuple made of
    analyze_imdbid(link) and analyze_company_name(name + notes,
    stripNotes=True).
    """

    rules = [
        Rule(
            key='data',
            extractor=Rules(
                foreach='//td[@class="result_text"]',
                rules=[
                    Rule(
                        key='link',
                        extractor=Path('./a/@href', reduce=reducers.first)
                    ),
                    Rule(
                        key='name',
                        extractor=Path('./a/text()')
                    ),
                    Rule(
                        key='notes',
                        extractor=Path('./text()')
                    )
                ],
                transform=lambda x: (
                    analyze_imdbid(x.get('link')),
                    analyze_company_name(
                        # A missing 'notes' key defaults to ''.
                        x.get('name') + x.get('notes', ''),
                        stripNotes=True
                    )
                )
            )
        )
    ]
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """Parser for the results of a company search.

    Every selected table cell is turned into a 2-tuple made of
    analyze_imdbid(link) and analyze_company_name(name + notes,
    stripNotes=True).
    """
    _linkPrefix = '/company/co'

    rules = [
        Rule(
            key='data',
            extractor=Rules(
                # Select the parent cell of every link to a company page.
                foreach='//td[@class="result_text"]/a[starts-with(@href, "/company/co")]/..',
                rules=[
                    Rule(key='link', extractor=Path('./a[1]/@href')),
                    Rule(key='name', extractor=Path('./a[1]/text()')),
                    Rule(key='notes', extractor=Path('./text()[1]'))
                ],
                transform=lambda x: (
                    analyze_imdbid(x.get('link')),
                    analyze_company_name(
                        # 'notes' may be missing or None: fall back to ''.
                        x.get('name') + (x.get('notes') or ''),
                        stripNotes=True
                    )
                )
            )
        )
    ]
from imdb.parser.common.cutils import search_company_name def _scan_company_names(keyFile, name1, results=0): """Scan the given file, using the cutils.search_company_name C function, for a given name.""" name1 = name1.encode('latin_1', 'replace') try: st = search_company_name(keyFile, name1, results) except IOError, e: import warnings warnings.warn('unable to access companies information; ' 'please run the companies4local.py script: %s.' % e) return [] res = [] for x in st: tmpd = analyze_company_name(latin2utf(x[2])) res.append((x[0], (x[1], tmpd))) return res except ImportError: import warnings warnings.warn('Unable to import the cutils.search_company_name function.' ' Searching company names using the "local" data access' ' system will be a bit slower.') from imdb.parser.common.locsql import scan_company_names def _readCompanyNamsKeyFile(keyFile): """Iterate over the given file, returning tuples suited for the common.locsql.scan_company_names function.""" try: kf = open(keyFile, 'r') except IOError, e: raise IMDbDataAccessError, str(e)
def get_data(self):
    """Return the collected data.

    self._title is stripped in place; if it's not empty, the result of
    analyze_company_name(self._title) is merged into self._data before
    self._data is returned.
    """
    title = self._title.strip()
    self._title = title
    if not title:
        # Nothing to analyze: return the data as it is.
        return self._data
    self._data.update(analyze_company_name(title))
    return self._data
def _analyze_company_name(n, *args, **kwds):
    """Call analyze_company_name(n, stripNotes=True).

    analyze_company_name doesn't accept the 'canonical' parameter:
    any extra positional and keyword arguments are accepted here and
    ignored.
    """
    return analyze_company_name(n, stripNotes=True)