Пример #1
0
def scan_company_names(name_list, name1, results=0, ro_thresold=None):
    """Scan company names, searching for the best matches against name1.

    name_list is iterated as (key, name) pairs: every name is a plain
    string, not a dictionary.  When name1 does not end with ']', a
    candidate ending with a '[...]' tag is compared without that tag
    and its score is lowered by 0.05.

    Returns a list of (ratio, (key, analyze_company_name(name))) tuples
    sorted by decreasing ratio, keeping only the best-scoring entry for
    each key; the list is cut to the first `results` items when
    results > 0.  ro_thresold (spelling kept for existing callers) is
    the minimum accepted ratio; it defaults to 0.6.
    """
    if ro_thresold is not None:
        threshold = ro_thresold
    else:
        threshold = 0.6
    sm1 = SequenceMatcher()
    sm1.set_seq1(name1.lower())
    best = {}
    without_country = not name1.endswith(']')
    for key, name in name_list:
        # XXX: on Symbian, here we get a byte string; not sure this is
        #      the right place to fix it.
        # `bytes` is an alias of `str` on Python 2.6+, so this check keeps
        # the old behaviour there and is also correct on Python 3.
        if isinstance(name, bytes):
            name = name.decode('latin1', 'ignore')
        full_name = name
        penalty = 0.0
        if without_country and name.endswith(']'):
            cidx = name.rfind('[')
            if cidx != -1:
                name = name[:cidx].rstrip()
                penalty = -0.05
        # Distance with the company name.
        ratio = ratcliff(name1, name, sm1) + penalty
        if ratio < threshold:
            continue
        # The parsed data are built from the untouched name.
        if key not in best or ratio > best[key][0]:
            best[key] = (ratio, (key, analyze_company_name(full_name)))
    # Keys are unique, so the comparison is always settled before the
    # dictionaries stored in the tuples would have to be compared.
    res = sorted(best.values(), reverse=True)
    if results > 0:
        res = res[:results]
    return res
Пример #2
0
def scan_company_names(name_list, name1, results=0, ro_thresold=None):
    """Search name_list for the company names closest to name1.

    name_list must yield (key, name) pairs, where name is a plain string
    (not a dictionary).  If name1 does not end with ']', a trailing
    '[...]' tag is removed from a candidate before the comparison, and
    0.05 is subtracted from that candidate's ratio.

    The return value is a list of
    (ratio, (key, analyze_company_name(name))) tuples, best match first,
    with at most one entry (the highest ratio) per key; only the first
    `results` entries are kept when results > 0.  Candidates whose ratio
    is below ro_thresold (default 0.6; the misspelling is part of the
    public signature) are discarded.
    """
    RO_THRESHOLD = 0.6 if ro_thresold is None else ro_thresold
    sm1 = SequenceMatcher()
    sm1.set_seq1(name1.lower())
    resd = {}
    withoutCountry = not name1.endswith(']')
    for i, n in name_list:
        # XXX: on Symbian, here we get a byte string; not sure this is
        #      the right place to fix it.
        # On Python 2.6+ `bytes` is `str`, so this is the same test as
        # before; on Python 3 it only decodes real byte strings.
        if isinstance(n, bytes):
            n = n.decode('latin1', 'ignore')
        o_name = n
        var = 0.0
        if withoutCountry and n.endswith(']'):
            cidx = n.rfind('[')
            if cidx != -1:
                n = n[:cidx].rstrip()
                var = -0.05
        # Distance with the company name.
        ratio = ratcliff(name1, n, sm1) + var
        if ratio >= RO_THRESHOLD:
            previous = resd.get(i)
            # Store the entry if the key is new or the score improved.
            if previous is None or ratio > previous[0]:
                resd[i] = (ratio, (i, analyze_company_name(o_name)))
    # dict.values() is a view on Python 3: build a real list to sort it.
    res = list(resd.values())
    res.sort(reverse=True)
    if results > 0:
        del res[results:]
    return res
Пример #3
0
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """Parser for the results of a company search.

    Each extracted result is converted to a tuple holding
    analyze_imdbid(link) and analyze_company_name(name + notes),
    the latter called with stripNotes=True.
    """
    _BaseParser = DOMBasicCompanyParser
    _notDirectHitTitle = '<title>find - imdb'
    _linkPrefix = '/company/co'

    def _titleBuilder(self, x):
        return build_company_name(x)

    _attrs = [
        Attribute(
            key='data',
            multi=True,
            path={
                'link': "./a[1]/@href",
                'name': "./a[1]/text()",
                'notes': "./text()[1]",
            },
            # Missing notes are treated as the empty string.
            postprocess=lambda item: (
                analyze_imdbid(item.get('link')),
                analyze_company_name(
                    item.get('name') + (item.get('notes') or ''),
                    stripNotes=True),
            ),
        )
    ]

    extractors = [
        Extractor(
            label='search',
            path="//td[@class='result_text']/a[starts-with(@href, '/company/co')]/..",
            attrs=_attrs,
        )
    ]
Пример #4
0
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.

    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example:
        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    extractors = [
        Extractor(
            label='name',
            # note the extra trailing space in class
            path="//h1/span[@class='display-title ']",
            attrs=Attribute(
                key='name',
                path="./text()",
                postprocess=lambda text: analyze_company_name(
                    text, stripNotes=True),
            ),
        ),
        Extractor(
            label='filmography',
            group="//b/a[@name]",
            group_key="./text()",
            group_key_normalize=lambda heading: heading.lower(),
            path="../following-sibling::ol[1]/li",
            attrs=Attribute(
                key=None,
                multi=True,
                path={
                    'link': "./a[1]/@href",
                    'title': "./a[1]/text()",
                    'year': "./text()[1]",
                },
                # Title and year are joined into one string for build_movie.
                postprocess=lambda item: build_movie(
                    '%s %s' % (item.get('title'), item.get('year').strip()),
                    movieID=analyze_imdbid(item.get('link') or ''),
                    _parsingCompany=True),
            ),
        ),
    ]

    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1'),
    ]

    def postprocess_data(self, data):
        """Rename the keys of data and return it (modified in place).

        In every key the substrings 'company', 'other' and 'distributor'
        are replaced, in this order, with 'companies', 'miscellaneous'
        and 'distributors'.
        """
        substitutions = (
            ('company', 'companies'),
            ('other', 'miscellaneous'),
            ('distributor', 'distributors'),
        )
        # Work on a snapshot of the keys: the dictionary changes below.
        for old_key in tuple(data.keys()):
            renamed = old_key
            for found, replacement in substitutions:
                renamed = renamed.replace(found, replacement)
            if renamed == old_key:
                continue
            data[renamed] = data.pop(old_key)
        return data
Пример #5
0
class DOMBasicCompanyParser(DOMBasicMovieParser):
    """Simply get the name of a company and the imdbID.

    It's used by the DOMHTMLSearchCompanyParser class to return a result
    for a direct match (when a search on IMDb results in a single
    company, the web server sends directly the company page).
    """

    def _titleFunct(self, x):
        # A false value (e.g. None) is parsed as the empty string.
        return analyze_company_name(x or '')
class DOMCompanyParser(DOMParserBase):
    """Parser for the main page of a given company.

    The page should be provided as a string, as taken from
    the www.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section.

    Example::

        cparser = DOMCompanyParser()
        result = cparser.parse(company_html_string)
    """
    _containsObjects = True

    rules = [
        Rule(
            key='name',
            extractor=Path(
                '//h1/span[@class="display-title "]/text()',
                transform=lambda text: analyze_company_name(
                    text, stripNotes=True),
            ),
        ),
        Rule(
            key='filmography',
            extractor=Rules(
                foreach='//b/a[@name]',
                rules=[
                    Rule(
                        # The lower-cased text of the anchor is the key.
                        key=Path('./text()', transform=str.lower),
                        extractor=Rules(
                            foreach='../following-sibling::ol[1]/li',
                            rules=[
                                Rule(key='link',
                                     extractor=Path('./a[1]/@href')),
                                Rule(key='title',
                                     extractor=Path('./a[1]/text()')),
                                Rule(key='year',
                                     extractor=Path('./text()[1]')),
                            ],
                            transform=lambda item: build_movie(
                                '%s %s' % (item.get('title'),
                                           item.get('year').strip()),
                                movieID=analyze_imdbid(item.get('link') or ''),
                                _parsingCompany=True),
                        ),
                    ),
                ],
            ),
        ),
    ]

    preprocessors = [
        (re.compile('(<b><a name=)', re.I), r'</p>\1'),
    ]

    def postprocess_data(self, data):
        """Flatten and rename the entries of data, then return it.

        If data['name'] is a dictionary, its items are moved to the top
        level of data.  After that, in every key the substrings
        'company', 'other' and 'distributor' are replaced, in this
        order, with 'companies', 'miscellaneous' and 'distributors'.
        The dictionary is modified in place.
        """
        if ('name' in data) and isinstance(data['name'], dict):
            # Remove the nested dictionary before merging its items.
            data.update(data.pop('name'))
        substitutions = (
            ('company', 'companies'),
            ('other', 'miscellaneous'),
            ('distributor', 'distributors'),
        )
        # Work on a snapshot of the keys: the dictionary changes below.
        for old_key in tuple(data.keys()):
            renamed = old_key
            for found, replacement in substitutions:
                renamed = renamed.replace(found, replacement)
            if renamed == old_key:
                continue
            data[renamed] = data.pop(old_key)
        return data
Пример #7
0
class DOMBasicKeywordParser(DOMBasicMovieParser):
    """Simply get the name of a keyword.

    It's used by the DOMHTMLSearchKeywordParser class to return a result
    for a direct match (when a search on IMDb results in a single
    keyword, the web server sends directly the keyword page).
    """
    # XXX: it's still to be tested!
    # I'm not even sure there can be a direct hit, searching for keywords.

    def _titleFunct(self, x):
        # A false value (e.g. None) is parsed as the empty string.
        return analyze_company_name(x or '')
Пример #8
0
class DOMBasicCompanyParser(DOMBasicMovieParser):
    """Simply get the name of a company and the imdbID.

    It's used by the DOMHTMLSearchCompanyParser class to return a result
    for a direct match (when a search on IMDb results in a single
    company, the web server sends directly the company page).
    """
    _linkPath = "//a[starts-with(@href, 'http://pro.imdb.com/company/')]"
    _titleAttrPath = ".//text()"

    def _titleFunct(self, x):
        # A false value (e.g. None) is parsed as the empty string.
        return analyze_company_name(x or u'')
 def start_a(self, attrs):
     """Handle an <a> tag, looking for the IMDbPro link of a company.

     When the lower-cased href starts with 'http://pro.', contains
     '/company/co', yields at least one match of self.re_imdbID and a
     name was collected in self._name, the parser is reset and
     self._result is set to [(companyID, analyze_company_name(name))].
     """
     link = self.get_attr_value(attrs, 'href')
     if not link:
         return
     link = link.lower()
     # FIXME: not every company page has a link to IMDbPro, so
     #        _many_ companyIDs can't be retrieved at all. :-/
     if not link.startswith('http://pro.') or '/company/co' not in link:
         return
     found_ids = self.re_imdbID.findall(link)
     if not (found_ids and self._name):
         return
     company_name = self._name.strip()
     # The last ID found in the link is the one that is used.
     company_id = str(found_ids[-1])
     parsed = analyze_company_name(company_name)
     # Everything needed is computed before reset() is called.
     self.reset()
     self._result = [(company_id, parsed)]
Пример #10
0
 def get_company_main(self, companyID):
     """Return the main information about the given company.

     The name returned by getCompanyName is parsed with
     analyze_company_name; the result of getCompanyFilmography, when
     not empty, is merged into the same dictionary.

     Returns a dictionary in the form {'data': <company dictionary>}.
     Raises IMDbDataAccessError if no name is found for companyID.
     """
     index_file = '%scompanies.index' % self.__db
     data_file = '%scompanies.data' % self.__db
     name = getCompanyName(companyID, index_file, data_file)
     if not name:
         # Call form of raise: same effect as "raise Exc, msg" on
         # Python 2, and valid syntax on Python 3 as well.
         raise IMDbDataAccessError(
                         'unable to get companyID "%s"' % companyID)
     res = analyze_company_name(name)
     filmography = getCompanyFilmography(companyID,
                                         index_file,
                                         data_file,
                                         '%stitles.index' % self.__db,
                                         '%stitles.key' % self.__db)
     if filmography:
         res.update(filmography)
     return {'data': res}
Пример #11
0
 def start_a(self, attrs):
     """Look at an <a> tag for the IMDbPro link of the company.

     Nothing is done unless the href (compared lower-cased) contains
     '/company/co', starts with 'http://pro.', matches self.re_imdbID
     and self._name is set; in that case the parser is reset and
     self._result becomes [(companyID, analyze_company_name(name))].
     """
     target = self.get_attr_value(attrs, 'href')
     if not target:
         return
     target = target.lower()
     # FIXME: not every company page has a link to IMDbPro, so
     #        _many_ companyIDs can't be retrieved at all. :-/
     is_pro_company_link = ('/company/co' in target
                            and target.startswith('http://pro.'))
     if not is_pro_company_link:
         return
     matches = self.re_imdbID.findall(target)
     if matches and self._name:
         stripped_name = self._name.strip()
         # Only the last ID matched in the link is kept.
         last_id = str(matches[-1])
         result = [(last_id, analyze_company_name(stripped_name))]
         # The result is built first, since reset() comes before storing it.
         self.reset()
         self._result = result
Пример #12
0
 def get_company_main(self, companyID):
     """Collect the main data of the company identified by companyID.

     getCompanyName provides the name, which is then parsed by
     analyze_company_name; a non-empty result of
     getCompanyFilmography is added to the parsed dictionary.

     Returns {'data': <company dictionary>}.
     Raises IMDbDataAccessError when the name cannot be retrieved.
     """
     db = self.__db
     companies_index = '%scompanies.index' % db
     companies_data = '%scompanies.data' % db
     name = getCompanyName(companyID, companies_index, companies_data)
     if not name:
         # "raise Exc, msg" is a syntax error on Python 3; the call
         # form below behaves the same on Python 2.
         raise IMDbDataAccessError(
                         'unable to get companyID "%s"' % companyID)
     res = analyze_company_name(name)
     filmography = getCompanyFilmography(companyID,
                                         companies_index,
                                         companies_data,
                                         '%stitles.index' % db,
                                         '%stitles.key' % db)
     if filmography:
         res.update(filmography)
     return {'data': res}
Пример #13
0
 def set_name(self, name):
     """Set the name of the company.

     When the stripped name ends with ')', the text starting at its
     first '(' is considered to be the notes.  If self.notes is not
     already set, the name is parsed without that part and the notes
     are stored in self.notes; otherwise the whole name is parsed.
     The parsed data are merged into self.data.
     """
     # Company diverges a bit from other classes, being able
     # to directly handle its "notes".  AND THAT'S PROBABLY A BAD IDEA!
     whole_name = name.strip()
     bare_name = whole_name
     found_notes = ''
     if whole_name.endswith(')'):
         paren_idx = whole_name.find('(')
         if paren_idx != -1:
             found_notes = whole_name[paren_idx:]
             bare_name = whole_name[:paren_idx].rstrip()
     # With pre-existing notes, nothing is stripped from the name.
     if self.notes:
         to_parse = whole_name
     else:
         to_parse = bare_name
     self.data.update(analyze_company_name(to_parse))
     if found_notes and not self.notes:
         self.notes = found_notes
Пример #14
0
 def set_name(self, name):
     """Set the name of the company.

     A stripped name ending with ')' is split at its first '(': the
     second part is taken as the notes.  The part before it is what
     gets parsed (and the notes are saved in self.notes) only if
     self.notes is still empty; otherwise the complete name is parsed.
     The result of the parsing is merged into self.data.
     """
     # Company diverges a bit from other classes, being able
     # to directly handle its "notes".  AND THAT'S PROBABLY A BAD IDEA!
     complete = name.strip()
     head, tail = complete, ''
     first_paren = complete.find('(') if complete.endswith(')') else -1
     if first_paren != -1:
         head = complete[:first_paren].rstrip()
         tail = complete[first_paren:]
     # Existing notes win: the name is then parsed as it was given.
     chosen = complete if self.notes else head
     parsed = analyze_company_name(chosen)
     self.data.update(parsed)
     if tail and not self.notes:
         self.notes = tail
Пример #15
0
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """A parser for the company search page.

    Every td element with class "result_text" is converted to a tuple:
    analyze_imdbid(link) and analyze_company_name(name + notes), the
    latter called with stripNotes=True.
    """

    rules = [
        Rule(
            key='data',
            extractor=Rules(
                foreach='//td[@class="result_text"]',
                rules=[
                    Rule(key='link',
                         extractor=Path('./a/@href', reduce=reducers.first)),
                    Rule(key='name', extractor=Path('./a/text()')),
                    Rule(key='notes', extractor=Path('./text()')),
                ],
                # "or ''" covers both a missing 'notes' key and one that
                # is present with a None value (the default of dict.get
                # handles only the former).
                transform=lambda x: (
                    analyze_imdbid(x.get('link')),
                    analyze_company_name(
                        x.get('name') + (x.get('notes') or ''),
                        stripNotes=True),
                ),
            ),
        )
    ]
Пример #16
0
class DOMHTMLSearchCompanyParser(DOMHTMLSearchMovieParser):
    """Parser for the results of a company search.

    The parent element of every '/company/co' link found in a
    "result_text" cell is converted to a tuple: analyze_imdbid(link)
    and analyze_company_name(name + notes), called with stripNotes=True.
    """
    _linkPrefix = '/company/co'

    rules = [
        Rule(
            key='data',
            extractor=Rules(
                foreach='//td[@class="result_text"]/a[starts-with(@href, "/company/co")]/..',
                rules=[
                    Rule(key='link', extractor=Path('./a[1]/@href')),
                    Rule(key='name', extractor=Path('./a[1]/text()')),
                    Rule(key='notes', extractor=Path('./text()[1]')),
                ],
                # Missing notes are treated as the empty string.
                transform=lambda item: (
                    analyze_imdbid(item.get('link')),
                    analyze_company_name(
                        item.get('name') + (item.get('notes') or ''),
                        stripNotes=True),
                ),
            ),
        )
    ]
Пример #17
0
    from imdb.parser.common.cutils import search_company_name

    def _scan_company_names(keyFile, name1, results=0):
        """Scan the given file, using the cutils.search_company_name
        C function, for a given name.

        name1 is encoded to latin_1 (replacing what can't be encoded)
        before the call.  For every item x returned by the C function,
        the result contains (x[0], (x[1], name_dict)), where name_dict
        is analyze_company_name applied to latin2utf(x[2]).

        If the key file can't be accessed (IOError), a warning is
        issued and an empty list is returned.
        """
        name1 = name1.encode('latin_1', 'replace')
        try:
            st = search_company_name(keyFile, name1, results)
        # "except IOError, e" is a syntax error on Python 3; the "as"
        # form is accepted by Python 2.6+ and Python 3.
        except IOError as e:
            import warnings
            warnings.warn('unable to access companies information; '
                    'please run the companies4local.py script: %s.' % e)
            return []
        return [(x[0], (x[1], analyze_company_name(latin2utf(x[2]))))
                for x in st]
except ImportError:
    import warnings
    warnings.warn('Unable to import the cutils.search_company_name function.'
                    '  Searching company names using the "local" data access'
                    ' system will be a bit slower.')

    from imdb.parser.common.locsql import scan_company_names

    def _readCompanyNamsKeyFile(keyFile):
        """Iterate over the given file, returning tuples suited for
        the common.locsql.scan_company_names function."""
        try: kf = open(keyFile, 'r')
        except IOError, e: raise IMDbDataAccessError, str(e)
Пример #18
0
    from imdb.parser.common.cutils import search_company_name

    def _scan_company_names(keyFile, name1, results=0):
        """Search keyFile for name1 with the cutils.search_company_name
        C function.

        The name is passed to the C function encoded in latin_1, with
        the characters that can't be encoded replaced.  Returns a list
        with a (x[0], (x[1], name_dict)) tuple for every item x of the
        result, name_dict being analyze_company_name(latin2utf(x[2])).

        An IOError raised by the C function is not propagated: a
        warning is issued and an empty list is returned.
        """
        name1 = name1.encode('latin_1', 'replace')
        try:
            st = search_company_name(keyFile, name1, results)
        # The "as" form replaces the Python 2-only "except IOError, e";
        # it works on Python 2.6+ and on Python 3.
        except IOError as e:
            import warnings
            warnings.warn('unable to access companies information; '
                    'please run the companies4local.py script: %s.' % e)
            return []
        res = []
        for first, second, raw_name in (x[:3] for x in st):
            tmpd = analyze_company_name(latin2utf(raw_name))
            res.append((first, (second, tmpd)))
        return res
except ImportError:
    import warnings
    warnings.warn('Unable to import the cutils.search_company_name function.'
                    '  Searching company names using the "local" data access'
                    ' system will be a bit slower.')

    from imdb.parser.common.locsql import scan_company_names

    def _readCompanyNamsKeyFile(keyFile):
        """Iterate over the given file, returning tuples suited for
        the common.locsql.scan_company_names function."""
        try: kf = open(keyFile, 'r')
        except IOError, e: raise IMDbDataAccessError, str(e)
Пример #19
0
 def get_data(self):
     """Return self._data, updated with the parsed company name.

     self._title is replaced by its stripped value; if it is not
     empty, the result of analyze_company_name on it is merged into
     self._data before returning it.
     """
     title = self._title.strip()
     self._title = title
     if title:
         parsed = analyze_company_name(title)
         self._data.update(parsed)
     return self._data
Пример #20
0
def _analyze_company_name(n, *args, **kwds):
    """Call analyze_company_name(n, stripNotes=True).

    Extra positional and keyword arguments are accepted and ignored:
    analyze_company_name doesn't accept the 'canonical' parameter.
    """
    company_info = analyze_company_name(n, stripNotes=True)
    return company_info
Пример #21
0
def _analyze_company_name(n, *args, **kwds):
    """Parse the company name n, always stripping its notes.

    analyze_company_name doesn't accept the 'canonical' parameter, so
    any argument after n is discarded instead of being passed on.
    """
    del args, kwds  # accepted for the callers' sake, never used
    return analyze_company_name(n, stripNotes=True)
Пример #22
0
 def get_data(self):
     """Strip self._title, parse it and return the collected data.

     The dictionary returned by analyze_company_name is merged into
     self._data only when the stripped title is not empty; self._data
     is returned in any case.
     """
     self._title = self._title.strip()
     if not self._title:
         return self._data
     name_info = analyze_company_name(self._title)
     self._data.update(name_info)
     return self._data