Example #1
0
def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching.

    Reads the IMDb movie and AKA-title lists, writes one tab-separated
    record per title to a compressed ``<dbfile>.idx`` file, and tracks
    word frequencies along the way.  The number of ratings per title is
    included in each record so search results can be ranked by popularity.
    """
    # Ratings are looked up per title so popular titles rank higher.
    rating_map = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()

    # Word frequencies are collected while the searchable list is written.
    word_counts = Counter()
    out = open_compressed(dbfile + '.idx', mode='w')

    # Index every IMDb title from both the main and the AKA lists.
    dup_count = 0
    sources = (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
               parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search())
    for source in sources:
        prev = None
        for record in source:
            if len(record) == 1:
                # movies.list.gz entry: just the title itself.
                info = parsers.parse_title(record[0])
                aka_of = ''
            else:
                # aka-titles.list.gz entry: (real title, AKA title, ...).
                info = parsers.parse_title(record[1])
                aka_of = record[0]
                # The same AKA may repeat with different regions, e.g.
                #     (aka Die Hard 4.0 (2007)) (UK)
                #     (aka Die Hard 4.0 (2007)) (Germany)
                # Only the first occurrence is indexed.
                if prev and prev[0:2] == record[0:2]:
                    dup_count += 1
                    continue
                prev = record
            words = _clean_word(info.name).split(' ')
            # Accumulate word frequencies for the (optional) stopword list.
            word_counts.update(words)
            # Rating count for ranking: AKA entries rank by the real title.
            rating_key = aka_of if aka_of else info.title
            if rating_key in rating_map:
                num_ratings = rating_map[rating_key].nratings
            else:
                num_ratings = 0
            # Emit one tab-separated record for this title.
            fields = (''.join(words),
                      info.year.encode('ascii') if info.year else '',
                      info.title.encode('utf-8'),
                      aka_of.encode('utf-8'),
                      str(num_ratings))
            out.write("\t".join(fields))
            out.write("\n")
    out.close()

    # Dump word frequencies as stopword candidates (currently disabled).
    if False:
        swf = ChunkedFile(dbfile, 'stopwords', mode='a')
        for word, count in word_counts.most_common():
            swf.write("%s %d\n" % (word, count))
        swf.close()
Example #2
0
def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching.

    Writes a compressed ``<dbfile>.idx`` file containing one line per
    title (searchable text, year, title, AKA-of title, rating count),
    covering both the main movie list and the AKA-title list.
    """
    # Number of ratings is stored per record to weight search scores.
    ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()

    def rating_count(real_title):
        # Rating lookup for ranking; unrated titles count as 0.
        if real_title in ratings:
            return ratings[real_title].nratings
        return 0

    # Word frequencies are tallied while the searchable list is emitted.
    frequencies = Counter()
    indexfh = open_compressed(dbfile + '.idx', mode='w')

    skipped = 0
    iterators = (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
                 parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search())
    for iterator in iterators:
        last_seen = None
        for entry in iterator:
            is_aka = len(entry) != 1
            if not is_aka:
                # movies.list.gz record.
                parsed = parsers.parse_title(entry[0])
                akafor = ''
            else:
                # aka-titles.list.gz record: entry[1] is the AKA name,
                # entry[0] the real title it refers to.
                parsed = parsers.parse_title(entry[1])
                akafor = entry[0]
                # The same AKA title may repeat (one line per region):
                #     (aka Die Hard 4.0 (2007)) (UK)
                #     (aka Die Hard 4.0 (2007)) (Germany)
                # Index only the first occurrence.
                if last_seen and last_seen[0:2] == entry[0:2]:
                    skipped += 1
                    continue
                last_seen = entry
            searchable = _clean_word(parsed.name).split(' ')
            frequencies.update(searchable)
            # AKA records rank by their real title's rating count.
            nratings = rating_count(akafor if akafor else parsed.title)
            # One tab-separated record per title.
            year_field = parsed.year.encode('ascii') if parsed.year else ''
            record = "\t".join((''.join(searchable),
                                year_field,
                                parsed.title.encode('utf-8'),
                                akafor.encode('utf-8'),
                                str(nratings)))
            indexfh.write(record)
            indexfh.write("\n")
    indexfh.close()

    # Stopword output from the collected frequencies (disabled).
    if False:
        swf = ChunkedFile(dbfile, 'stopwords', mode='a')
        for word, numtimes in frequencies.most_common():
            swf.write("%s %d\n" % (word, numtimes))
        swf.close()
Example #3
0
    def rebuild_index(self, do_copy=True):
        """Create an index for this file, to allow rapid seeking to information
        about a given title.

        do_copy -- when True (the only implemented mode), copy the original
                   list files into the chunked database while indexing.
        """
        if do_copy:
            # Write a copy of the source lists into the chunked database;
            # its write position doubles as the seek location for each line.
            # NOTE(review): autoflush is enabled only when a separate index
            # will be written — presumably to keep positions stable; confirm
            # against ChunkedFile's implementation.
            copy_to = ChunkedFile(self.dbfile, self.listname, mode='a',
                                  autoflush=True if self.indexname else False)
            tellobj = copy_to
            filenames = self.origfiles
        else:
            # Re-indexing from the existing chunked copy is not implemented.
            #filenames = ???
            copy_to = None
            raise NotImplementedError

        # title (utf-8 bytes) -> list of index lines, for the separate index.
        indexobj = defaultdict(list)

        for filename in filenames:
            if do_copy:
                try:
                    fileobj = open_compressed(filename)
                except IOError as e:
                    # Missing/unreadable source list: warn and move on.
                    print "  Skipping %s: %s" % (filename, e.strerror)
                    continue
            else:
                # Currently unreachable (do_copy=False raises above).
                fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
                tellobj = fileobj

            self._skip_header(fileobj)
            # Get location of this line
            loc = tellobj.tell()
            for line in fileobj:
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                # Skipped lines are not copied, so loc stays valid for the
                # next line that IS copied.
                if self.skip_tvvg and ('(VG)' in line or '{' in line):
                    continue
                if copy_to:
                    copy_to.write(line)
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')

                # Parse with the location this line was written at; then
                # advance loc to the position of the next line.  Order is
                # load-bearing here.
                data = self._parse_line(line, loc)
                loc = tellobj.tell()
                if data is None:
                    break           # End of database
                if not data:
                    continue        # Skip this line

                # Add to the index
                title, idxline = data[0:2] #self._make_locator(data)
                title = title.encode('utf-8')
                if self.indexname:
                    indexobj[title].append(idxline)
                elif copy_to:
                    # No separate index file: bookmark the title directly
                    # in the copied data instead.
                    copy_to.bookmark(title)
            fileobj.close()
        if copy_to:
            copy_to.close()

        if self.indexname:
            # Write out a separate index, if required (e.g. names databases)
            # Record format: "<title>\t<line> <line> ...\n", sorted by title.
            indexfh = ChunkedFile(self.dbfile, self.indexname, mode='a',
                                  autoflush=False)
            for title, linenos in sorted(indexobj.items()):
                indexfh.write(title)
                indexfh.write("\t")
                indexfh.write(' '.join(str(i) for i in linenos))
                indexfh.write("\n")
                indexfh.bookmark(title)
            indexfh.close()
        else:
            # An index is required to use more than one file, since the
            # resulting combination will not be sorted
            assert(len(filenames) == 1)
Example #4
0
    def rebuild_index(self, do_copy=True):
        """Create an index for this file, to allow rapid seeking to information
        about a given title.

        do_copy -- when True (the only implemented mode), copy the original
                   list files into the chunked database while indexing.
        """
        if do_copy:
            # Copy the source lists into the chunked database; its write
            # position doubles as each line's seek location.
            # NOTE(review): autoflush only when a separate index is written —
            # presumably to keep recorded positions stable; confirm in
            # ChunkedFile.
            copy_to = ChunkedFile(self.dbfile,
                                  self.listname,
                                  mode='a',
                                  autoflush=True if self.indexname else False)
            tellobj = copy_to
            filenames = self.origfiles
        else:
            # Re-indexing from the existing chunked copy is not implemented.
            #filenames = ???
            copy_to = None
            raise NotImplementedError

        # title (utf-8 bytes) -> list of index lines for the separate index.
        indexobj = defaultdict(list)

        for filename in filenames:
            if do_copy:
                try:
                    fileobj = open_compressed(filename)
                except IOError as e:
                    # Missing/unreadable source list: warn and move on.
                    print "  Skipping %s: %s" % (filename, e.strerror)
                    continue
            else:
                # Currently unreachable (do_copy=False raises above).
                fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
                tellobj = fileobj

            self._skip_header(fileobj)
            # Get location of this line
            loc = tellobj.tell()
            for line in fileobj:
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                # Skipped lines are not copied, so loc remains correct for
                # the next copied line.
                if self.skip_tvvg and ('(VG)' in line or '{' in line):
                    continue
                if copy_to:
                    copy_to.write(line)
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')

                # Parse with the location this line was written at, then
                # advance loc to the next line's position.  Order matters.
                data = self._parse_line(line, loc)
                loc = tellobj.tell()
                if data is None:
                    break  # End of database
                if not data:
                    continue  # Skip this line

                # Add to the index
                title, idxline = data[0:2]  #self._make_locator(data)
                title = title.encode('utf-8')
                if self.indexname:
                    indexobj[title].append(idxline)
                elif copy_to:
                    # No separate index file: bookmark the title directly in
                    # the copied data instead.
                    copy_to.bookmark(title)
            fileobj.close()
        if copy_to:
            copy_to.close()

        if self.indexname:
            # Write out a separate index, if required (e.g. names databases)
            # Record format: "<title>\t<line> <line> ...\n", sorted by title.
            indexfh = ChunkedFile(self.dbfile,
                                  self.indexname,
                                  mode='a',
                                  autoflush=False)
            for title, linenos in sorted(indexobj.items()):
                indexfh.write(title)
                indexfh.write("\t")
                indexfh.write(' '.join(str(i) for i in linenos))
                indexfh.write("\n")
                indexfh.bookmark(title)
            indexfh.close()
        else:
            # An index is required to use more than one file, since the
            # resulting combination will not be sorted
            assert (len(filenames) == 1)