def create_index(dbfile, dbdir, debug=False):
    """Index the movie list for searching."""
    # Load ratings; number of ratings included in index for score weighting
    ratings = parsers.IMDbRatingParser(dbfile=dbfile, debug=debug).search()
    # Count word frequencies while outputting searchable list
    frequencies = Counter()
    #indexfh = ChunkedFile(dbfile, 'index', mode='a')
    indexfh = open_compressed(dbfile + '.idx', mode='w')
    # Index all IMDb titles
    skipped = 0
    for iterator in \
            (parsers.IMDbMoviesParser(dbfile=None, dbdir=dbdir).search(),
             parsers.IMDbAkaParser(dbfile=None, dbdir=dbdir).search()):
        last_time = None
        for obj in iterator:
            if len(obj) == 1:
                # movies.list.gz
                data = parsers.parse_title(obj[0])
                akafor = ''
            else:
                # aka-titles.list.gz
                data = parsers.parse_title(obj[1])  # AKA name of the title
                akafor = obj[0]                     # Real name of the title
            # If it's a duplicate AKA (for indexing purposes), skip it.
            # The same AKA title may be repeated. For example:
            #   (aka Die Hard 4.0 (2007))   (UK)
            #   (aka Die Hard 4.0 (2007))   (Germany)
            if last_time and last_time[0:2] == obj[0:2]:
                skipped += 1
                continue
            last_time = obj
            searchable = _clean_word(data.name).split(' ')
            # Save word frequencies
            frequencies.update(searchable)
            # Determine rating for result ranking
            nratings = 0
            if akafor and akafor in ratings:
                nratings = ratings[akafor].nratings
            elif not akafor and data.title in ratings:
                nratings = ratings[data.title].nratings
            # Write movie to output
            indexfh.write("\t".join(
                (''.join(searchable),
                 data.year.encode('ascii') if data.year else '',
                 data.title.encode('utf-8'),
                 akafor.encode('utf-8'),
                 str(nratings))))
            indexfh.write("\n")
    indexfh.close()
    #print "Skipped %d duplicate AKA titles" % skipped
    # Write frequencies to stopwords file
    if False:
        swf = ChunkedFile(dbfile, 'stopwords', mode='a')
        for word, numtimes in frequencies.most_common():
            swf.write("%s %d\n" % (word, numtimes))
        swf.close()
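
# A minimal usage sketch (the paths are made up for illustration; the
# function only needs the database file prefix and the directory holding
# the downloaded *.list.gz files):
#
#     create_index('/var/cache/imdb.db', '/var/cache/imdb-lists', debug=True)
#
# Each title becomes one line in the compressed '<dbfile>.idx' file, with
# tab-separated fields matching the write() call above:
#
#     <searchable name> <year> <canonical title> <aka-for> <number of ratings>
#
# The searchable name is the cleaned title with spaces removed; its exact
# form depends on _clean_word().
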
def _find_seeks_index(dbfile, indexname, queries, debug=False):
    """Use the index file to find exact seek positions for relevant records.

    End locations are not necessary since we are guaranteed that the data
    will be present, so a count of occurrences is sufficient for prompt
    termination."""
    timer = Timer(rl_min_dur=1)
    locs = Counter()
    if debug:
        print "  Searching index..."
    indexfh = ChunkedFile(dbfile, indexname, mode='r')
    last_bookmark = 0
    for query in sorted(queries):
        # Use bookmarks to rapidly search the index!
        bookmark = indexfh.find_bookmark(query.encode('utf-8'))
        if bookmark != last_bookmark:
            indexfh.seek(bookmark)
            #print "  Seek to", bookmark
            last_bookmark = bookmark
        for i, line in enumerate(indexfh):
            title, nums = line.decode('utf-8').split('\t')
            if i % 100 == 0:
                timer.step()
            if title in queries:
                locs.update(int(x) for x in nums.split(' '))
            elif title > query:
                break   # This works because the index is sorted.
    indexfh.close()
    for start, nresults in sorted(locs.items()):
        yield (start, None, nresults)
    if debug:
        print '  Completed in', timer, 'seconds.'
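
# Illustrative only: _find_seeks_index() is a generator, so callers usually
# wrap it in list() (as _run_search() does below).  Each yielded tuple is
# (start_offset, None, nresults); the end offset is always None because the
# caller simply reads until it has collected nresults matches.  The index
# name and query value in this sketch are made-up examples:
#
#     locs = list(_find_seeks_index(dbfile, 'titles-index',
#                                   set([u'Die Hard (1988)']), debug=True))
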
def _run_search(self, queries):
    """Return items from the data file matching any item in queries."""
    if queries is not None:
        queries = set(queries)
        # Don't do anything if an empty set is provided
        if not queries:
            return
    # Open the compressed database, either copied version or original file.
    if self.dbfile:
        fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
    else:
        assert(len(self.origfiles) == 1)
        try:
            fileobj = open_compressed(self.origfiles[0])
        except IOError as e:
            print "Skipping %s: %s" % (self.origfiles[0], e.strerror)
            return
        self._skip_header(fileobj)
    if self.debug:
        print "Reading %s..." % self.listname
    # Locate seek positions for all queries
    if queries and self.indexname:
        # Use index
        locs = list(_find_seeks_index(self.dbfile, self.indexname,
                                      queries, debug=self.debug))
    elif queries:
        # Use bookmarks
        locs = list(_find_seeks_bookmarks(fileobj, queries,
                                          debug=self.debug))
    else:
        locs = [(None, None, 1)]    # Dummy values to start loop
    # Read selected lines from the file
    timer = Timer()
    loc = 0
    for startloc, endloc, nresults in locs:
        # Skip to the correct position in the file
        if queries:
            if startloc > loc:
                #print "  Seek to", startloc
                fileobj.seek(startloc)
                loc = fileobj.tell()
            elif startloc < loc:
                #print "  Skipping", startloc, "already at", loc
                continue
            #else:
            #    print "  Skipping", startloc, "already there"
            #print "  Finish at", endloc, "after", nresults, "results"
        for _ in xrange(nresults):
            # Parse the file until we get a result
            for i, line in enumerate(fileobj):
                # Determine if we have reached the end location for this
                # section
                if endloc and loc == endloc:
                    break
                #assert(not endloc or loc < endloc)
                # Do not index video games or individual TV episodes
                # (Not applicable for all file types)
                if not self.dbfile and self.skip_tvvg and \
                        ('(VG)' in line or '{' in line):
                    #loc = fileobj.tell()  # Don't seek/tell in gzip
                    continue
                # Decode database (IMDb databases use ISO-8859-1)
                line = line.rstrip().decode('iso-8859-1')
                if queries and i % 100 == 0:
                    timer.step()
                data = self._parse_line(line, loc)
                if self.dbfile:
                    loc = fileobj.tell()
                if data is None:
                    break       # End of database
                if not data:
                    continue    # Skip this line
                # Check if one of our queries matches
                if queries is None or data[0] in queries:
                    yield self._make_result(data)
                    if queries is not None:
                        # queries.remove(data[0])
                        break
    if self.debug:
        print 'Completed in', timer, 'seconds.'
    fileobj.close()
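
# Sketch of how this generator is meant to be consumed (the surrounding
# parser object and its attributes -- dbfile, listname, indexname, and so
# on -- are assumed to have been set up by the class constructor, which is
# not part of this section):
#
#     for result in parser._run_search(set([u'Die Hard (1988)'])):
#         print result
#
# Passing queries=None yields every record in the list; passing an empty
# iterable yields nothing.
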
def rebuild_index(self, do_copy=True):
    """Create an index for this file, to allow rapid seeking to
    information about a given title."""
    if do_copy:
        copy_to = ChunkedFile(self.dbfile, self.listname, mode='a',
                              autoflush=True if self.indexname else False)
        tellobj = copy_to
        filenames = self.origfiles
    else:
        #filenames = ???
        copy_to = None
        raise NotImplementedError
    indexobj = defaultdict(list)
    for filename in filenames:
        if do_copy:
            try:
                fileobj = open_compressed(filename)
            except IOError as e:
                print "  Skipping %s: %s" % (filename, e.strerror)
                continue
        else:
            fileobj = ChunkedFile(self.dbfile, self.listname, mode='r')
            tellobj = fileobj
        self._skip_header(fileobj)
        # Get location of this line
        loc = tellobj.tell()
        for line in fileobj:
            # Do not index video games or individual TV episodes
            # (Not applicable for all file types)
            if self.skip_tvvg and ('(VG)' in line or '{' in line):
                continue
            if copy_to:
                copy_to.write(line)
            # Decode database (IMDb databases use ISO-8859-1)
            line = line.rstrip().decode('iso-8859-1')
            data = self._parse_line(line, loc)
            loc = tellobj.tell()
            if data is None:
                break       # End of database
            if not data:
                continue    # Skip this line
            # Add to the index
            title, idxline = data[0:2]  #self._make_locator(data)
            title = title.encode('utf-8')
            if self.indexname:
                indexobj[title].append(idxline)
            elif copy_to:
                copy_to.bookmark(title)
        fileobj.close()
    if copy_to:
        copy_to.close()
    if self.indexname:
        # Write out a separate index, if required (e.g. names databases)
        indexfh = ChunkedFile(self.dbfile, self.indexname, mode='a',
                              autoflush=False)
        for title, linenos in sorted(indexobj.items()):
            indexfh.write(title)
            indexfh.write("\t")
            indexfh.write(' '.join(str(i) for i in linenos))
            indexfh.write("\n")
            indexfh.bookmark(title)
        indexfh.close()
    else:
        # An index is required to use more than one file, since the
        # resulting combination will not be sorted
        assert(len(filenames) == 1)
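
# Typical flow (a sketch, assuming the parser was constructed with a dbfile
# and its origfiles point at the downloaded *.list.gz files): rebuild_index()
# copies each original list into the ChunkedFile-backed database and records
# either per-title bookmarks or, when self.indexname is set, a separate
# sorted index chunk that _find_seeks_index() can search later.
#
#     parser.rebuild_index(do_copy=True)   # do_copy=False is not implemented
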