def cache_simstring(datasets, verbose=False, ann_modulo=1000,
        queries_modulo=1000):
    """Pre-populate the SimString query cache for every unique annotation
    string in the given datasets."""
    from simstring import reader as simstring_reader

    if verbose:
        print >> stderr, 'Caching SimString:'
        print >> stderr, 'Pre-caching queries...',
        queries_seen = 0

    # For most cases we are better off caching every single query instead of
    # iterating over them; this also makes sure that each query is unique when
    # we finally hit the SimString database
    queries = set()
    for dataset in datasets:
        for document in dataset:
            for sentence in document:
                for annotation in sentence:
                    queries.add(sentence.annotation_text(annotation))
                    if verbose:
                        queries_seen += 1
                        if queries_seen % queries_modulo == 0:
                            print >> stderr, queries_seen, '...',

    if verbose:
        print >> stderr, 'Done! (reduced from {} to {})'.format(
                queries_seen, len(queries))

    for db_i, db_path in enumerate(SIMSTRING_DB_PATHS, start=1):
        if verbose:
            print >> stderr, 'Caching for db: {0} ({1}/{2}) ...'.format(
                    db_path, db_i, len(SIMSTRING_DB_PATHS)),
            ann_cnt = 0

        db_reader = None
        try:
            db_reader = simstring_reader(db_path)
            for query in queries:
                query_simstring_db(query, db_path, reader_arg=db_reader)
                if verbose:
                    ann_cnt += 1
                    if ann_cnt % ann_modulo == 0:
                        print >> stderr, ann_cnt, '...',
        finally:
            if db_reader is not None:
                db_reader.close()

        if verbose:
            print >> stderr, 'Done!'
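# Usage sketch (not part of the original module): pre-warm the SimString
# query cache before feature extraction. The `my_datasets` name is a
# placeholder; it is assumed to follow the nesting iterated above
# (dataset -> document -> sentence -> annotation), and SIMSTRING_DB_PATHS
# must already point at existing SimString databases.
def _example_cache_simstring(my_datasets):
    # Report progress every 500 queries/annotations rather than the default
    cache_simstring(my_datasets, verbose=True, ann_modulo=500,
            queries_modulo=500)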
def _token_reprs(tokens, db_paths, verbose=False):
    """Yield (token, representations) pairs, where the representations are
    the best SimString thresholds for the token, one per database."""
    from simstring import cosine as simstring_cosine
    from simstring import reader as simstring_reader

    if verbose:
        db_timings = []

    repr_by_token = defaultdict(list)
    for db_path_i, db_path in enumerate(db_paths, start=1):
        if verbose:
            db_timing_start = datetime.utcnow()
            _vprint('Opening DB ({}/{}): {}'.format(db_path_i, len(db_paths),
                    db_path))

        reader = simstring_reader(db_path)
        reader.measure = simstring_cosine

        if verbose:
            _vprint('Querying DB...', end='')
        for token_i, token in enumerate(tokens, start=1):
            if verbose and token_i % 1000 == 0:
                _vprint('{}...'.format(token_i), no_tag=True, end='')
            # Fetch the threshold to use as a distance
            repr_by_token[token].append(
                    _find_threshold(token.encode('utf-8'), reader))
        # Release the database handle before moving on to the next one
        reader.close()

        if verbose:
            _vprint('{}...Done!'.format(token_i), no_tag=True)
            db_timings.append(datetime.utcnow() - db_timing_start)
            if db_path_i != len(db_paths):
                per_db_estimate = (sum(db_timings, timedelta())
                        / len(db_timings))
                completion_estimate = (per_db_estimate
                        * (len(db_paths) - db_path_i))
                _vprint('Estimated time until completion: {}'.format(
                        completion_estimate))

    for token in tokens:
        yield token, repr_by_token[token]
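# Usage sketch (not part of the original module): collect one threshold per
# database for each token as a simple feature vector. The example tokens are
# placeholders; the databases are taken from SIMSTRING_DB_PATHS as elsewhere
# in this module.
def _example_token_reprs():
    tokens = [u'kinase', u'NF-kappa B']
    for token, reprs in _token_reprs(tokens, SIMSTRING_DB_PATHS):
        # `reprs` holds one value per database, in SIMSTRING_DB_PATHS order
        print >> stderr, token.encode('utf-8'), reprs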
def query_simstring_db(query, db_path, reader_arg=None):
    """Return the cached (threshold, tsuruoka_dist) pair for a query against
    a SimString database, querying the database and caching on a miss."""
    from simstring import reader as simstring_reader
    from simstring import cosine as simstring_cosine

    global SIMSTRING_QUERY_CACHE
    global MODIFIED_SIMSTRING_QUERY_CACHE

    if SIMSTRING_QUERY_CACHE is None:
        _load_simstring_cache()

    try:
        cache = SIMSTRING_QUERY_CACHE[db_path]
    except KeyError:
        cache = {}
        SIMSTRING_QUERY_CACHE[db_path] = cache
        MODIFIED_SIMSTRING_QUERY_CACHE = True

    try:
        return cache[query]
    except KeyError:
        # Not cached, we have to query the database
        MODIFIED_SIMSTRING_QUERY_CACHE = True

        reader = None
        try:
            if reader_arg is None:
                reader = simstring_reader(db_path)
            else:
                reader = reader_arg
            reader.measure = simstring_cosine

            # Probe successively lower thresholds until we get a response
            for threshold in (v / 10.0
                    for v in xrange(10, QUERY_CUT_OFF - 1, -1)):
                reader.threshold = threshold
                # The reader will choke on unicode objects, so encode it
                query_utf8 = query.encode('utf-8')
                response = reader.retrieve(query_utf8)

                if not TSURUOKA_DIST:
                    # Only save whether we got a response or not
                    response = bool(response)
                    tsuruoka_dist = None
                else:
                    # SimString returns its results sorted by length, although
                    # it had them internally by n-gram; we need them by n-gram
                    if response:
                        # Sort the response to prepare a cut-off
                        from lib.ngram import n_gram_ref_cos_cmp, n_gram_gen
                        ref_grams = set(g for g in n_gram_gen(
                                query, n=3, guards=TSURUOKA_GUARDED))
                        # We need Unicode internally at this point
                        response = [s.decode('utf-8') for s in response]
                        response = sorted(response,
                                cmp=lambda a, b: -n_gram_ref_cos_cmp(
                                    a, b, ref_grams, guards=TSURUOKA_GUARDED))
                        # Apply the cut-off
                        response = response[:RESPONSE_CUT_OFF]

                        if TSURUOKA_NORMALISED:
                            tsuruoka_dist = max(
                                    bucket_norm_tsuruoka(query, resp_str)
                                    for resp_str in response)
                        else:
                            tsuruoka_dist = min(
                                    bucket_tsuruoka(query, resp_str)
                                    for resp_str in response)

                if response:
                    cache[query] = (threshold, tsuruoka_dist)
                    # We can and should bail at this point
                    break
            else:
                # We found no results for any threshold
                cache[query] = (None, None)
        finally:
            # Only close if we were not passed the reader
            if reader_arg is None and reader is not None:
                reader.close()

    return cache[query]
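# Usage sketch (not part of the original module): look up a single string
# against the first configured database and inspect the cached result; the
# query string below is a placeholder.
def _example_query_simstring_db():
    threshold, tsuruoka_dist = query_simstring_db(u'protein kinase C',
            SIMSTRING_DB_PATHS[0])
    if threshold is None:
        # No match was found at any threshold down to QUERY_CUT_OFF
        print >> stderr, 'no match'
    else:
        print >> stderr, 'matched at threshold', threshold, \
                'tsuruoka distance', tsuruoka_dist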