def similar_terms(self, query, filter=None, limit=20):
    """
    Get a Series of terms ranked by their similarity to the query.

    The query can be:

    - A DataFrame of weighted terms
    - A dictionary from terms to weights
    - A list of (term, weight) tuples
    - A single term

    If the query contains 5 or fewer terms, it will be expanded to include
    neighboring terms in ConceptNet.

    `filter`, if given, restricts the results: a filter containing three or
    more slashes, or one ending in '/.', must match a term exactly;
    otherwise it is treated as a URI prefix that results must fall under.
    """
    self.load()
    vec = self.get_vector(query)
    # First pass: rank candidates cheaply in the reduced-dimension frame,
    # then re-rank the survivors with the full-dimension vectors below.
    small_vec = vec[:self.small_k]
    search_frame = self.small_frame
    if filter:
        exact_only = filter.count('/') >= 3
        # TODO: Is this duplicating something that field_match was supposed
        # to do?
        if filter.endswith('/.'):
            filter = filter[:-2]
            exact_only = True
        if exact_only:
            if filter in search_frame.index:
                idx = search_frame.index.get_loc(filter)
                search_frame = search_frame[idx:idx + 1]
            else:
                # The exact term isn't in the vocabulary; search nothing.
                search_frame = search_frame.iloc[0:0]
        else:
            start_key = filter
            # '0' is the character after '/', so end_key is the first possible
            # key that's not a descendant of the given filter key
            end_key = filter + '0'
            # searchsorted gives the position of the first index entry >= key,
            # or len(index) when the key sorts past the end. This replaces the
            # former `get_loc(key, method='bfill')` + KeyError fallback, whose
            # `method` parameter was removed in pandas 2.0. Assumes the index
            # is kept lexicographically sorted.
            start_idx = search_frame.index.searchsorted(start_key)
            end_idx = search_frame.index.searchsorted(end_key)
            search_frame = search_frame.iloc[start_idx:end_idx]
    # Over-fetch candidates from the cheap pass, then score them exactly.
    similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
    similar_choices = l2_normalize_rows(
        self.frame.loc[similar_sloppy.index].astype('f'))
    similar = similar_to_vec(similar_choices, vec, limit=limit)
    return similar
def eval_analogies(frame):
    """
    Evaluate `frame` on the Google analogy dataset (questions-words.txt).

    Prints each novel mistake as it's encountered, and returns a Series
    with the accuracy ('acc') and the low/high bounds of its confidence
    interval ('low', 'high').
    """
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    # Restrict candidate answers to the 200,000 most frequent English terms.
    vocab = [
        standardized_uri('en', term)
        for term in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    stacked = np.vstack([wrap.get_vector(term) for term in vocab])
    tframe = pd.DataFrame(stacked, index=vocab)

    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt, answer = quad[:3], quad[3]
        guess_vector = analogy_func(frame, *prompt)
        ranked = similar_to_vec(tframe, guess_vector)
        # The guess is the highest-ranked term that isn't part of the prompt.
        guess = next((term for term in ranked.index if term not in prompt), None)
        if guess == answer:
            correct += 1
        elif guess not in seen_mistakes:
            # Only report each distinct wrong guess once.
            print(
                "%s : %s :: %s : [%s] (should be %s)"
                % (quad[0], quad[1], quad[2], guess, answer)
            )
            seen_mistakes.add(guess)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
def similar_terms(self, query, filter=None, limit=20):
    """
    Get a DataFrame of terms ranked by their similarity to the query.

    The query can be:

    - A DataFrame of weighted terms
    - A dictionary from terms to weights
    - A list of (term, weight) tuples
    - A single term

    If the query contains 5 or fewer terms, it will be expanded to include
    neighboring terms in ConceptNet.

    `filter`, if given, restricts the results: a filter containing three or
    more slashes, or one ending in '/.', must match a term exactly;
    otherwise it is treated as a URI prefix that results must fall under.
    """
    self.load()
    vec = self.get_vector(query)
    # Cheap first pass in the reduced-dimension frame; exact re-ranking
    # with the full-dimension vectors happens at the end.
    small_vec = vec[: self.small_k]
    search_frame = self.small_frame
    if filter:
        exact_only = filter.count("/") >= 3
        if filter.endswith("/."):
            filter = filter[:-2]
            exact_only = True
        if exact_only:
            if filter in search_frame.index:
                idx = search_frame.index.get_loc(filter)
                search_frame = search_frame[idx : idx + 1]
            else:
                # The exact term isn't in the vocabulary; search nothing.
                search_frame = search_frame.iloc[0:0]
        else:
            start_key = filter
            # '0' is the character after '/', so end_key is the first possible
            # key that's not a descendant of the given filter key
            end_key = filter + "0"
            # searchsorted returns the position of the first index entry
            # >= key, or len(index) when the key sorts past the end. This
            # replaces `get_loc(key, method="bfill")` + KeyError handling;
            # the `method` parameter was removed in pandas 2.0. Assumes the
            # index is kept lexicographically sorted.
            start_idx = search_frame.index.searchsorted(start_key)
            end_idx = search_frame.index.searchsorted(end_key)
            search_frame = search_frame.iloc[start_idx:end_idx]
    # Over-fetch candidates from the cheap pass, then score them exactly.
    similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
    similar_choices = l2_normalize_rows(self.frame.loc[similar_sloppy.index].astype("f"))
    similar = similar_to_vec(similar_choices, vec, limit=limit)
    return similar
def similar_terms(self, query, filter=None, limit=20):
    """
    Get a Series of terms ranked by their similarity to the query.

    The query can be:

    - A pandas Series of weighted terms
    - A pandas DataFrame of weighted terms
    - A dictionary from terms to weights
    - A list of (term, weight) tuples
    - A single term
    - An existing vector

    If the query contains 5 or fewer terms, it will be expanded using the
    out-of-vocab strategy.

    `filter`, if given, restricts the results: a filter containing three or
    more slashes, or one ending in '/.', must match a term exactly;
    otherwise it is treated as a URI prefix that results must fall under.
    """
    self.load()
    vec = self.get_vector(query)
    # First pass: rank candidates cheaply in the reduced-dimension frame.
    small_vec = vec[:self.small_k]
    search_frame = self.small_frame
    if filter:
        # A filter with three or more '/' components is matched exactly
        # rather than as a prefix.
        exact_only = filter.count('/') >= 3
        # A trailing '/.' also forces an exact match (and is stripped).
        if filter.endswith('/.'):
            filter = filter[:-2]
            exact_only = True
        if exact_only:
            if filter in search_frame.index:
                idx = search_frame.index.get_loc(filter)
                search_frame = search_frame[idx:idx + 1]
            else:
                # The exact term isn't in the vocabulary; search nothing.
                search_frame = search_frame.iloc[0:0]
        else:
            # Keep only index entries under the 'filter/' prefix —
            # presumably a contiguous slice of a sorted index; confirm
            # against _index_prefix_range.
            start_idx, end_idx = self._index_prefix_range(filter + '/')
            search_frame = search_frame.iloc[start_idx:end_idx]
    # Over-fetch from the cheap pass, then re-rank with full vectors.
    similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
    similar_choices = l2_normalize_rows(
        self.frame.loc[similar_sloppy.index].astype('f'))
    similar = similar_to_vec(similar_choices, vec, limit=limit)
    return similar
def similar_terms(self, query, filter=None, limit=20):
    """
    Get a Series of terms ranked by their similarity to the query.

    The query can be:

    - A pandas Series of weighted terms
    - A pandas DataFrame of weighted terms
    - A dictionary from terms to weights
    - A list of (term, weight) tuples
    - A single term
    - An existing vector

    If the query contains 5 or fewer terms, it will be expanded to include
    neighboring terms in ConceptNet.

    `filter`, if given, restricts the results to terms matching it exactly
    (when it has three or more slashes, or ends in '/.') or to terms under
    it as a URI prefix.
    """
    self.load()
    full_vec = self.get_vector(query)
    # Cheap candidate search in the truncated-dimension frame first;
    # exact re-scoring with full vectors happens at the end.
    truncated_vec = full_vec[: self.small_k]
    candidates = self.small_frame
    if filter:
        match_exactly = filter.count('/') >= 3
        if filter.endswith('/.'):
            filter = filter[:-2]
            match_exactly = True
        if match_exactly:
            if filter not in candidates.index:
                # Exact term is absent from the vocabulary: empty search.
                candidates = candidates.iloc[0:0]
            else:
                pos = candidates.index.get_loc(filter)
                candidates = candidates[pos : pos + 1]
        else:
            # Slice out the index entries under the 'filter/' prefix.
            lo, hi = self._index_prefix_range(filter + '/')
            candidates = candidates.iloc[lo:hi]
    # Over-fetch rough matches, then score those candidates precisely.
    rough_matches = similar_to_vec(candidates, truncated_vec, limit=limit * 50)
    rescored = l2_normalize_rows(
        self.frame.loc[rough_matches.index].astype('f')
    )
    return similar_to_vec(rescored, full_vec, limit=limit)
def make_replacements(small_frame, big_frame):
    """
    Create a replacements dictionary to map terms only present in a big
    frame to the closest term in a small_frame.

    This method uses a brute-force solution: each term unique to
    `big_frame` is compared against every term the two frames share.
    """
    # .loc with labels missing from big_frame raises a KeyError in modern
    # pandas (indexing with a list containing missing labels was removed),
    # so select only the labels both frames actually share. dropna() then
    # discards any remaining incomplete rows.
    shared_labels = big_frame.index.intersection(small_frame.index)
    intersected = big_frame.loc[shared_labels].dropna()
    replacements = {}
    for term in big_frame.index:
        if term not in small_frame.index:
            most_similar = similar_to_vec(intersected, big_frame.loc[term], limit=1)
            got = list(most_similar.index)
            if got:
                replacements[term] = got[0]
    return replacements