Example #1
import pickle

# Assumed imports: AssertionFinder is ConceptNet's database lookup class;
# `standardize_concept_uri` and the `edge` helper are project-local in this
# example, so adjust these import paths to match your codebase.
from conceptnet5.db.query import AssertionFinder


def load_simple_concepts_and_expand(path_to_conceptlist="simpleconcepts",
                                    save=True,
                                    filename="simple_concept_relations",
                                    limit=100):
    edgelist = []
    af = AssertionFinder()
    offset = 0
    print("Opening the simple ocncept list")
    with open(path_to_conceptlist) as f:
        for line in f:
            if "#" in line:
                continue
            else:
                standard_uri = standardize_concept_uri("en", line.strip())
                res = af.lookup(standard_uri, limit=limit, offset=offset)
                for item in res:
                    if item["@type"] == "Edge":
                        edgelist.append(edge.from_edge_object(item))
    secondedgelist = []
    print("Following up on ends")
    print(len(edgelist))
    for idx, edgeobj in enumerate(edgelist):
        print(idx)
        standard_uri = standardize_concept_uri("en", edgeobj.end.strip())
        res = af.lookup(standard_uri, limit=limit, offset=offset)
        for item in res:
            if item["@type"] == "Edge":
                secondedgelist.append(edge.from_edge_object(item))
    edgelist = edgelist + secondedgelist
    print("dumping")

    if save:
        with open(filename, 'wb') as out:
            pickle.dump(edgelist, out)
    return edgelist
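
A minimal usage sketch for the function above, assuming a simpleconcepts file with one English term per line (the file contents and the limit value here are illustrative):

# Hypothetical input file "simpleconcepts":
#   # animals
#   cat
#   dog
edges = load_simple_concepts_and_expand(path_to_conceptlist="simpleconcepts",
                                        filename="simple_concept_relations",
                                        limit=50)
print(len(edges), "edges collected (first hop plus second hop)")
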
Example #2
# Third-party imports assumed by this class:
import marisa_trie
import numpy as np
import pandas as pd
import wordfreq

# The remaining helpers (AssertionFinder, get_data_filename, load_hdf,
# field_match, split_uri, get_uri_language, uri_prefix, standardized_uri,
# normalize_vec, weighted_average, similar_to_vec, l2_normalize_rows,
# cosine_similarity, MissingVectorSpace) come from the conceptnet5 package;
# their exact module paths vary by version.


class VectorSpaceWrapper(object):
    """
    An object that wraps the data necessary to look up vectors for terms
    (in the vector space named ConceptNet Numberbatch) and find related terms.

    The filenames usually don't need to be specified, because the system will
    look in default locations for them. They can be specified to replace them
    with toy versions for testing, or to evaluate how other embeddings perform
    while still using ConceptNet for looking up words outside their vocabulary.
    """

    def __init__(self, vector_filename=None, frame=None, use_db=True):
        if frame is None:
            self.frame = None
            self.vector_filename = vector_filename or get_data_filename(
                'vectors/mini.h5'
            )
        else:
            self.frame = frame
            self.vector_filename = None
        self.small_frame = None
        self.k = None
        self.small_k = None
        self.finder = None
        self._trie = None
        self.cache = {}
        if use_db:
            self.finder = AssertionFinder()

    def load(self):
        """
        Ensure that all the data is loaded.
        """
        if self.small_frame is not None:
            return
        try:
            if self.frame is None:
                self.frame = load_hdf(self.vector_filename)

            if not self.frame.index[1].startswith('/c/'):
                # These terms weren't in ConceptNet standard form. Assume
                # they're in English, and stick the English language tag on
                # them without any further transformation, so we can be sure
                # we're evaluating the vectors as provided.
                self.finder = None
                self.frame.index = ['/c/en/' + label for label in self.frame.index]

            if not self.frame.index.is_monotonic_increasing:
                self.frame = self.frame.sort_index()

            self.k = self.frame.shape[1]
            self.small_k = 100
            self.small_frame = self.frame.iloc[:, : self.small_k].copy()
        except OSError:
            raise MissingVectorSpace(
                "Couldn't load the vector space %r. Do you need to build or "
                "download it?" % self.vector_filename
            )
        self._build_trie()

    def _build_trie(self):
        """
        Build a trie (a prefix tree) that allows finding terms by their
        prefixes.
        """
        self._trie = marisa_trie.Trie(list(self.frame.index))

    @staticmethod
    def passes_filter(label, filter):
        if filter is None:
            return True
        else:
            return field_match(label, filter)

    @staticmethod
    def _englishify(term):
        """
        Change the language of a /c/ term to English. If the input isn't a
        /c/ term, return None.
        """
        if not term.startswith('/c/'):
            return None
        splits = split_uri(term)
        if len(splits) > 2:
            return '/c/en/' + splits[2]

    def _find_neighbors(self, term, limit_per_term, weight):
        neighbors = []
        for edge in self.finder.lookup(term, limit=limit_per_term):
            if field_match(edge['start']['term'], term) and not field_match(
                edge['end']['term'], term
            ):
                neighbor = edge['end']['term']
            elif field_match(edge['end']['term'], term) and not field_match(
                edge['start']['term'], term
            ):
                neighbor = edge['start']['term']
            else:
                continue
            neighbor_weight = weight * min(10, edge['weight']) * 0.01
            neighbors.append((neighbor, neighbor_weight))
        return neighbors

    def _match_prefix(self, term, prefix_weight):
        results = []
        while term:
            # Skip excessively general lookups, for either an entire
            # language, or all terms starting with a single
            # non-ideographic letter
            if (
                len(split_uri(term)) < 3
                or term.endswith('/')
                or (term[-2] == '/' and term[-1] < chr(0x3000))
            ):
                break
            prefixed = self._terms_with_prefix(term)
            if prefixed:
                n_prefixed = len(prefixed)
                for prefixed_term in prefixed:
                    results.append((prefixed_term, prefix_weight / n_prefixed))
                break
            term = term[:-1]
        return results

    def expand_terms(self, terms, limit_per_term=10, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if oov_vector and term not in self.frame.index and self.finder is not None:
                neighbors = self._find_neighbors(term, limit_per_term, weight)
                expanded.extend(neighbors)

                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = self._englishify(term)
                    if englishified is not None:
                        expanded.append((englishified, prefix_weight))

                prefix_matches = self._match_prefix(term, prefix_weight)
                expanded.extend(prefix_matches)

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [
                (uri_prefix(term), weight / total_weight) for (term, weight) in expanded
            ]

    def expanded_vector(self, terms, limit_per_term=10, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, make a vector
        representing information from:

        - The vectors for these terms
        - The vectors for their neighbors in ConceptNet
        - The vectors for terms that share a sufficiently-long prefix with
          any terms in this list that are out-of-vocabulary
        """
        self.load()
        return weighted_average(
            self.frame, self.expand_terms(terms, limit_per_term, oov_vector)
        )

    def text_to_vector(self, language, text):
        """
        Used in Story Cloze Test to create a vector for text.
        """
        tokens = wordfreq.tokenize(text, language)
        weighted_terms = [
            (uri_prefix(standardized_uri(language, token)), 1.) for token in tokens
        ]
        return self.get_vector(weighted_terms, oov_vector=False)

    def get_vector(self, query, oov_vector=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `oov_vector=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()

        if isinstance(query, np.ndarray):
            return query
        elif isinstance(query, pd.Series) or isinstance(query, dict):
            terms = list(query.items())
        elif isinstance(query, pd.DataFrame):
            terms = list(query.to_records())
        elif isinstance(query, str):
            terms = [(query, 1.)]
        elif isinstance(query, list):
            terms = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))

        cache_key = tuple(terms + [oov_vector])
        if cache_key in self.cache:
            return self.cache[cache_key]

        oov_vector = oov_vector and (len(terms) <= 5)

        vec = self.expanded_vector(terms, oov_vector=oov_vector)
        self.cache[cache_key] = normalize_vec(vec)
        return self.cache[cache_key]

    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a Series of terms ranked by their similarity to the query.
        The query can be:

        - A pandas Series of weighted terms
        - A pandas DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term
        - An existing vector

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[: self.small_k]
        search_frame = self.small_frame
        # TODO: document filter
        if filter:
            exact_only = filter.count('/') >= 3
            if filter.endswith('/.'):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx : idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_idx, end_idx = self._index_prefix_range(filter + '/')
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
        similar_choices = l2_normalize_rows(
            self.frame.loc[similar_sloppy.index].astype('f')
        )

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar

    def get_similarity(self, query1, query2):
        vec1 = self.get_vector(query1)
        vec2 = self.get_vector(query2)
        return cosine_similarity(vec1, vec2)

    def _terms_with_prefix(self, prefix):
        """
        Get a list of terms whose URI begins with the given prefix. The list
        will be in an arbitrary order.
        """
        return self._trie.keys(prefix)

    def _index_prefix_range(self, prefix):
        """
        Get the range of indices on the DataFrame we're wrapping that begin
        with a given prefix.

        The range is a pair of index numbers. Following the convention of
        Python ranges, the starting index is inclusive, while the end index
        is exclusive.

        Returns the empty range (0, 0) if no terms begin with this prefix.
        """
        # Use the trie to find all terms with the given prefix. Then sort them,
        # because the range will span from our first prefix in sorted
        # order to just after our last.
        terms = sorted(self._terms_with_prefix(prefix))
        if not terms:
            return (0, 0)

        start_loc = self.frame.index.get_loc(terms[0])
        end_loc = self.frame.index.get_loc(terms[-1]) + 1
        return start_loc, end_loc
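
A minimal usage sketch for this wrapper, assuming the default vectors/mini.h5 file is available (the query terms are illustrative):

wrapper = VectorSpaceWrapper()   # falls back to vectors/mini.h5
wrapper.load()

# Terms most similar to "tea", restricted to English concepts
print(wrapper.similar_terms('/c/en/tea', filter='/c/en', limit=5))

# Cosine similarity between two single-term queries
print(wrapper.get_similarity('/c/en/coffee', '/c/en/tea'))
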
Example #3
class VectorSpaceWrapper(object):
    """
    An object that wraps the data necessary to look up vectors for terms
    (in the vector space named ConceptNet Numberbatch) and find related terms.

    The filenames usually don't need to be specified, because the system will
    look in default locations for them. They can be specified to replace them
    with toy versions for testing.
    """
    def __init__(self, vector_filename=None, frame=None, use_db=True):
        if frame is None:
            self.frame = None
            self.vector_filename = vector_filename or get_data_filename(
                'vectors/mini.h5')
        else:
            self.frame = frame
            self.vector_filename = None
        self.small_frame = None
        self.k = None
        self.small_k = None
        self.finder = None
        self.standardized = None
        if use_db:
            self.finder = AssertionFinder()

    def load(self):
        """
        Ensure that all the data is loaded.
        """
        if self.small_frame is not None:
            return
        try:
            if self.frame is None:
                self.frame = load_hdf(self.vector_filename)

            if self.frame.index[0].startswith('/c/'):
                self.standardized = True
            else:
                # These terms weren't in ConceptNet standard form. Assume
                # they're in English, and stick the English language tag on
                # them without any further transformation, so we can be sure
                # we're evaluating the vectors as provided.
                self.standardized = False
                self.finder = None
                self.frame.index = [
                    '/c/en/' + label for label in self.frame.index
                ]

            self.k = self.frame.shape[1]
            self.small_k = 100
            self.small_frame = self.frame.iloc[:, :self.small_k].copy()
        except OSError:
            raise MissingVectorSpace(
                "Couldn't load the vector space %r. Do you need to build or "
                "download it?" % self.vector_filename)

    @staticmethod
    def passes_filter(label, filter):
        if filter is None:
            return True
        else:
            return field_match(label, filter)

    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors. This forms a reasonable
        approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            expanded.append((term, weight / 10))
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge['start']['term'], term) and not field_match(
                            edge['end']['term'], term):
                        neighbor = edge['end']['term']
                    elif field_match(edge['end']['term'], term) and not field_match(
                            edge['start']['term'], term):
                        neighbor = edge['start']['term']
                    else:
                        continue
                    neighbor_weight = weight * min(10, edge['weight']) * 0.001
                    expanded.append((neighbor, neighbor_weight))

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight)
                    for (term, weight) in expanded]

    def expanded_vector(self,
                        terms,
                        limit_per_term=10,
                        include_neighbors=True):
        self.load()
        return weighted_average(
            self.frame,
            self.expand_terms(terms, limit_per_term, include_neighbors))

    def text_to_vector(self, language, text):
        tokens = wordfreq.tokenize(text, language)
        weighted_terms = [(standardized_uri(language, token), 1.)
                          for token in tokens]
        return self.get_vector(weighted_terms, include_neighbors=False)

    def get_vector(self, query, include_neighbors=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `include_neighbors=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()
        # Note: DataFrame.items() yields (column, Series) pairs, so a pandas
        # Series is more likely what's intended here.
        if isinstance(query, pd.DataFrame) or isinstance(query, dict):
            terms = list(query.items())
        elif isinstance(query, str):
            terms = [(query, 1.)]
        elif isinstance(query, list):
            terms = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))
        include_neighbors = include_neighbors and (len(terms) <= 5)
        vec = self.expanded_vector(terms, include_neighbors=include_neighbors)
        return normalize_vec(vec)

    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a DataFrame of terms ranked by their similarity to the query.
        The query can be:

        - A DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[:self.small_k]
        search_frame = self.small_frame
        if filter:
            exact_only = filter.count('/') >= 3
            if filter.endswith('/.'):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx:idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_key = filter
                # '0' is the character after '/', so end_key is the first possible
                # key that's not a descendant of the given filter key
                end_key = filter + '0'
                # 'bfill' finds the first index entry >= the key; 'ffill' here
                # would land one row before the prefix range whenever the
                # filter itself isn't in the index. (The `method` argument to
                # get_loc was removed in pandas 2.0; see the searchsorted
                # sketch after Example #6.)
                start_idx = search_frame.index.get_loc(start_key,
                                                       method='bfill')
                end_idx = search_frame.index.get_loc(end_key, method='bfill')
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame,
                                        small_vec,
                                        limit=limit * 50)
        similar_choices = l2_normalize_rows(
            self.frame.loc[similar_sloppy.index].astype('f'))

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar

    def get_similarity(self, query1, query2):
        vec1 = self.get_vector(query1)
        vec2 = self.get_vector(query2)
        return cosine_similarity(vec1, vec2)
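
A small worked sketch of the weighting in this version's expand_terms: the extra (term, weight / 10) echo means an in-vocabulary term contributes raw weights [1.0, 0.1], which normalize to roughly 0.91 and 0.09. The toy frame below keeps the term in-vocabulary so no database lookup fires:

import pandas as pd

frame = pd.DataFrame([[0.1, 0.2]], index=['/c/en/cat'])
wrapper = VectorSpaceWrapper(frame=frame, use_db=False)

# Expected result: [('/c/en/cat', 0.909...), ('/c/en/cat', 0.0909...)]
print(wrapper.expand_terms([('/c/en/cat', 1.0)]))
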
Example #4
class VectorSpaceWrapper(object):
    """
    An object that wraps the data necessary to look up vectors for terms
    (in the vector space named ConceptNet Numberbatch) and find related terms.

    The filenames usually don't need to be specified, because the system will
    look in default locations for them. They can be specified to replace them
    with toy versions for testing, or to evaluate how other embeddings perform
    while still using ConceptNet for looking up words outside their vocabulary.
    """

    def __init__(self, vector_filename=None, frame=None, use_db=True):
        if frame is None:
            self.frame = None
            self.vector_filename = vector_filename or get_data_filename(
                'vectors/mini.h5'
            )
        else:
            self.frame = frame
            self.vector_filename = None
        self.small_frame = None
        self.k = None
        self.small_k = None
        self.finder = None
        self._trie = None
        self.cache = {}
        if use_db:
            self.finder = AssertionFinder()

    def load(self):
        """
        Ensure that all the data is loaded.
        """
        if self.small_frame is not None:
            return
        try:
            if self.frame is None:
                self.frame = load_hdf(self.vector_filename)

            if not self.frame.index[1].startswith('/c/'):
                # These terms weren't in ConceptNet standard form. Assume
                # they're in English, and stick the English language tag on
                # them without any further transformation, so we can be sure
                # we're evaluating the vectors as provided.
                self.finder = None
                self.frame.index = ['/c/en/' + label for label in self.frame.index]

            if not self.frame.index.is_monotonic_increasing:
                self.frame = self.frame.sort_index()

            self.k = self.frame.shape[1]
            self.small_k = 100
            self.small_frame = self.frame.iloc[:, : self.small_k].copy()
        except OSError:
            raise MissingVectorSpace(
                "Couldn't load the vector space %r. Do you need to build or "
                "download it?" % self.vector_filename
            )
        self._build_trie()

    def _build_trie(self):
        """
        Build a trie (a prefix tree) that allows finding terms by their
        prefixes.
        """
        self._trie = marisa_trie.Trie(list(self.frame.index))

    @staticmethod
    def passes_filter(label, filter):
        if filter is None:
            return True
        else:
            return field_match(label, filter)

    @staticmethod
    def _englishify(term):
        """
        Change the language of a /c/ term to English. If the input isn't a
        /c/ term, return None.
        """
        if not term.startswith('/c/'):
            return None
        splits = split_uri(term)
        if len(splits) > 2:
            return '/c/en/' + splits[2]

    def _find_neighbors(self, term, limit_per_term, weight):
        neighbors = []
        for edge in self.finder.lookup(term, limit=limit_per_term):
            if field_match(edge['start']['term'], term) and not field_match(
                edge['end']['term'], term
            ):
                neighbor = edge['end']['term']
            elif field_match(edge['end']['term'], term) and not field_match(
                edge['start']['term'], term
            ):
                neighbor = edge['start']['term']
            else:
                continue
            neighbor_weight = weight * min(10, edge['weight']) * 0.01
            neighbors.append((neighbor, neighbor_weight))
        return neighbors

    def _match_prefix(self, term, prefix_weight):
        results = []
        while term:
            # Skip excessively general lookups, for either an entire
            # language, or all terms starting with a single
            # non-ideographic letter
            if (
                len(split_uri(term)) < 3
                or term.endswith('/')
                or (term[-2] == '/' and term[-1] < chr(0x3000))
            ):
                break
            prefixed = self._terms_with_prefix(term)
            if prefixed:
                n_prefixed = len(prefixed)
                for prefixed_term in prefixed:
                    results.append((prefixed_term, prefix_weight / n_prefixed))
                break
            term = term[:-1]
        return results

    def expand_terms(self, terms, limit_per_term=10, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if oov_vector and term not in self.frame.index and self.finder is not None:
                neighbors = self._find_neighbors(term, limit_per_term, weight)
                expanded.extend(neighbors)

                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = self._englishify(term)
                    if englishified is not None:
                        expanded.append((englishified, prefix_weight))

                prefix_matches = self._match_prefix(term, prefix_weight)
                expanded.extend(prefix_matches)

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [
                (uri_prefix(term), weight / total_weight) for (term, weight) in expanded
            ]

    def expanded_vector(self, terms, limit_per_term=10, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, make a vector
        representing information from:

        - The vectors for these terms
        - The vectors for their neighbors in ConceptNet
        - The vectors for terms that share a sufficiently-long prefix with
          any terms in this list that are out-of-vocabulary
        """
        self.load()
        return weighted_average(
            self.frame, self.expand_terms(terms, limit_per_term, oov_vector)
        )

    def text_to_vector(self, language, text):
        """
        Used in Story Cloze Test to create a vector for text.
        """
        tokens = wordfreq.tokenize(text, language)
        weighted_terms = [
            (uri_prefix(standardized_uri(language, token)), 1.) for token in tokens
        ]
        return self.get_vector(weighted_terms, oov_vector=False)

    def get_vector(self, query, oov_vector=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `oov_vector=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()

        if isinstance(query, np.ndarray):
            return query
        elif isinstance(query, pd.Series) or isinstance(query, dict):
            terms = list(query.items())
        elif isinstance(query, pd.DataFrame):
            terms = list(query.to_records())
        elif isinstance(query, str):
            terms = [(query, 1.)]
        elif isinstance(query, list):
            terms = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))

        cache_key = tuple(terms + [oov_vector])
        if cache_key in self.cache:
            return self.cache[cache_key]

        oov_vector = oov_vector and (len(terms) <= 5)

        vec = self.expanded_vector(terms, oov_vector=oov_vector)
        self.cache[cache_key] = normalize_vec(vec)
        return self.cache[cache_key]

    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a Series of terms ranked by their similarity to the query.
        The query can be:

        - A pandas Series of weighted terms
        - A pandas DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term
        - An existing vector

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[: self.small_k]
        search_frame = self.small_frame
        # TODO: document filter
        if filter:
            exact_only = filter.count('/') >= 3
            if filter.endswith('/.'):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx : idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_idx, end_idx = self._index_prefix_range(filter + '/')
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
        similar_choices = l2_normalize_rows(
            self.frame.loc[similar_sloppy.index].astype('f')
        )

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar

    def get_similarity(self, query1, query2):
        vec1 = self.get_vector(query1)
        vec2 = self.get_vector(query2)
        return cosine_similarity(vec1, vec2)

    def _terms_with_prefix(self, prefix):
        """
        Get a list of terms whose URI begins with the given prefix. The list
        will be in an arbitrary order.
        """
        return self._trie.keys(prefix)

    def _index_prefix_range(self, prefix):
        """
        Get the range of indices on the DataFrame we're wrapping that begin
        with a given prefix.

        The range is a pair of index numbers. Following the convention of
        Python ranges, the starting index is inclusive, while the end index
        is exclusive.

        Returns the empty range (0, 0) if no terms begin with this prefix.
        """
        # Use the trie to find all terms with the given prefix. Then sort them,
        # because the range will span from our first prefix in sorted
        # order to just after our last.
        terms = sorted(self._terms_with_prefix(prefix))
        if not terms:
            return (0, 0)

        start_loc = self.frame.index.get_loc(terms[0])
        end_loc = self.frame.index.get_loc(terms[-1]) + 1
        return start_loc, end_loc
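
The prefix expansion in this version is backed by marisa_trie; a standalone sketch of the same idea with toy terms:

import marisa_trie

terms = ['/c/en/cat', '/c/en/caterpillar', '/c/en/dog']
trie = marisa_trie.Trie(terms)

# Every stored key that begins with the prefix, in arbitrary order
print(trie.keys('/c/en/cat'))   # ['/c/en/cat', '/c/en/caterpillar']
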
Example #5
class VectorSpaceWrapper(object):
    """
    An object that wraps the data necessary to look up vectors for terms
    (in the vector space named ConceptNet Numberbatch) and find related terms.

    The filenames usually don't need to be specified, because the system will
    look in default locations for them. They can be specified to replace them
    with toy versions for testing, or to evaluate how other embeddings perform
    while still using ConceptNet for looking up words outside their vocabulary.
    """
    def __init__(self, vector_filename=None, frame=None, use_db=True):
        if frame is None:
            self.frame = None
            self.vector_filename = vector_filename or get_data_filename('vectors/mini.h5')
        else:
            self.frame = frame
            self.vector_filename = None
        self.small_frame = None
        self.k = None
        self.small_k = None
        self.finder = None
        if use_db:
            self.finder = AssertionFinder()

    def load(self):
        """
        Ensure that all the data is loaded.
        """
        if self.small_frame is not None:
            return
        try:
            if self.frame is None:
                self.frame = load_hdf(self.vector_filename)

            if not self.frame.index[0].startswith('/c/'):
                # These terms weren't in ConceptNet standard form. Assume
                # they're in English, and stick the English language tag on
                # them without any further transformation, so we can be sure
                # we're evaluating the vectors as provided.
                self.finder = None
                self.frame.index = [
                    '/c/en/' + label
                    for label in self.frame.index
                ]

            self.k = self.frame.shape[1]
            self.small_k = 100
            self.small_frame = self.frame.iloc[:, :self.small_k].copy()
        except OSError:
            raise MissingVectorSpace(
                "Couldn't load the vector space %r. Do you need to build or "
                "download it?" % self.vector_filename
            )

    @staticmethod
    def passes_filter(label, filter):
        if filter is None:
            return True
        else:
            return field_match(label, filter)

    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors. This forms a reasonable
        approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            # TODO: this disagrees with the docstring about whether neighbors
            # are added to non-OOV terms
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge['start']['term'], term) and not field_match(edge['end']['term'], term):
                        neighbor = edge['end']['term']
                    elif field_match(edge['end']['term'], term) and not field_match(edge['start']['term'], term):
                        neighbor = edge['start']['term']
                    else:
                        continue
                    # TODO: explain this formula
                    neighbor_weight = weight * min(10, edge['weight']) * 0.01
                    expanded.append((neighbor, neighbor_weight))

                prefix_weight = 0.01
                if not term.startswith('/c/en/'):
                    # FIXME: better language code handling
                    englishified = '/c/en/' + term[6:]
                    expanded.append((englishified, prefix_weight))

                while term:
                    if term.endswith('/'):
                        break
                    start_idx, end_idx = index_prefix_range(self.frame, term)
                    if end_idx > start_idx:
                        n_prefixed = end_idx - start_idx
                        for prefixed_term in self.frame.index[start_idx:end_idx]:
                            expanded.append((prefixed_term, prefix_weight / n_prefixed))
                        break
                    term = term[:-1]

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight) for (term, weight) in expanded]

    def expanded_vector(self, terms, limit_per_term=10, include_neighbors=True):
        # TODO: docstring
        self.load()
        return weighted_average(
            self.frame,
            self.expand_terms(terms, limit_per_term, include_neighbors)
        )

    def text_to_vector(self, language, text):
        # TODO: docstring -- is this only used for Story Cloze Test?
        tokens = wordfreq.tokenize(text, language)
        weighted_terms = [(standardized_uri(language, token), 1.) for token in tokens]
        return self.get_vector(weighted_terms, include_neighbors=False)

    def get_vector(self, query, include_neighbors=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `include_neighbors=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()
        # FIXME: is pd.DataFrame supposed to be pd.Series here?
        if isinstance(query, np.ndarray):
            return query
        elif isinstance(query, pd.DataFrame) or isinstance(query, dict):
            terms = list(query.items())
        elif isinstance(query, str):
            terms = [(query, 1.)]
        elif isinstance(query, list):
            terms = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))
        include_neighbors = include_neighbors and (len(terms) <= 5)
        vec = self.expanded_vector(terms, include_neighbors=include_neighbors)
        return normalize_vec(vec)

    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a Series of terms ranked by their similarity to the query.
        The query can be:

        - A DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term
        - An existing vector

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.

        TODO: is this sometimes returning a DataFrame? Should it accept a
        Series as well as a DataFrame?
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[:self.small_k]
        search_frame = self.small_frame
        if filter:
            exact_only = filter.count('/') >= 3
            # TODO: Is this duplicating something that field_match was supposed
            # to do?
            if filter.endswith('/.'):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx:idx+1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_idx, end_idx = index_prefix_range(search_frame, filter + '/')
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
        similar_choices = l2_normalize_rows(self.frame.loc[similar_sloppy.index].astype('f'))

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar

    def get_similarity(self, query1, query2):
        vec1 = self.get_vector(query1)
        vec2 = self.get_vector(query2)
        return cosine_similarity(vec1, vec2)
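
This version calls a module-level index_prefix_range(frame, prefix) helper that isn't shown in the example. A plausible sketch of it, assuming the frame's index is sorted (the exact upstream implementation may differ):

def index_prefix_range(frame, prefix):
    """
    Return (start, end) row positions of entries in frame.index that begin
    with `prefix`: start inclusive, end exclusive, with start == end when
    nothing matches.
    """
    # First row whose index entry sorts at or after the prefix
    start = frame.index.searchsorted(prefix)
    # First row that can no longer start with the prefix: increment the
    # prefix's last code point and search for that successor string.
    end = frame.index.searchsorted(prefix[:-1] + chr(ord(prefix[-1]) + 1))
    return int(start), int(end)
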
Example #6
class VectorSpaceWrapper(object):
    """
    An object that wraps the data necessary to look up vectors for terms
    (in the vector space named ConceptNet Numberbatch) and find related terms.

    The filenames usually don't need to be specified, because the system will
    look in default locations for them. They can be specified to replace them
    with toy versions for testing.
    """

    def __init__(self, vector_filename=None, frame=None, use_db=True):
        if frame is None:
            self.frame = None
            self.vector_filename = vector_filename or get_data_filename("vectors/mini.h5")
        else:
            self.frame = frame
            self.vector_filename = None
        self.small_frame = None
        self.k = None
        self.small_k = None
        self.finder = None
        self.standardized = None
        if use_db:
            self.finder = AssertionFinder()

    def load(self):
        """
        Ensure that all the data is loaded.
        """
        if self.small_frame is not None:
            return
        try:
            if self.frame is None:
                self.frame = load_hdf(self.vector_filename)

            if self.frame.index[0].startswith("/c/"):
                self.standardized = True
            else:
                # These terms weren't in ConceptNet standard form. Assume
                # they're in English, and stick the English language tag on
                # them without any further transformation, so we can be sure
                # we're evaluating the vectors as provided.
                self.standardized = False
                self.finder = None
                self.frame.index = ["/c/en/" + label for label in self.frame.index]

            self.k = self.frame.shape[1]
            self.small_k = 100
            self.small_frame = self.frame.iloc[:, : self.small_k].copy()
        except OSError:
            raise MissingVectorSpace(
                "Couldn't load the vector space %r. Do you need to build or "
                "download it?" % self.vector_filename
            )

    @staticmethod
    def passes_filter(label, filter):
        if filter is None:
            return True
        else:
            return field_match(label, filter)

    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors. This forms a reasonable
        approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            expanded.append((term, weight / 10))
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge["start"]["term"], term) and not field_match(edge["end"]["term"], term):
                        neighbor = edge["end"]["term"]
                    elif field_match(edge["end"]["term"], term) and not field_match(edge["start"]["term"], term):
                        neighbor = edge["start"]["term"]
                    else:
                        continue
                    neighbor_weight = weight * min(10, edge["weight"]) * 0.001
                    expanded.append((neighbor, neighbor_weight))

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight) for (term, weight) in expanded]

    def expanded_vector(self, terms, limit_per_term=10, include_neighbors=True):
        self.load()
        return weighted_average(self.frame, self.expand_terms(terms, limit_per_term, include_neighbors))

    def text_to_vector(self, language, text):
        tokens = wordfreq.tokenize(text, language)
        weighted_terms = [(standardized_uri(language, token), 1.0) for token in tokens]
        return self.get_vector(weighted_terms, include_neighbors=False)

    def get_vector(self, query, include_neighbors=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `include_neighbors=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()
        # Note: DataFrame.items() yields (column, Series) pairs, so a pandas
        # Series is more likely what's intended here.
        if isinstance(query, pd.DataFrame) or isinstance(query, dict):
            terms = list(query.items())
        elif isinstance(query, str):
            terms = [(query, 1.0)]
        elif isinstance(query, list):
            terms = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))
        include_neighbors = include_neighbors and (len(terms) <= 5)
        vec = self.expanded_vector(terms, include_neighbors=include_neighbors)
        return normalize_vec(vec)

    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a DataFrame of terms ranked by their similarity to the query.
        The query can be:

        - A DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[: self.small_k]
        search_frame = self.small_frame
        if filter:
            exact_only = filter.count("/") >= 3
            if filter.endswith("/."):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx : idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_key = filter
                # '0' is the character after '/', so end_key is the first possible
                # key that's not a descendant of the given filter key
                end_key = filter + "0"
                try:
                    start_idx = search_frame.index.get_loc(start_key, method="bfill")
                except KeyError:
                    start_idx = len(search_frame.index)
                try:
                    end_idx = search_frame.index.get_loc(end_key, method="bfill")
                except KeyError:
                    end_idx = len(search_frame.index)
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
        similar_choices = l2_normalize_rows(self.frame.loc[similar_sloppy.index].astype("f"))

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar

    def get_similarity(self, query1, query2):
        vec1 = self.get_vector(query1)
        vec2 = self.get_vector(query2)
        return cosine_similarity(vec1, vec2)
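
The get_loc(..., method="bfill") calls above depend on an argument that was removed in pandas 2.0. On newer pandas, the same prefix range can be computed with searchsorted; a sketch under that assumption:

import pandas as pd

def prefix_range(index, filter_key):
    """
    Start/end row positions for the keys under filter_key, equivalent to
    the bfill-based lookups above on a sorted index.
    """
    # '0' is the character after '/', so filter_key + '0' is the first
    # possible key that's not a descendant of filter_key.
    start = index.searchsorted(filter_key)
    end = index.searchsorted(filter_key + '0')
    return int(start), int(end)

idx = pd.Index(['/c/en/cat', '/c/en/cat/n', '/c/en/dog'])
print(prefix_range(idx, '/c/en/cat'))   # (0, 2)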