    def lookup_in_taxonomy(self):
        """
        From the Wikipedia slugs in self.spotted, fetch the related concepts in the taxonomy.

        :return: a set of labels of the concepts related to the slugs
        """
        from unidecode import unidecode

        base_url = "http://taxonomy.projectchronos.eu/space/dbpediadocs/{}"
        labels = []
        resource = None
        for res in self.spotted:
            #res = unidecode(res)
            try:
                # print base_url.format(res)
                resource = retrieve_json(base_url.format(res))
            except Exception as e:
                print Exception('lookup_in_taxonomy(): Cannot fetch taxonomy: ' + res.encode('ascii', 'replace') + ' ' + str(e))
                continue  # skip this slug; do not reuse the resource fetched for the previous one

            if resource and 'relatedConcepts' in resource.keys():
                for c in resource['relatedConcepts']:
                    if c:
                        try:
                            resource = retrieve_json(c)
                        except Exception:
                            try:
                                # Fallback: deduce the keyword from the concept URL slug and
                                # query the /concepts/c/ endpoint instead; to_be_corrected
                                # (defined at the bottom of the module) maps known-bad
                                # keywords to their corrected form.
                                kwd = c[c.rfind('/') + 1:].replace("+", " ").strip()
                                # print kwd, kwd in to_be_corrected.keys()
                                taxonomy = "http://taxonomy.projectchronos.eu/concepts/c/{}"
                                if kwd in to_be_corrected.keys():
                                    kwd = to_be_corrected[kwd].replace(" ", "+")
                                    # print kwd
                                else:
                                    kwd = kwd.replace(" ", "+")
                                try:
                                    resource = retrieve_json(taxonomy.format(kwd))
                                except Exception as e:
                                    raise ValueError("Cannot deduce keyword. Cannot fetch relatedConcepts: " + str(e))
                            except ValueError as e:
                                print str(e)
                                continue
                            except Exception as e:
                                print Exception('lookup_in_taxonomy(): ' + c + ' Concept is not in the space api ' + str(e))
                                continue

                        # Resource is found in the taxonomy
                        label = resource['label']
                        labels.append(str(label))
        return set(labels)
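    # Hypothetical usage sketch (instance and attribute names as used above, not
    # part of the original module): with self.spotted already filled with
    # Wikipedia slugs, calling instance.lookup_in_taxonomy() returns the set of
    # taxonomy labels for the concepts related to those slugs. It relies on the
    # module-level helpers retrieve_json and to_be_corrected.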
    def find_term_ancestorship(cls, term):
        """
        Return the genealogy of a term in the Taxonomy.

        :param term: a term in the taxonomy
        :return: a dict with keys 'slug', 'term' and 'ancestors' (the term's ancestor labels, root first)
        """
        term = term.replace(" ", "+").lower()
        print term
        base_url = "http://taxonomy.projectchronos.eu/concepts/c/{}"
        try:
            resource = retrieve_json(base_url.format(term))
        except Exception:
            raise ValueError("find_term_ancestorship(): term is not in the taxonomy. Wrong term.")

        def get_ancestor(obj, ancestors=tuple()):
            if 'ancestor' not in obj.keys():
                return ancestors

            new_obj = retrieve_json(obj['ancestor'])
            ancestors += (new_obj['label'], )
            return get_ancestor(new_obj, ancestors)

        return {
            "slug": term,
            "term": term.replace("+", " "),
            "ancestors": list(get_ancestor(resource))[::-1]
        }
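    # Hypothetical usage sketch (the term is illustrative, not from the original):
    #     find_term_ancestorship("orbital mechanics")
    # slugifies the term to "orbital+mechanics", fetches
    # http://taxonomy.projectchronos.eu/concepts/c/orbital+mechanics and walks the
    # 'ancestor' links upward, returning
    #     {"slug": "orbital+mechanics", "term": "orbital mechanics",
    #      "ancestors": [...]}  # ancestor labels ordered from the root downwards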
def lookup_in_taxonomy(results):
    """
    from wikipedia slugs fetch relatedConcepts in the taxonomy
    :param results: a list of wikipedia slugs
    :return: a set of related concepts to the slugs
    """
    from unidecode import unidecode

    base_url = "http://taxonomy.projectchronos.eu/space/dbpediadocs/{}"
    labels = []
    resource = None
    for res in results:
        res = unidecode(res)
        try:
            # print base_url.format(res)
            resource = retrieve_json(base_url.format(res))
        except Exception as e:
            print Exception('Cannot fetch taxonomy: ' + res.encode('ascii', 'replace') + ' ' + str(e))
            continue  # skip this slug; do not reuse the resource fetched for the previous one

        if resource and 'relatedConcepts' in resource.keys():
            for c in resource['relatedConcepts']:
                if c:
                    label = c[c.rfind('/') + 1:].replace('+', ' ')
                    # print 'Found! ' + label
                    labels.append(str(label))
    return set(labels)
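# Hypothetical usage sketch (slugs are illustrative, not from the original):
#     lookup_in_taxonomy(["Hubble_Space_Telescope", "Ariane_5"])
# fetches http://taxonomy.projectchronos.eu/space/dbpediadocs/<slug> for each
# slug and returns the set of labels extracted from the 'relatedConcepts' URLs.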
    def relate(cls, titles, comparing=None, min_rho=0.42):
        """
        Implement the TagMe's 'relating API'
        :param titles: a string that is wikipedia title or a list of titles
        :param comparing: a string that is wikipedia title or a list of titles
        :param min_rho: the minimum rho to filter the relation
        :return: a list of results objects with "rel" > min_rho
        """
        endpoint = "http://tagme.di.unipi.it/rel"
        if comparing is None:
            comparing = cls.return_gen_scopes()

        params = {
            "key": api_key,
            "tt": []
        }

        if isinstance(titles, list):
            if len(titles) > 1:
                if isinstance(comparing, list):
                    for c in comparing:
                        for t in titles:
                            if t != c:
                                params["tt"].append(t + ' ' + c)
                elif isinstance(comparing, str):
                    for t in titles:
                        if t != comparing:
                            params["tt"].append(t + ' ' + comparing)
            elif len(titles) == 1:
                if isinstance(comparing, list):
                    for c in comparing:
                        if titles[0] != c:
                            params["tt"].append(titles[0] + ' ' + c)
                elif isinstance(comparing, str):
                    params["tt"].append(titles[0] + ' ' + comparing)
            else:
                raise ConceptNotInDBpedia('This term does not have annotations, so it cannot be searched for related')
        else:
            if isinstance(comparing, list):
                for c in comparing:
                    if titles != c:
                        params["tt"].append(titles + ' ' + c)
            elif isinstance(comparing, str):
                params["tt"].append(titles + ' ' + comparing)

        url = endpoint + '?' + urllib.urlencode(params, True)

        try:
            results = retrieve_json(url)
            #pprint(results)
        except (Exception, ValueError):
            raise BadRequest('Error in connection or in JSON parsing the response from TagMe Relating API')

        try:
            if int(results["errors"]) == 0 and len(results["result"]) != 0:
                output = [r for r in results["result"] if float(r["rel"]) > min_rho]
                return output
            else:
                for r in results["result"]:
                    if 'err' in r.keys():
                        print(BadRequest("Error in request data to TagMe: " + str(r['err'] + " in " + str(r))))
                        return None
        except (Exception, KeyError) as e:
            raise e

        raise BadRequest('TagMe API responded with an error: ' + str(results))
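    # Hypothetical usage sketch (titles are illustrative; the class name is not
    # shown in the original):
    #     relate(["Mars", "Phobos"], comparing="Spaceflight", min_rho=0.3)
    # builds the pairs "Mars Spaceflight" and "Phobos Spaceflight", sends them as
    # the "tt" parameter to http://tagme.di.unipi.it/rel and keeps only the
    # result objects whose "rel" score is above min_rho.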
    def retrieve_taggings(term, method='GET'):
        """
        Find annotations of a given concept/word/sets of words
        'http://tagme.di.unipi.it/tag?key=****&text=Recent+poll+show+President+Obama+opening+up+a+small+lead'
        :param term: the text to analyze as a byte (b'string')
        :return: dictionary with loaded JSON from response
        """
        endpoint = "http://tagme.di.unipi.it/tag"
        # print(term)
        if str(term) == 'None':
            return {"timestamp": time.time(), "time": 0, "api": "tag", "annotations": [], "lang": "en"}
        params = {
            "key": api_key,
            "text": term,
            "include_categories": True,
            "include_abstract": True,
            "include_all_spots": True
        }
        if method == 'GET':
            url = endpoint + '?' + urllib.urlencode(params)
            data = None
            # print(url)
        elif method == 'POST':
            url = endpoint
            data = params
        else:
            raise BadRequest('retrieve_taggings(): Wrong HTTP Verb')

        results = {"timestamp": time.time(), "time": 0, "api": "tag", "annotations": [], "lang": "en"}
        try:
            results = retrieve_json(url, method=method, data=data)
        except (Exception, ValueError):
            return results

        if "errors" not in results.keys() or results["errors"] == 0:
            return results

        raise BadRequest('retrieve_taggings() Failed')
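    # Hypothetical usage sketch (text taken from the docstring example above):
    #     retrieve_taggings(b'Recent poll show President Obama opening up a small lead')
    # calls http://tagme.di.unipi.it/tag with include_categories, include_abstract
    # and include_all_spots enabled and returns the parsed JSON; on a connection
    # or parsing error it falls back to an empty "annotations" payload.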
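# The snippets above all rely on a module-level helper retrieve_json whose
# implementation is not included here. The sketch below is only a guess at a
# minimal Python 2 version matching the call sites used above
# (retrieve_json(url) and retrieve_json(url, method='POST', data=...));
# the real helper may differ.
import json
import urllib
import urllib2


def retrieve_json(url, method='GET', data=None):
    """Fetch url and return its body parsed as JSON (hedged sketch)."""
    if method == 'POST' and data is not None:
        # urllib2 issues a POST when a body is passed alongside the URL
        response = urllib2.urlopen(url, urllib.urlencode(data))
    else:
        response = urllib2.urlopen(url)
    return json.loads(response.read())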