Example #1
 def __init__(self, location='us', key=None):
     """ init """
     assert (location in AMAZON_LOCATIONS)
     self.retrieve = Retrieve(self.__class__.__name__)
     self.wsBase = AMAZON_LOCATIONS[location]
     self.accessKey = key or AMAZON_ACCESS_KEY
     self.amazon_url = AmazonUrl()
Example #2
    def getRelatedTags(tags):
        """ fetches the related tags for the given tags
            @param list of tags
            @return dictionary of related tags with count
        """

        if isinstance(tags, str):
            url = Twitter.TWITTER_SEARCH_URL % tags
        else:
            url = Twitter.TWITTER_SEARCH_URL % "+".join(tags)

        f = Retrieve(Twitter.__name__).open(url)

        # convert json into dict and remove null values with ""
        search_results = eval(re.sub('null', '""', f.read()))
        found_tags = []
        related_tags = {}

        for result in search_results['results']:
            found_tags.extend(Twitter.RE_FIND_TAGS.findall(result['text']))

        for tag in found_tags:
            related_tags[tag.lower()] = related_tags.get(tag.lower(), 0) + 1

        # todo: sort

        return related_tags
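Note: parsing the response with eval() after regex-replacing "null" is fragile and unsafe for untrusted input. A minimal sketch of the same counting logic using the standard json module instead (the hashtag pattern is an assumption standing in for Twitter.RE_FIND_TAGS):

import json
import re

RE_FIND_TAGS = re.compile(r'#\w+')  # assumed stand-in for Twitter.RE_FIND_TAGS

def related_tag_counts(raw_json):
    """ counts hashtags in a Twitter search response without eval() """
    search_results = json.loads(raw_json)  # JSON null becomes None natively
    related_tags = {}
    for result in search_results.get('results', []):
        for tag in RE_FIND_TAGS.findall(result.get('text') or ''):
            related_tags[tag.lower()] = related_tags.get(tag.lower(), 0) + 1
    return related_tags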
Example #3
    def getRelatedTags(tags):
        """ fetches the related tags for the given tags
            @param list of tags
            @return dictionary of related tags with count
        """

        if isinstance(tags, str):
            url = Twitter.TWITTER_SEARCH_URL % tags
        else:   
            url = Twitter.TWITTER_SEARCH_URL % "+".join(tags)

        f = Retrieve(Twitter.__name__).open(url)

        # convert json into dict and remove null values with ""
        search_results = eval(re.sub('null', '""', f.read()))
        found_tags = []
        related_tags = {}

        for result in search_results['results']:
            found_tags.extend(Twitter.RE_FIND_TAGS.findall(result['text']))

        for tag in found_tags:
            related_tags[tag.lower()] = related_tags.get(tag.lower(), 0) + 1

        # todo: sort

        return related_tags
Example #4
def get_new_access_token(client_id=FACEBOOK_APPLICATION_ID,
                         client_secret=FACEBOOK_SECRET_KEY,
                         access_token=FACEBOOK_ACCESS_KEY):
    ''' requests a fresh access token from the Graph API and returns it
        (falls back to the current token if it is unchanged) '''
    url = API_URL.format(client_id=client_id,
                         client_secret=client_secret,
                         access_token=access_token)
    
    retrieve = Retrieve('fb')
    x = retrieve.open(url)
    result = x.read() 
    new_access_token = access_token
    
    for key, param in urlparse.parse_qs(result).iteritems():
        print key, param
        if key == 'access_token':
            if isinstance(param, list):
                param = param[0]
            
            if param == access_token:
                print 'access token still the same'
            else: 
                print 'got new access_token %s' % param
                new_access_token = param
                
    return new_access_token
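The snippet above is Python 2 (print statements, iteritems). A hedged Python 3 sketch of the same query-string handling (function and parameter names are illustrative):

from urllib.parse import parse_qs

def extract_access_token(response_body, current_token):
    """ returns the access_token from a query-string response body,
        falling back to the current token if none was returned """
    params = parse_qs(response_body)
    return params.get('access_token', [current_token])[0]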
Example #5
    def get_content(url):
        """ returns the content from Flickr """
        assert(url.startswith("http"))

        f = Retrieve(Flickr.__name__).open(url)
        content = f.read()
        f.close()
        return content
Example #6
    def get_content(url):
        """ returns the content from Flickr """
        assert (url.startswith("http"))

        f = Retrieve(Flickr.__name__).open(url)
        content = f.read()
        f.close()
        return content
Example #7
    def _get_content(url):
        """ returns the content from delicious """
        assert(url.startswith("http"))

        f = Retrieve(Delicious.__name__).open(url)
        content = f.read()
        f.close()
        sleep(1)
        return content
Example #8
    def testRetrieval(self):
        ''' tries to retrieve each URL from the list '''

        r_handler = Retrieve(self.__class__.__name__)
        for url in self.TEST_URLS:
            print(url)
            r = r_handler.open(url)
            r.read()
            r.close()
Example #9
    def testRetrievalTimeout(self):
        ''' tests whether the socket timeout is honored by our class '''
        SLOW_URL = "http://www.csse.uwa.edu.au/"

        with raises(urllib2.URLError):
            r = Retrieve(self.__class__.__name__,
                         default_timeout=0.1).open(SLOW_URL)
            content = r.read()
            r.close()
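The same timeout behaviour can be checked against the standard library alone; a sketch assuming Python 3, where urllib.request replaces the urllib2 module used above:

import socket
import urllib.request
from urllib.error import URLError

try:
    urllib.request.urlopen("http://www.csse.uwa.edu.au/", timeout=0.1)
except (URLError, socket.timeout):
    print("timed out as expected")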
Example #10
    def _get_content( url ):
        """ returns the content from delicious """
        assert( url.startswith("http") )

        f = Retrieve(Delicious.__name__).open(url)
        content = f.read()
        f.close()
        sleep(1)
        return content
Example #11
    def __init__(self, user, password,
                 local_dir=LOCAL_DIR,
                 server_url=SERVER_URL,
                 max_age_hours=MAX_AGE_HOURS):

        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        self.max_file_age = datetime.now() - timedelta(hours=max_age_hours)
        self.local_dir = local_dir
        self.server_url = server_url
        self.retrieve = Retrieve(__file__)
        self.user = user
        self.password = password
Example #12
def t_retrieve(url):
    ''' retrieves the given url from the web

        @remarks
        helper module for the testMultiProcessing unit test.
    '''
    r = Retrieve(__name__).open(url)
    try:
        content = r.read()
    finally:
        # this is required as GzipFile does not support the context protocol
        # in python 2.6
        r.close()
    return content
Example #13
    def fix_urls(cls, urls, user=None, password=None):
        ''' fixes the URLs and puts them into the correct format, to maintain
        compatibility with the remaining platform
        :param urls: service urls
        :type urls: string or list or tuple
        :param user: username
        :param password: password
        :returns: correctly formatted URLs
        :rtype: list
        '''
        correct_urls = []

        if isinstance(urls, string_types):
            urls = [urls]

        for url in urls:
            if not url.endswith('/'):
                url = '%s/' % url

            if 'rest' not in url:
                if cls.URL_PATH and not url.endswith(cls.URL_PATH):
                    # append the REST path without mutating cls.URL_PATH
                    url = '%s%s' % (url, cls.URL_PATH.lstrip('/'))

            if user and password:
                url = Retrieve.add_user_password(url, user, password)

            correct_urls.append(url)

        return correct_urls
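A hedged usage sketch, assuming fix_urls is exposed as a classmethod on a RESTClient subclass that sets URL_PATH (the class name, path, and credentials below are hypothetical):

class ExampleClient(RESTClient):
    URL_PATH = 'rest/'

urls = ExampleClient.fix_urls('http://example.org', user='alice', password='s3cret')
# expected, assuming Retrieve.add_user_password embeds the credentials
# in the netloc: ['http://alice:s3cret@example.org/rest/']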
Example #14
    def __init__(self,
                 service_url,
                 user=None,
                 password=None,
                 authentification_method='basic',
                 module_name='eWRT.REST',
                 default_timeout=WS_DEFAULT_TIMEOUT):
        ''' :param service_url: the base url of the web service
            :param module_name: the module name to add to the USER AGENT
                               description (optional)
            :param user: username
            :param password: password
            :param authentification_method: authentification method to use
                                            ('basic'*, 'digest').
        '''
        # remove superfluous slashes, if required
        self.service_url = service_url[:-1] if service_url.endswith("/") \
            else service_url
        self.user = user
        self.password = password

        if not default_timeout:
            default_timeout = WS_DEFAULT_TIMEOUT

        url_obj = Retrieve(module_name,
                           sleep_time=0,
                           default_timeout=default_timeout)
        self.retrieve = partial(
            url_obj.open,
            user=user,
            pwd=password,
            authentification_method=authentification_method)
Example #15
 def __init__(self, location='us', key=None):
     """ init """
     assert (location in AMAZON_LOCATIONS)
     self.retrieve = Retrieve(self.__class__.__name__)
     self.wsBase = AMAZON_LOCATIONS[location]
     self.accessKey = key or AMAZON_ACCESS_KEY
     self.amazon_url = AmazonUrl()
Example #16
def retrieve_conceptnet_query_result(query):
    ''' ::param url: the url to retrieve
        ::return: the json response to the given conceptnet query
    '''
    with Retrieve(__name__) as r:
        c = r.open(query, retry=3)
        return c.read()
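Retrieve is used as a context manager here, which guarantees cleanup even when the request fails. A minimal reusable sketch along the same lines (the import path eWRT.access.http is an assumption):

from eWRT.access.http import Retrieve  # assumed import path

def fetch(url, retries=3):
    ''' fetches a URL with retries and returns the raw body '''
    with Retrieve(__name__) as r:
        return r.open(url, retry=retries).read()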
Example #17
    def fix_urls(cls, urls, user=None, password=None):
        ''' fixes the URLs and puts them into the correct format, to maintain
        compatibility with the remaining platform
        :param urls: service urls
        :type urls: string or list or tuple
        :param user: username
        :param password: password
        :returns: correctly formatted URLs
        :rtype: list
        '''
        correct_urls = []

        if isinstance(urls, string_types):
            urls = [urls]

        for url in urls:
            if not url.endswith('/'):
                url = '%s/' % url

            if 'rest' not in url:
                if cls.URL_PATH and not url.endswith(cls.URL_PATH):
                    # append the REST path without mutating cls.URL_PATH
                    url = '%s%s' % (url, cls.URL_PATH.lstrip('/'))

            if user and password:
                url = Retrieve.add_user_password(url, user, password)

            correct_urls.append(url)

        return correct_urls
Example #18
    def get_content(url):
        """ returns the content from Technorati """
        assert(url.startswith("http"))

        logger.debug('Fetching content for URL %s' % url)

        if (time.time() - Technorati.last_access) < SLEEP_TIME:
            logger.debug('Sleeping %s seconds!' % SLEEP_TIME)
            time.sleep(SLEEP_TIME)

        Technorati.last_access = time.time()

        f = Retrieve("%s_new" % Technorati.__name__).open(url)
        content = f.read()
        f.close()
        return content
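The class-level last_access timestamp implements a simple per-service throttle. A generic sketch of the pattern (names are illustrative; unlike the code above, it sleeps only for the remaining interval instead of the full SLEEP_TIME):

import time

class Throttle(object):
    ''' enforces a minimum interval between successive calls '''

    def __init__(self, min_interval=1.0):
        self.min_interval = min_interval
        self.last_access = 0.0

    def wait(self):
        elapsed = time.time() - self.last_access
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self.last_access = time.time()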
Example #19
 def get_content(url, sleep_time=SLEEP_TIME):
     ''' fetches the content
     @param url: url to fetch
     @param sleep_time: time to sleep
     @return: HTML string'''
     assert (url.startswith("http"))
     return Retrieve("GoogleBlogSearch",
                     sleep_time=sleep_time).open(url).read()
Example #20
class WikiPedia(object):
    """ returns an WikiPedia Object  """
    def __init__(self):
        self.r = Retrieve(WikiPedia.__name__)

    def getWikiPage(self, pageName, lang='en'):
        """ returns the given wikipedia page considering different spellings 
            @param[in] pageName
            @param[in] language (determines which wikipedia to query)
            @returns the page's wikipedia text
        """
        assert (len(lang) == 2)

        for pn in self._getPageNameAlterations(pageName):
            pageContent = self._retrievePage(pn, lang)
            if pageContent:
                return pageContent

        return None

    @staticmethod
    def _getPageNameAlterations(pageName):
        """ @returns a list of differnt names for the given page """

        alt = [
            pageName,
        ]
        if ' ' not in pageName:
            # nothing to vary for single-word names
            return alt

        words = pageName.split(" ")
        alt.append(
            "%s %s" %
            (words[0].capitalize(), " ".join(map(str.lower, words[1:]))))
        return alt

    def _retrievePage(self, pageName, lang):
        """ retrieves the given Wiki page
            @param[in] pageName
            @param[in] language (determines which wikipedia to query)
            @returns the page's wikipedia text
        """
        param = urlencode({
            'action': 'query',
            'format': 'json',
            'export': '',
            'redirects': 'true',
            'titles': pageName
        })
        data = self.r.open(WIKIPEDIA_API_QUERY % lang, param).read()
        jsonData = eval(data)['query']
        if '-1' in jsonData['pages']:
            return None

        xmlData = jsonData['export']['*'].replace("\\/", "/")
        return parseString(xmlData).getElementsByTagName(
            'text')[0].firstChild.data
Example #21
    def testGettingUserPassword(self):
        urls = (('http://irgendwas.com', None, None),
                ('http://heinz:secret@irgendwas.com', 'heinz', 'secret'))

        for test_url, exp_user, exp_passwd in urls:
            print('testing url ' + test_url)
            url, user, passwd = Retrieve.get_user_password(test_url)
            assert user == exp_user
            assert passwd == exp_passwd
            if user:
                assert url != test_url
Example #22
def parse(url, last_modified=None):
    """ 
    Parses the given RSS Feed an returns all articles and the content of
    the page referenced in the <link> tag.
    
    @param url: the url of the rss feed
    @param last_modified: a datetime object that specifies the last time the
                          feed has been queried the last time (only newer 
                          entries are returned).  
    """
    feed = feedparser.parse(url, modified=last_modified)
    retrieve = Retrieve("rss", HTTP_FETCH_DELAY)

    result = []
    for item in feed['items']:
        if datetime.fromtimestamp(mktime(
                item['updated_parsed'])) > last_modified:
            item['content'] = retrieve.open(item['link']).read()
            result.append(item)

    return result
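A hedged usage sketch (the feed URL is a placeholder; note that parse() compares entry dates against last_modified, so a real datetime must be passed):

from datetime import datetime, timedelta

cutoff = datetime.now() - timedelta(days=1)
for item in parse('http://example.org/feed.rss', last_modified=cutoff):
    print(item['title'], len(item['content']))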
Example #23
    def getNeighbors(geo_entity):
        """ returns all neighbours for the given geo id
            (currently only implemented on a country level)
            @param[in] geo_entity
            @returns a list containing the neighbours of the given country """

        url = GeoNames.NEIGHBOURS_SERVICE_URL % geo_entity.id
        jsonData = eval(Retrieve('eWRT.ws.geonames').open(url, retry=5).read())
        if 'geonames' in jsonData:
            return filter(None, [GeoNames.getGeoEntity(GeoEntity.factory(id=e['geonameId'])) for e in jsonData['geonames']])
        else:
            return []
Example #24
def parse(url, last_modified=None):
    """ 
    Parses the given RSS Feed an returns all articles and the content of
    the page referenced in the <link> tag.
    
    @param url: the url of the rss feed
    @param last_modified: a datetime object that specifies the last time the
                          feed has been queried the last time (only newer 
                          entries are returned).  
    """
    feed = feedparser.parse(url, modified=last_modified)
    retrieve = Retrieve("rss", HTTP_FETCH_DELAY)
    
    result = []
    for item in feed['items']:
        if datetime.fromtimestamp(
                mktime(item['updated_parsed'])) > last_modified:
            item['content'] = retrieve.open(item['link']).read()
            result.append(item)

    return result
Example #25
class WikiPedia(object):
    """ returns an WikiPedia Object  """

    def __init__(self):
        self.r = Retrieve( WikiPedia.__name__ )
    
    def getWikiPage(self, pageName, lang='en'):
        """ returns the given wikipedia page considering different spellings 
            @param[in] pageName
            @param[in] language (determines which wikipedia to query)
            @returns the page's wikipedia text
        """
        assert( len(lang)==2 )

        for pn in self._getPageNameAlterations( pageName ):
            pageContent = self._retrievePage( pn, lang )
            if pageContent:
                return pageContent

        return None

    @staticmethod
    def _getPageNameAlterations(pageName):
        """ @returns a list of differnt names for the given page """

        alt = [pageName, ]
        if ' ' not in pageName:
            # nothing to vary for single-word names
            return alt

        words = pageName.split(" ")
        alt.append( "%s %s" % (words[0].capitalize(), " ".join( map(str.lower, words[1:] ) )) )
        return alt

    def _retrievePage(self, pageName, lang):
        """ retrieves the given Wiki page
            @param[in] pageName
            @param[in] language (determines which wikipedia to query)
            @returns the page's wikipedia text
        """
        param = urlencode( {'action': 'query',
                            'format':'json', 
                            'export':'',
                            'redirects':'true',
                            'titles':pageName 
        })
        data = self.r.open( WIKIPEDIA_API_QUERY % lang, param ).read()
        jsonData = eval( data  )['query']
        if '-1' in jsonData['pages']:
            return None

        xmlData = jsonData['export']['*'].replace("\\/", "/")
        return parseString( xmlData  ).getElementsByTagName('text')[0].firstChild.data
Example #26
    def _connect_clients(cls, service_urls, user=None, password=None,
                         default_timeout=WS_DEFAULT_TIMEOUT):

        clients = []

        if isinstance(service_urls, str):
            service_urls = [service_urls]

        for url in service_urls:
            service_url, user, password = Retrieve.get_user_password(url)

            clients.append(RESTClient(service_url=service_url,
                                      user=user,
                                      password=password,
                                      default_timeout=default_timeout))
        return clients
Example #27
    def _connect_clients(cls, service_urls, user=None, password=None,
                         default_timeout=WS_DEFAULT_TIMEOUT):

        clients = []

        if isinstance(service_urls, str):
            service_urls = [service_urls]

        for url in service_urls:
            service_url, new_user, new_password = Retrieve.get_user_password(url)

            clients.append(RESTClient(service_url=service_url,
                                      user=user or new_user,
                                      password=password or new_password,
                                      default_timeout=default_timeout))
        return clients
Example #28
class Yahoo(TagInfoService):
    """ interfaces with yahoo's search service 
        * Search: Yahoo! BOSS
          (see http://developer.yahoo.com/search/boss)
    """
    __slots__ = ('r', )

    def __init__(self):
        self.r = Retrieve( Yahoo.__name__, sleep_time=0 )

    def query(self, terms, count=0, queryParams=None):
        """ returns search results for the given terms
            @param[in] terms       ... a list of search terms
            @param[in] count       ... number of results to return (0 if we are
                                       interested in the search metadata only)
            @param[in] queryParams ... a dictionary of query parameters to add
                                       to the request
            @returns the search results
        """
        assert isinstance(terms, (tuple, list))
        # avoid the mutable default argument; copy so the caller's
        # dictionary is not modified
        queryParams = dict(queryParams or {})
        queryParams.update({'appid': YAHOO_APP_ID,
                            'count': count,
                            'format': 'json'})
        params = urlencode(queryParams)
        url = YAHOO_SEARCH_URL % "%2B".join(map(quote, terms)) + "?" + params
        print(url)
        try:
            result = eval( self.r.open(url).read().replace("\\/", "/" ))
            return result['ysearchresponse']
        except (timeout, URLError):
            return ""

    @staticmethod
    def getSearchResults(query_result):
        """ returns a list of all search results returned by the given
            query result.
            @param[in] query_result     Result of the query
        """
        return [ YahooSearchResult(r) for r in query_result['resultset_web'] ] \
           if 'resultset_web' in query_result else []


    def getTagInfo(self, tag):
        """ @Override """
        return int( self.query(tag)['totalhits'] )
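A hypothetical usage sketch (requires a configured YAHOO_APP_ID; 'totalhits' and 'resultset_web' are taken from the BOSS response format assumed above):

y = Yahoo()
results = y.query(('climate', 'change'), count=10)
for r in Yahoo.getSearchResults(results):
    print(r)
print(y.getTagInfo(('climate',)))  # total number of hits for the tag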
Example #29
class Yahoo(TagInfoService):
    """ interfaces with yahoo's search service 
        * Search: Yahoo! BOSS
          (see http://developer.yahoo.com/search/boss)
    """
    __slots__ = ('r', )

    def __init__(self):
        self.r = Retrieve( Yahoo.__name__, sleep_time=0 )

    def query(self, terms, count=0, queryParams=None):
        """ returns search results for the given terms
            @param[in] terms       ... a list of search terms
            @param[in] count       ... number of results to return (0 if we are
                                       interested in the search metadata only)
            @param[in] queryParams ... a dictionary of query parameters to add
                                       to the request
            @returns the search results
        """
        assert isinstance(terms, (tuple, list))
        # avoid the mutable default argument; copy so the caller's
        # dictionary is not modified
        queryParams = dict(queryParams or {})
        queryParams.update({'appid': YAHOO_APP_ID,
                            'count': count,
                            'format': 'json'})
        params = urlencode(queryParams)
        url = YAHOO_SEARCH_URL % "%2B".join(map(quote, terms)) + "?" + params
        print url
        try:
            result = eval( self.r.open(url).read().replace("\\/", "/" ))
            return result['ysearchresponse']
        except URLError:
            return ""

    @staticmethod
    def getSearchResults(query_result):
        """ returns a list of all search results returned by the given
            query result.
            @param[in] query_result     Result of the query
        """
        return [ YahooSearchResult(r) for r in query_result['resultset_web'] ] \
           if 'resultset_web' in query_result else []


    def getTagInfo(self, tag):
        """ @Override """
        return int( self.query(tag)['totalhits'] )
Example #30
class YahooTermExtractor(object):
    """ interfaces with yahoo's search service 
        * Term extraction: extract terms from yahoo search
          http://developer.yahoo.com/search/content/V1/termExtraction.html
    """
    __slots__ = ('r', )

    def __init__(self):
        self.r = Retrieve( YahooTermExtractor.__name__ )

    def extractTerms(self, content):
        """ extract terms from yahoo search, see http://developer.yahoo.com/search/content/V1/termExtraction.html """ 

        params = urlencode( {'appid': YAHOO_APP_ID,
                             'context': content,
                             'output': 'json'
        })
        result = eval ( self.r.open(YAHOO_TERM_EXTRACTION_URI, params).read() )
        return result['ResultSet']['Result']
Example #31
class YahooTermExtractor(object):
    """ interfaces with yahoo's search service 
        * Term extraction: extract terms from yahoo search
          http://developer.yahoo.com/search/content/V1/termExtraction.html
    """
    __slots__ = ('r', )

    def __init__(self):
        self.r = Retrieve( YahooTermExtractor.__name__ )

    def extractTerms(self, content):
        """ extract terms from yahoo search, see http://developer.yahoo.com/search/content/V1/termExtraction.html """ 

        params = urlencode( {'appid': YAHOO_APP_ID,
                             'context': content,
                             'output': 'json'
        })
        result = eval ( self.r.open(YAHOO_TERM_EXTRACTION_URI, params).read() )
        return result['ResultSet']['Result']
Example #32
class WikiPedia(object):
    """ returns a wikipedia article """

    def __init__(self):
        self.r = Retrieve( WikiPedia.__name__ )
    
    def getDescriptor(self, synonym, lang='en'):
        """ returns the descriptor for the given synonym in the diven language """
        assert( len(lang)==2 )
        try:
            result = self.getWikipediaSearchResults(synonym, lang)
            return result[0]
        except (HTTPError, IndexError):
            return None


    def getWikipediaSearchResults(self, term, lang):
        """ returns a list of wikipedia search results for the given term 
            or None if nothing was found 
        """
        search_query = WIKIPEDIA_SEARCH_QUERY % (lang, quote(term) )
        f=self.r.open(search_query)
        results = WikiPedia._parse_wikipedia_search_results( f.read() )
        f.close()

        return results

    @staticmethod
    def _parse_wikipedia_search_results( text ):
        result = []
        for line in text.split("\n"):
            # only consider lines containing search results
            if not "class='searchresult'" in line: continue

            (prefix, tmp) = line.split("title=\"", 1)
            (descriptor, suffix ) = tmp.split("\"", 1)

            result.append(descriptor)

        return result
Example #33
class WikiPedia(object):
    """ returns a wikipedia article """

    def __init__(self):
        self.r = Retrieve(WikiPedia.__name__)

    def getDescriptor(self, synonym, lang='en'):
        """ returns the descriptor for the given synonym in the diven language """
        assert(len(lang) == 2)
        try:
            result = self.getWikipediaSearchResults(synonym, lang)
            return result[0]
        except (HTTPError, IndexError):
            return None

    def getWikipediaSearchResults(self, term, lang):
        """ returns a list of wikipedia search results for the given term 
            or None if nothing was found 
        """
        search_query = WIKIPEDIA_SEARCH_QUERY % (lang, quote(term))
        f = self.r.open(search_query)
        results = WikiPedia._parse_wikipedia_search_results(f.read())
        f.close()

        return results

    @staticmethod
    def _parse_wikipedia_search_results(text):
        result = []
        for line in text.split("\n"):
            # only consider lines containing search results
            if not "class='searchresult'" in line: continue

            (prefix, tmp) = line.split("title=\"", 1)
            (descriptor, suffix) = tmp.split("\"", 1)

            result.append(descriptor)

        return result
Example #34
    def analyze(self, text, content_type="text/txt"):
        """ Submits 'text' to OpenCalais for analysis and memorizes the extracted metadata. 
            Set the content-type to 'text/html' if you are submitting HTML data.  
        """
        externalID = self.content_id(text)
        paramsXML = PARAMS_XML % (content_type, self.allow_distro,
                                  self.allow_search, externalID,
                                  self.submitter)
        param = urlencode({
            'licenseID': self.api_key,
            'content': text,
            'paramsXML': paramsXML
        })

        # do not fetch the data again, if a file exists in the cache
        get_calais_data = lambda x: Retrieve(Calais.__name__).open(
            OPENCALAIS_URL, x).read()

        if self.cache is None:
            xml_data = self.unpack(get_calais_data(param))
        else:
            xml_data = self.unpack(self.cache.fetch(get_calais_data, param))

        return self.parse(xml_data)
Example #35
class WeblyzardDictionaries(object):

    def __init__(self, user, password,
                 local_dir=LOCAL_DIR,
                 server_url=SERVER_URL,
                 max_age_hours=MAX_AGE_HOURS):

        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        self.max_file_age = datetime.now() - timedelta(hours=max_age_hours)
        self.local_dir = local_dir
        self.server_url = server_url
        self.retrieve = Retrieve(__file__)
        self.user = user
        self.password = password

    @staticmethod
    def is_online(server_url):
        '''
        Checks whether the given URL is online.

        :param server_url: \
            the url to check.

        :returns:
            True, if the dictionary server is online/reachable.
        '''
        hostname = urlparse.urlsplit(server_url).netloc
        try:
            gethostbyname(hostname)
            return True
        except gaierror:
            return False

    def get_dictionary(self, dictionary_uri):
        ''' tries to load the dictionary from the file-system. If the function
        cannot find the file or if the file is too old (see MAX_AGE_HOURS), 
        the function will load the dictionary from the server.
        :param dictionary_uri: URI for the dictionary, e.g. people/de/titles/all.txt
        :returns: full file name of the dictionary
        '''

        if dictionary_uri.startswith('/'):
            dictionary_uri = dictionary_uri[1:]

        full_path = os.path.join(self.local_dir, dictionary_uri)

        # skip retrieval, if the server is not available
        if not self.is_online(self.server_url):
            return full_path

        fetch_file = True

        if os.path.isfile(full_path):
            last_mod = datetime.fromtimestamp(os.path.getmtime(full_path))

            if last_mod < self.max_file_age:
                last_mod_server = self.get_last_mod_date(dictionary_uri)

                if last_mod_server < last_mod:
                    fetch_file = False
            else:
                fetch_file = False

        if fetch_file:
            self.get_from_server(dictionary_uri, full_path)

        return full_path

    def get_last_mod_date(self, dictionary_uri):
        ''' Requests the URL with a HEAD request to retrieve the last_modified 
        date of the file
        :param dictionary_uri: URI for the dictionary, e.g. people/de/titles/all.txt
        '''

        full_url = urlparse.urljoin(self.server_url, dictionary_uri)
        response = self.retrieve.open(full_url,
                                      user=self.user,
                                      pwd=self.password,
                                      accept_gzip=False,
                                      head_only=True)
        last_modified = response.headers.get('Last-Modified')

        if last_modified:
            return datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S %Z')

    def get_from_server(self, dictionary_uri, target_path):
        ''' Fetches a dictionary from the server and stores it on the local FS.
        :param dictionary_uri: URI for the dictionary, e.g. people/de/titles/all.txt
        :param target_path: destination on local FS to store the file
        :returns: target_path if the file was saved
        '''

        full_url = urlparse.urljoin(self.server_url, dictionary_uri)
        response = self.retrieve.open(full_url,
                                      user=self.user,
                                      pwd=self.password)

        if response:
            target_directory = os.path.dirname(target_path)

            if not os.path.exists(target_directory):
                os.makedirs(target_directory)

            with open(target_path, 'w') as f:
                f.write(response.read())

            return target_path
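A hypothetical usage sketch (credentials are placeholders; the dictionary URI follows the format shown in the docstrings above):

dictionaries = WeblyzardDictionaries('alice', 's3cret')
local_path = dictionaries.get_dictionary('people/de/titles/all.txt')
print(local_path)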
Example #36
class AmazonWS(object):
    """ This class provides low level amazon web service access """

    def __init__(self, location='us', key=None):
        """ init """
        assert (location in AMAZON_LOCATIONS)
        self.retrieve = Retrieve(self.__class__.__name__)
        self.wsBase = AMAZON_LOCATIONS[location]
        self.accessKey = key or AMAZON_ACCESS_KEY
        self.amazon_url = AmazonUrl()

    def generateWsUrl(self, arguments):
        """ generates a valid amazon webservice request url """
        argList = ["%s&SubscriptionId=%s" % (
            self.wsBase, self.accessKey)] + ["%s=%s" % (k, quote(v)) for k, v in arguments.items()]
        return "&".join(argList)

    def generateSignedWsUrl(self, **arguments):
        """ generates a valid amazon webservice request url """
        #argList = [ "%s&SubscriptionId=%s" % (self.wsBase, self.accessKey) ] + [ "%s=%s" % (k,quote(v)) for k,v in arguments.items() ]
        # return "&".join(argList)
        return self.amazon_url.get_request_url(arguments)

    def query(self, arguments):
        """ retrieves a result from amazon webservice """
        url = self.generateWsUrl(arguments)

        done = False
        while not done:
            try:
                f = self.retrieve.open(url)
                res = f.read()
                self._write_debug_data(res)
                f.close()
                done = True
            except ValueError:
                logging.warning(
                    "Exception webservice query - waiting for %d seconds...\n" % ERROR_SLEEP_TIME)
                time.sleep(ERROR_SLEEP_TIME)
        return res

    @staticmethod
    def _write_debug_data(data):
        """ writes the given data to the debug file, if specified """
        if not AMAZON_DEBUG_FILE:
            return

        d = open(AMAZON_DEBUG_FILE, "a")
        d.write(data)
        d.close()

    def searchItem(self, searchIndex='Books', **param):
        """ searches an item in the amazon product repository """
        arguments = {'Operation': 'ItemSearch',
                     'SearchIndex': searchIndex,
                     'BrowseNode': '1000',
                     'Sort': 'salesrank',
                     'ResponseGroup': 'SalesRank,Small'}

        arguments.update(param)
        return self.query(arguments)

    def queryReview(self, itemId, **param):
        """ queries customers reviews to the selected Item """
        arguments = {'Operation': 'ItemLookup',
                     'ResponseGroup': 'Reviews',
                     'ItemId': itemId}
        arguments.update(param)
        return self.query(arguments)

    def newReleases(self, **param):
        """ returns a list of ASINs of new releases """
        arguments = {'Operation': 'BrowseNodeLookup',
                     'ResponseGroup': 'NewReleases',
                     'Marketplace': 'us'}

        arguments.update(param)
        return self.query(arguments)

    def itemAttributes(self, item_id, **param):
        """ returns all item attributes """
        arguments = {'Operation': 'ItemLookup',
                     'ItemId': item_id,
                     'IdType': 'ASIN',
                     'ResponseGroup': 'ItemAttributes,SalesRank'}
        arguments.update(param)
        return self.query(arguments)
Example #37
class WebOfTrust(object):
    
    def __init__(self, api_key, service_url=SERVICE_URL):
        self.api_key = api_key
        self.service_url = service_url
        self.retrieve = Retrieve('eWRT.ws.wot')
        
    def get_reputation(self, hosts): 
        query={'hosts': self._encode_hosts(hosts),
               'api_key': self.api_key}
        
        urlObj = self.retrieve.open(self.service_url % query)
        
        if not urlObj:
            raise Exception('got no result')
        
        return self._format_result(json.loads(urlObj.read())) 
        
    @classmethod
    def _encode_hosts(cls, hosts):
        ''' 
        >>> WebOfTrust._encode_hosts(['http://wu.ac.at', 'https://wu.ac.at'])
        'wu.ac.at/'
        >>> WebOfTrust._encode_hosts(['wu.ac.at', 'https://modul.ac.at/'])
        'wu.ac.at/modul.ac.at/'
        '''
        if isinstance(hosts, string_types):
            hosts = [hosts]
        
        selected_hosts = []
        
        for host in hosts:
            
            if not host.startswith('http'):
                host = 'http://%s' % host
            netloc = '%s/' % quote(urlparse(host).netloc)
            
            if not netloc in selected_hosts: 
                selected_hosts.append(netloc)
        
        assert len(hosts) <= MAX_HOSTS, 'too many hosts (max: %s)!' % MAX_HOSTS
        return ''.join(selected_hosts)
    
    @classmethod
    def _encode_url(cls, service_url, query):
        ''' encodes the url '''
        return service_url % query

    @classmethod
    def _format_result(cls, data):
        '''
        Formats the result using MAPPING. Each reputation component
        provides a reputation and a confidence value. See the WOT
        Developer API for details.
        '''
        result = {}
        for host, reputation in data.iteritems():
            r = {}
            for attr_name, new_attr_name in MAPPING.iteritems():
                if attr_name in reputation:
                    r[new_attr_name] = reputation[attr_name]
            r['wot_link'] = WOT_LINK % r['target']
            result[host] = r
            
        return result
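A hypothetical usage sketch (the API key is a placeholder; see the doctest in _encode_hosts for how host lists are encoded):

wot = WebOfTrust(api_key='0123456789abcdef0123456789abcdef')
reputation = wot.get_reputation(['wu.ac.at', 'modul.ac.at'])
print(reputation)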
Example #38
 def __init__(self):
     self.r = Retrieve( Yahoo.__name__, sleep_time=0 )
Example #39
 def __init__(self):
     self.r = Retrieve( YahooTermExtractor.__name__ )
Example #40
 def __init__(self, api_key, service_url=SERVICE_URL):
     self.api_key = api_key
     self.service_url = service_url
     self.retrieve = Retrieve('eWRT.ws.wot')
Example #41
 def __init__(self):
     self.r = Retrieve( Yahoo.__name__, sleep_time=0 )
Example #42
class FacebookWS(object):
    """ 
    @class FacebookWS
    class for fetching and storing the data of a user
    requires that the facebook API key and the facebook secret key are
    set in the configuration file. These can be retrieved from facebook.
    """
    FB_OBJECT_TYPES = ['post', 'user', 'page', 'event', 'group', 'path']

    # Expires July 8, 2017
    # (https://developers.facebook.com/docs/apps/changelog)
    GRAPH_API_VERSION = 'v2.3'

    retrieve = Retrieve('facebookWS')
    # added: class properties for storing searchTerm and searchType

    def __init__(self, term=None, objectType='all', since=None, limit=None):
        """ init """
        self.term = term

        if objectType == 'all':
            objectType = 'post'

        self.objectType = objectType

        if since and not isinstance(since, int):
            since = time.mktime(since.timetuple())

        self.since = since
        self.limit = limit

    @classmethod
    def search(cls, term, objectType="all"):
        '''
        searches for the given term 
        @param term: term to search
        @param objectType: objectType to search in (post, user, page, event, group)
        @return: search result
        '''
        args = {}
        result = []

        args['q'] = term

        if objectType in cls.FB_OBJECT_TYPES:
            args['type'] = objectType
            result = cls.makeRequest('search', args)
        elif objectType == 'all':
            # search all object types
            for obj_type in cls.FB_OBJECT_TYPES:
                args['type'] = obj_type
                result.extend(cls.makeRequest('search', args))
        else:
            raise ValueError('Illegal Object type %s' % (objectType))

        return result

    @classmethod
    def makeRequest(cls, path, args={}, maxDoc=None, method='get'):
        '''
        makes a request to the graph API
        @param path: path to query, e.g. feed of user/group/page 122222: 122222/feed
        @return: fetched data
        '''

        if not 'access_token' in args:
            # do not hard-code the access token; fall back to the
            # configured FACEBOOK_ACCESS_KEY
            args['access_token'] = FACEBOOK_ACCESS_KEY

        if method == 'post':
            args['method'] = 'POST'

        url = "https://graph.facebook.com/%s?%s" % (
            path, urllib.parse.urlencode(args))
        result = cls._requestURL(url, maxDoc)
        return result

    def getJsonListStructure(self):
        request = None
        args = {}
        args['q'] = self.term

        if isinstance(args['q'], str):
            args['q'] = args['q'].encode('utf-8')

        if self.since:
            args['since'] = int(self.since)

        if self.limit:
            args['limit'] = self.limit

        if self.objectType == 'path':
            args_string = ''

            if 'q' in args:
                del args['q']

            if len(args):
                args_string = '?%s' % urllib.parse.urlencode(args)

            request = {'method': "GET",
                       "relative_url": '%s/%s%s' % (self.GRAPH_API_VERSION,
                                                    self.term, args_string)}

        elif self.objectType in self.FB_OBJECT_TYPES:
            args['type'] = self.objectType
            request = {'method': "GET",
                       "relative_url": "/search?%s" % urllib.parse.urlencode(args)}

        elif self.objectType == 'all':
            for obj_type in self.FB_OBJECT_TYPES:
                if obj_type == 'path':
                    continue
                args['type'] = obj_type
                request = {'method': "GET",
                           "relative_url": "/search?" + urllib.parse.urlencode(args)}

        return request

    @classmethod
    def _requestURL(cls, url, maxDoc=None, result=None, tried=None):
        '''
        fetches the data for the give URL from the graph API
        @param url: valid graph-api-url
        @return: fetched data 
        '''

        if not result:
            result = []
        if not maxDoc:
            maxDoc = 1000
        if not tried:
            tried = False

        try:

            f = cls.retrieve.open(url)
            fetched = json.loads(f.read())
            tried = True
            logging.debug('processing url %s' % url)

            if isinstance(fetched, dict):

                if 'data' in fetched:
                    if not len(fetched['data']):
                        return result

                    result.extend(fetched['data'])

                    # process paging
                    if len(result) < maxDoc:
                        if 'paging' in fetched and 'previous' in fetched['paging']:
                            result = (cls._requestURL(fetched['paging']['previous'],
                                                      maxDoc, result))
                            print('After processing paging', len(result))

                else:
                    # profiles for example don't contain a data dictionary
                    result.append(fetched)
                    print('After appending fetched', len(result))

        except HTTPError as e:
            print('Error: Bad Request for url %s: %s' % (url, e))
            if not tried:
                result = cls._requestURL(url, maxDoc, result, True)
        except (timeout, URLError) as e:
            print('URLError for url %s: %s' % (url, e))
            if not tried:
                result = cls._requestURL(url, maxDoc, result, True)

        return result
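A hypothetical usage sketch (assumes FACEBOOK_ACCESS_KEY is configured; objectType accepts the values listed in FB_OBJECT_TYPES or 'all'):

posts = FacebookWS.search('open data', objectType='post')
print(len(posts))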
Example #43
 def __init__(self, api_key, api_url=API_URL):
     ''' Constructor      '''
     WebDataSource.__init__(self)
     self.api_key = api_key
     self.api_url = api_url
     self.retrieve = Retrieve('google-plus')
Example #44
class GooglePlus(object):
    '''
    Client for querying the Google+ activities API.
    '''
    def __init__(self, api_key, api_url=API_URL):
        ''' Constructor      '''
        WebDataSource.__init__(self)
        self.api_key = api_key
        self.api_url = api_url
        self.retrieve = Retrieve('google-plus')

    def search(self, search_terms, max_results=DEFAULT_MAX_RESULTS):
        ''' searches Google+ for the given search_terms 
        :param search_terms: search terms
        :type search_terms: list
        :param max_results: maximum number of results
        :type max_results: int
        :returns: generator with the result
        '''
        for search_term in search_terms:
            # no explicit utf-8 encoding here: %-formatting an encoded
            # bytes value would embed a b'...' literal in the query
            params = {
                'query': '"%s"' % search_term,
                'orderBy': DEFAULT_ORDER_BY,
                'maxResults': max_results
            }

            fetched = self.make_request(params, 'activities')

            for item in fetched['items']:
                try:
                    yield self.convert_item(item)
                except Exception as e:
                    logger.info('Error %s occurred' % e)
                    continue

    def get_activity(self, activity_id):
        ''' returns the activity with the given ID
        :param activity_id: GooglePlus activity ID
        :type activity_id: string
        :returns: mapped result
        :rtype: dict
        '''
        item = self.make_request(path='activities/%s' % activity_id)
        return self.convert_item(item)

    def make_request(self, params=None, path='activities'):
        ''' executes the request to GooglePlus
        :param params: parameters for the query
        :type params: list or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus result
        :rtype: dict
        '''
        url = self.get_request_url(params, path)
        data = self.retrieve.open(url)
        return json.load(data)

    def get_request_url(self, params=None, path='activities'):
        ''' returns a correctly parsed request URL 
        :param params: parameters for the query
        :type params: list or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus request URL
        :rtype: str

        Usage: 
            >>> plus = GooglePlus('abcd')
            >>> plus.get_request_url()
            'https://www.googleapis.com/plus/v1/activities?key=abcd'
        '''
        params = params if params else {}

        if not 'key' in params:
            params['key'] = self.api_key

        if 'maxResults' in params and params[
                'maxResults'] > DEFAULT_MAX_RESULTS:
            params['maxResults'] = DEFAULT_MAX_RESULTS

        return self.api_url.format(path=path, query=urlencode(params))

    @classmethod
    def convert_item(cls, item):
        ''' applies a mapping to convert the result to the required format
        :param item: GooglePlus Activity
        :type item: dict
        :rtype: dict
        '''

        last_modified = datetime.strptime(item['updated'],
                                          '%Y-%m-%dT%H:%M:%S.%fZ')
        published = datetime.strptime(item['updated'], '%Y-%m-%dT%H:%M:%S.%fZ')

        content = cls.convert_content(item['object']['content'])

        if not item['verb'] == 'post':
            raise Exception('Skipping activity of type "%s"' % item['verb'])

        if not len(content):
            logger.info('Skipping "%s" -> content is empty' % item['title'])
            raise Exception('content is empty')

        if 'attachments' in item['object']:
            for attachment in item['object']['attachments']:
                if attachment['objectType'] == 'article':
                    if not 'content' in attachment:
                        raise Exception('no content in attachment')

                    content = '%s\n"%s" (%s)' % (
                        content, cls.convert_content(
                            attachment['content']), attachment['url'])

        activity = {
            'content': content,
            'title': item['actor']['displayName'],
            'url': item['url'],
            'last_modified': last_modified,
            'user_id': item['actor']['id'],
            'user_img_url': item['actor']['image']['url'],
            'screen_name': item['actor']['displayName'],
            'encoding': u'utf-8',
            'user_url': item['actor']['url'],
            'valid_from': published,
            'reshares': item['object']['resharers']['totalItems'],
            'plusoners': item['object']['plusoners']['totalItems'],
            'activity_id': item['id'],
        }

        if 'geocode' in item:
            activity['geocode'] = item['geocode']

        return activity
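A hypothetical usage sketch (the API key is a placeholder; search yields converted activities and skips items whose conversion raises):

plus = GooglePlus('abcd')
for activity in plus.search(['semantic web'], max_results=10):
    print(activity['title'], activity['url'])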
Example #45
class AmazonWS(object):
    """ This class provides low level amazon web service access """
    def __init__(self, location='us', key=None):
        """ init """
        assert (location in AMAZON_LOCATIONS)
        self.retrieve = Retrieve(self.__class__.__name__)
        self.wsBase = AMAZON_LOCATIONS[location]
        self.accessKey = key or AMAZON_ACCESS_KEY
        self.amazon_url = AmazonUrl()

    def generateWsUrl(self, arguments):
        """ generates a valid amazon webservice request url """
        argList = ["%s&SubscriptionId=%s" % (self.wsBase, self.accessKey)] + [
            "%s=%s" % (k, quote(v)) for k, v in list(arguments.items())
        ]
        return "&".join(argList)

    def generateSignedWsUrl(self, **arguments):
        """ generates a valid amazon webservice request url """
        #argList = [ "%s&SubscriptionId=%s" % (self.wsBase, self.accessKey) ] + [ "%s=%s" % (k,quote(v)) for k,v in arguments.items() ]
        # return "&".join(argList)
        return self.amazon_url.get_request_url(arguments)

    def query(self, arguments):
        """ retrieves a result from amazon webservice """
        url = self.generateWsUrl(arguments)

        done = False
        while not done:
            try:
                f = self.retrieve.open(url)
                res = f.read()
                self._write_debug_data(res)
                f.close()
                done = True
            except ValueError:
                logging.warning(
                    "Exception webservice query - waiting for %d seconds...\n"
                    % ERROR_SLEEP_TIME)
                time.sleep(ERROR_SLEEP_TIME)
        return res

    @staticmethod
    def _write_debug_data(data):
        """ writes the given data to the debug file, if specified """
        if not AMAZON_DEBUG_FILE:
            return

        d = open(AMAZON_DEBUG_FILE, "a")
        d.write(data)
        d.close()

    def searchItem(self, searchIndex='Books', **param):
        """ searches an item in the amazon product repository """
        arguments = {
            'Operation': 'ItemSearch',
            'SearchIndex': searchIndex,
            'BrowseNode': '1000',
            'Sort': 'salesrank',
            'ResponseGroup': 'SalesRank,Small'
        }

        arguments.update(param)
        return self.query(arguments)

    def queryReview(self, itemId, **param):
        """ queries customers reviews to the selected Item """
        arguments = {
            'Operation': 'ItemLookup',
            'ResponseGroup': 'Reviews',
            'ItemId': itemId
        }
        arguments.update(param)
        return self.query(arguments)

    def newReleases(self, **param):
        """ returns a list of ASINs of new releases """
        arguments = {
            'Operation': 'BrowseNodeLookup',
            'ResponseGroup': 'NewReleases',
            'Marketplace': 'us'
        }

        arguments.update(param)
        return self.query(arguments)

    def itemAttributes(self, item_id, **param):
        """ returns all item attributes """
        arguments = {
            'Operation': 'ItemLookup',
            'ItemId': item_id,
            'IdType': 'ASIN',
            'ResponseGroup': 'ItemAttributes,SalesRank'
        }
        arguments.update(param)
        return self.query(arguments)
Example #46
 def __init__(self, api_key, api_url=API_URL):
     ''' Constructor      '''
     WebDataSource.__init__(self)
     self.api_key = api_key
     self.api_url = api_url
     self.retrieve = Retrieve('google-plus')
Example #47
class GooglePlus(object):
    '''
    Client for querying the Google+ activities API.
    '''

    def __init__(self, api_key, api_url=API_URL):
        ''' Constructor      '''
        WebDataSource.__init__(self)
        self.api_key = api_key
        self.api_url = api_url
        self.retrieve = Retrieve('google-plus')

    def search(self, search_terms, max_results=DEFAULT_MAX_RESULTS):
        ''' searches Google+ for the given search_terms 
        :param search_terms: search terms
        :type search_terms: list
        :param max_results: maximum number of results
        :type max_results: int
        :returns: generator with the result
        '''
        for search_term in search_terms:
            if isinstance(search_term, unicode):
                search_term = search_term.encode('utf-8')
            params = {'query': '"%s"' % search_term,
                      'orderBy': DEFAULT_ORDER_BY,
                      'maxResults': max_results}

            fetched = self.make_request(params, 'activities')

            for item in fetched['items']:
                try:
                    yield self.convert_item(item)
                except Exception as e:
                    logger.info('Error %s occurred' % e)
                    continue

    def get_activity(self, activity_id):
        ''' returns the activity with the given ID
        :param activity_id: GooglePlus activity ID
        :type activity_id: string
        :returns: mapped result
        :rtype: dict
        '''
        item = self.make_request(path='activities/%s' % activity_id)
        return self.convert_item(item)

    def make_request(self, params=None, path='activities'):
        ''' executes the request to GooglePlus
        :param params: parameters for the query
        :type params: list or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus result
        :rtype: dict
        '''
        url = self.get_request_url(params, path)
        data = self.retrieve.open(url)
        return json.load(data)

    def get_request_url(self, params=None, path='activities'):
        ''' returns a correctly parsed request URL 
        :param params: parameters for the query
        :type params: list or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus request URL
        :rtype: str

        Usage: 
            >>> plus = GooglePlus('abcd')
            >>> plus.get_request_url()
            'https://www.googleapis.com/plus/v1/activities?key=abcd'
        '''
        params = params if params else {}

        if not 'key' in params:
            params['key'] = self.api_key

        if 'maxResults' in params and params['maxResults'] > DEFAULT_MAX_RESULTS:
            params['maxResults'] = DEFAULT_MAX_RESULTS

        return self.api_url.format(path=path, query=urlencode(params))

    @classmethod
    def convert_item(cls, item):
        ''' applies a mapping to convert the result to the required format
        :param item: GooglePlus Activity
        :type item: dict
        :rtype: dict
        '''

        last_modified = datetime.strptime(item['updated'],
                                          '%Y-%m-%dT%H:%M:%S.%fZ')
        published = datetime.strptime(item['updated'],
                                      '%Y-%m-%dT%H:%M:%S.%fZ')

        content = cls.convert_content(item['object']['content'])

        if not item['verb'] == 'post':
            raise Exception('Skipping activity of type "%s"' % item['verb'])

        if not len(content):
            logger.info('Skipping "%s" -> content is empty' % item['title'])
            raise Exception('content is empty')

        if 'attachments' in item['object']:
            for attachment in item['object']['attachments']:
                if attachment['objectType'] == 'article':
                    if not 'content' in attachment:
                        raise Exception('no content in attachment')

                    content = '%s\n"%s" (%s)' % (content,
                                                 cls.convert_content(
                                                     attachment['content']),
                                                 attachment['url'])

        activity = {'content': content,
                    'title': item['actor']['displayName'],
                    'url': item['url'],
                    'last_modified': last_modified,
                    'user_id': item['actor']['id'],
                    'user_img_url': item['actor']['image']['url'],
                    'screen_name': item['actor']['displayName'],
                    'encoding': u'utf-8',
                    'user_url': item['actor']['url'],
                    'valid_from': published,
                    'reshares': item['object']['resharers']['totalItems'],
                    'plusoners': item['object']['plusoners']['totalItems'],
                    'activity_id': item['id'],
                    }

        if 'geocode' in item:
            activity['geocode'] = item['geocode']

        return activity
Example #48
 def __init__(self):
     self.r = Retrieve( WikiPedia.__name__ )
Example #49
 def __init__(self):
     self.r = Retrieve(WikiPedia.__name__)