def __init__(self, location='us', key=None):
    """ init """
    assert (location in AMAZON_LOCATIONS)
    self.retrieve = Retrieve(self.__class__.__name__)
    self.wsBase = AMAZON_LOCATIONS[location]
    self.accessKey = key or AMAZON_ACCESS_KEY
    self.amazon_url = AmazonUrl()
def getRelatedTags(tags):
    """ fetches the related tags for the given tags
    @param tags: a list of tags
    @return: dictionary of related tags with their count
    """
    if type(tags).__name__ == 'str':
        url = Twitter.TWITTER_SEARCH_URL % tags
    else:
        url = Twitter.TWITTER_SEARCH_URL % "+".join(tags)

    f = Retrieve(Twitter.__name__).open(url)

    # convert the json response into a dict, replacing null values with ""
    search_results = eval(re.sub('null', '""', f.read()))

    found_tags = []
    related_tags = {}
    for result in search_results['results']:
        found_tags.extend(Twitter.RE_FIND_TAGS.findall(result['text']))

    for tag in found_tags:
        related_tags[tag.lower()] = related_tags.get(tag.lower(), 0) + 1

    # todo: sort
    return related_tags
def get_new_access_token(client_id=FACEBOOK_APPLICATION_ID,
                         client_secret=FACEBOOK_SECRET_KEY,
                         access_token=FACEBOOK_ACCESS_KEY):
    ''' exchanges the given Facebook access token for a new one and returns
    it (the original token is returned if it is still valid). '''
    url = API_URL.format(client_id=client_id,
                         client_secret=client_secret,
                         access_token=access_token)
    retrieve = Retrieve('fb')
    x = retrieve.open(url)
    result = x.read()

    new_access_token = access_token

    for key, param in urlparse.parse_qs(result).iteritems():
        print key, param
        if key == 'access_token':
            if isinstance(param, list):
                param = param[0]
            if param == access_token:
                print 'access token still the same'
            else:
                print 'got new access_token %s' % param
                new_access_token = param

    return new_access_token
def get_content(url):
    """ returns the content from Flickr """
    assert(url.startswith("http"))
    f = Retrieve(Flickr.__name__).open(url)
    content = f.read()
    f.close()
    return content
def _get_content(url):
    """ returns the content from delicious """
    assert(url.startswith("http"))
    f = Retrieve(Delicious.__name__).open(url)
    content = f.read()
    f.close()
    sleep(1)
    return content
def testRetrieval(self):
    ''' tries to retrieve the URLs listed in TEST_URLS '''
    r_handler = Retrieve(self.__class__.__name__)
    for url in self.TEST_URLS:
        print(url)
        r = r_handler.open(url)
        r.read()
        r.close()
def testRetrievalTimeout(self):
    ''' tests whether the socket timeout is honored by our class '''
    SLOW_URL = "http://www.csse.uwa.edu.au/"

    with raises(urllib2.URLError):
        r = Retrieve(self.__class__.__name__,
                     default_timeout=0.1).open(SLOW_URL)
        content = r.read()
        r.close()
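# A minimal usage sketch for the Retrieve helper exercised by the tests
# above; the import path (eWRT.access.http) and the target URL are
# assumptions for illustration only.
from eWRT.access.http import Retrieve

with Retrieve(__name__, default_timeout=10) as r:
    f = r.open('http://example.com/')   # returns a file-like object
    content = f.read()
    f.close()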
def __init__(self, user, password, local_dir=LOCAL_DIR,
             server_url=SERVER_URL, max_age_hours=MAX_AGE_HOURS):
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)
    self.max_file_age = datetime.now() - timedelta(hours=max_age_hours)
    self.local_dir = local_dir
    self.server_url = server_url
    self.retrieve = Retrieve(__file__)
    self.user = user
    self.password = password
def t_retrieve(url):
    ''' retrieves the given url from the web

    @remarks
    helper module for the testMultiProcessing unit test.
    '''
    r = Retrieve(__name__).open(url)
    try:
        content = r.read()
    finally:
        # this is required as GzipFile does not support the context protocol
        # in python 2.6
        r.close()
    return content
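# Hypothetical sketch of the testMultiProcessing test referred to in the
# docstring above: it maps t_retrieve over a small process pool. The URLs
# are placeholders.
from multiprocessing import Pool

def test_multi_processing():
    urls = ['http://example.com/', 'http://example.org/']
    p = Pool(2)
    contents = p.map(t_retrieve, urls)
    p.close()
    p.join()
    assert len(contents) == len(urls)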
def fix_urls(cls, urls, user=None, password=None):
    ''' fixes the urls and puts them into the correct format, to maintain
    compatibility with the rest of the platform

    :param urls: service urls
    :type urls: string or list or tuple
    :param user: username
    :param password: password
    :returns: correctly formatted urls
    :rtype: list
    '''
    correct_urls = []

    if isinstance(urls, string_types):
        urls = [urls]

    for url in urls:
        if not url.endswith('/'):
            url = '%s/' % url

        if not 'rest' in url:
            if cls.URL_PATH and not url.endswith(cls.URL_PATH):
                if cls.URL_PATH.startswith('/'):
                    cls.URL_PATH = cls.URL_PATH[1:]
                url = '%s%s' % (url, cls.URL_PATH)

        if user and password:
            url = Retrieve.add_user_password(url, user, password)

        correct_urls.append(url)

    return correct_urls
def __init__(self, service_url, user=None, password=None,
             authentification_method='basic', module_name='eWRT.REST',
             default_timeout=WS_DEFAULT_TIMEOUT):
    '''
    :param service_url: the base url of the web service
    :param module_name: the module name to add to the USER AGENT
                        description (optional)
    :param user: username
    :param password: password
    :param authentification_method: authentication method to use
                                    ('basic'*, 'digest')
    '''
    # remove superfluous slashes, if required
    self.service_url = service_url[:-1] if service_url.endswith("/") \
        else service_url
    self.user = user
    self.password = password

    if not default_timeout:
        default_timeout = WS_DEFAULT_TIMEOUT

    url_obj = Retrieve(module_name, sleep_time=0,
                       default_timeout=default_timeout)
    self.retrieve = partial(
        url_obj.open, user=user, pwd=password,
        authentification_method=authentification_method)
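# Hedged usage sketch for the RESTClient constructor above; host, path and
# credentials are placeholders. The trailing slash is stripped and the
# credentials are bound into self.retrieve via functools.partial.
client = RESTClient('http://localhost:8080/rest/',
                    user='admin', password='secret',
                    default_timeout=30)
response = client.retrieve('http://localhost:8080/rest/version')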
def retrieve_conceptnet_query_result(query):
    '''
    :param query: the conceptnet query url to retrieve
    :return: the json response to the given conceptnet query
    '''
    with Retrieve(__name__) as r:
        c = r.open(query, retry=3)
        return c.read()
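# Illustrative call of the helper above; the ConceptNet query URL is an
# assumption, any valid ConceptNet API URL can be passed.
import json

edges = json.loads(
    retrieve_conceptnet_query_result('http://api.conceptnet.io/c/en/retrieval'))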
def get_content(url):
    """ returns the content from Technorati """
    assert(url.startswith("http"))
    logger.debug('Fetching content for URL %s' % url)

    if (time.time() - Technorati.last_access) < SLEEP_TIME:
        logger.debug('Sleeping %s seconds!' % SLEEP_TIME)
        time.sleep(SLEEP_TIME)

    Technorati.last_access = time.time()

    f = Retrieve("%s_new" % Technorati.__name__).open(url)
    content = f.read()
    f.close()
    return content
def get_content(url, sleep_time=SLEEP_TIME):
    ''' fetches the content
    @param url: url to fetch
    @param sleep_time: time to sleep
    @return: HTML string
    '''
    assert (url.startswith("http"))
    return Retrieve("GoogleBlogSearch", sleep_time=sleep_time).open(url).read()
class WikiPedia(object):
    """ returns a WikiPedia object """

    def __init__(self):
        self.r = Retrieve(WikiPedia.__name__)

    def getWikiPage(self, pageName, lang='en'):
        """ returns the given wikipedia page considering different spellings
        @param[in] pageName
        @param[in] lang (determines which wikipedia to query)
        @returns the page's wikipedia text
        """
        assert (len(lang) == 2)
        for pn in self._getPageNameAlterations(pageName):
            pageContent = self._retrievePage(pn, lang)
            if pageContent:
                return pageContent
        return None

    @staticmethod
    def _getPageNameAlterations(pageName):
        """ @returns a list of different names for the given page """
        alt = [pageName, ]
        if not ' ' in pageName:
            # single-word page names have no alterations
            return alt

        words = pageName.split(" ")
        alt.append("%s %s" % (words[0].capitalize(),
                              " ".join(map(str.lower, words[1:]))))
        return alt

    def _retrievePage(self, pageName, lang):
        """ retrieves the given Wiki page
        @param[in] pageName
        @param[in] lang (determines which wikipedia to query)
        @returns the page's wikipedia text
        """
        param = urlencode({'action': 'query',
                           'format': 'json',
                           'export': '',
                           'redirects': 'true',
                           'titles': pageName})
        data = self.r.open(WIKIPEDIA_API_QUERY % lang, param).read()
        jsonData = eval(data)['query']
        if '-1' in jsonData['pages']:
            return None

        xmlData = jsonData['export']['*'].replace("\/", "/")
        return parseString(xmlData).getElementsByTagName(
            'text')[0].firstChild.data
def testGettingUserPassword(self):
    urls = (('http://irgendwas.com', None, None),
            ('http://*****:*****@irgendwas.com', 'heinz', 'secret'))

    for test_url, exp_user, exp_passwd in urls:
        print('testing url ' + test_url)
        url, user, passwd = Retrieve.get_user_password(test_url)
        assert user == exp_user
        assert passwd == exp_passwd
        if user:
            assert url != test_url
def parse(url, last_modified=None):
    """ Parses the given RSS feed and returns all articles and the content
    of the page referenced in the <link> tag.

    @param url: the url of the rss feed
    @param last_modified: a datetime object specifying when the feed was
                          queried the last time (only newer entries are
                          returned).
    """
    feed = feedparser.parse(url, modified=last_modified)
    retrieve = Retrieve("rss", HTTP_FETCH_DELAY)

    result = []
    for item in feed['items']:
        if datetime.fromtimestamp(
                mktime(item['updated_parsed'])) > last_modified:
            item['content'] = retrieve.open(item['link']).read()
            result.append(item)
    return result
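# Illustrative call of parse() above; the feed URL is a placeholder and
# last_modified must be a datetime, since entry timestamps are compared
# against it.
from datetime import datetime, timedelta

articles = parse('http://example.com/news.rss',
                 last_modified=datetime.now() - timedelta(days=1))
for article in articles:
    print(article['link'], len(article['content']))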
def getNeighbors(geo_entity):
    """ returns all neighbours for the given geo id (currently only
    implemented on a country level)
    @param[in] geo_entity
    @returns a list containing the neighbours of the given country
    """
    url = GeoNames.NEIGHBOURS_SERVICE_URL % geo_entity.id
    jsonData = eval(Retrieve('eWRT.ws.geonames').open(url, retry=5).read())

    if 'geonames' in jsonData:
        return filter(None,
                      [GeoNames.getGeoEntity(GeoEntity.factory(id=e['geonameId']))
                       for e in jsonData['geonames']])
    else:
        return []
def _connect_clients(cls, service_urls, user=None, password=None,
                     default_timeout=WS_DEFAULT_TIMEOUT):
    clients = []
    if isinstance(service_urls, str):
        service_urls = [service_urls]

    for url in service_urls:
        service_url, user, password = Retrieve.get_user_password(url)
        clients.append(RESTClient(service_url=service_url,
                                  user=user,
                                  password=password,
                                  default_timeout=default_timeout))
    return clients
def _connect_clients(cls, service_urls, user=None, password=None,
                     default_timeout=WS_DEFAULT_TIMEOUT):
    clients = []
    if isinstance(service_urls, str):
        service_urls = [service_urls]

    for url in service_urls:
        service_url, new_user, new_password = Retrieve.get_user_password(url)
        clients.append(RESTClient(service_url=service_url,
                                  user=user or new_user,
                                  password=password or new_password,
                                  default_timeout=default_timeout))
    return clients
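# Hedged sketch of how _connect_clients handles URLs with embedded
# credentials; the enclosing class is assumed to be the MultiRESTClient
# helper, and host names and credentials are placeholders.
clients = MultiRESTClient._connect_clients(
    ['http://heinz:secret@server1.example.com/rest/',
     'http://server2.example.com/rest/'],
    user='fallback_user', password='fallback_password')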
class Yahoo(TagInfoService):
    """ interfaces with yahoo's search service
        * Search: Yahoo! BOSS (see http://developer.yahoo.com/search/boss)
    """
    __slots__ = ('r', )

    def __init__(self):
        self.r = Retrieve(Yahoo.__name__, sleep_time=0)

    def query(self, terms, count=0, queryParams={}):
        """ returns search results for the given terms
        @param[in] terms       ... a list of search terms
        @param[in] count       ... number of results to return (0 if we are
                                   interested in the search meta data only)
        @param[in] queryParams ... a dictionary of query parameters to add
                                   to the request
        @returns the search results
        """
        assert (isinstance(terms, tuple) or isinstance(terms, list))
        queryParams.update({'appid': YAHOO_APP_ID,
                            'count': count,
                            'format': 'json'})
        params = urlencode(queryParams)
        url = YAHOO_SEARCH_URL % "%2B".join(map(quote, terms)) + "?" + params
        print(url)
        try:
            result = eval(self.r.open(url).read().replace("\\/", "/"))
            return result['ysearchresponse']
        except (timeout, URLError):
            return ""

    @staticmethod
    def getSearchResults(query_result):
        """ returns a list of all search results returned by the given
        query result.
        @param[in] query_result Result of the query
        """
        return [YahooSearchResult(r) for r in query_result['resultset_web']] \
            if 'resultset_web' in query_result else []

    def getTagInfo(self, tag):
        """ @Override """
        return int(self.query(tag)['totalhits'])
class Yahoo(TagInfoService):
    """ interfaces with yahoo's search service
        * Search: Yahoo! BOSS (see http://developer.yahoo.com/search/boss)
    """
    __slots__ = ('r', )

    def __init__(self):
        self.r = Retrieve(Yahoo.__name__, sleep_time=0)

    def query(self, terms, count=0, queryParams={}):
        """ returns search results for the given terms
        @param[in] terms       ... a list of search terms
        @param[in] count       ... number of results to return (0 if we are
                                   interested in the search meta data only)
        @param[in] queryParams ... a dictionary of query parameters to add
                                   to the request
        @returns the search results
        """
        assert (isinstance(terms, tuple) or isinstance(terms, list))
        queryParams.update({'appid': YAHOO_APP_ID,
                            'count': count,
                            'format': 'json'})
        params = urlencode(queryParams)
        url = YAHOO_SEARCH_URL % "%2B".join(map(quote, terms)) + "?" + params
        print url
        try:
            result = eval(self.r.open(url).read().replace("\\/", "/"))
            return result['ysearchresponse']
        except URLError:
            return ""

    @staticmethod
    def getSearchResults(query_result):
        """ returns a list of all search results returned by the given
        query result.
        @param[in] query_result Result of the query
        """
        return [YahooSearchResult(r) for r in query_result['resultset_web']] \
            if 'resultset_web' in query_result else []

    def getTagInfo(self, tag):
        """ @Override """
        return int(self.query(tag)['totalhits'])
class YahooTermExtractor(object):
    """ interfaces with yahoo's search service
        * Term extraction: extract terms from yahoo search
          http://developer.yahoo.com/search/content/V1/termExtraction.html
    """
    __slots__ = ('r', )

    def __init__(self):
        self.r = Retrieve(YahooTermExtractor.__name__)

    def extractTerms(self, content):
        """ extract terms from yahoo search, see
            http://developer.yahoo.com/search/content/V1/termExtraction.html
        """
        params = urlencode({'appid': YAHOO_APP_ID,
                            'context': content,
                            'output': 'json'})
        result = eval(self.r.open(YAHOO_TERM_EXTRACTION_URI, params).read())
        return result['ResultSet']['Result']
class WikiPedia(object):
    """ returns a wikipedia article """

    def __init__(self):
        self.r = Retrieve(WikiPedia.__name__)

    def getDescriptor(self, synonym, lang='en'):
        """ returns the descriptor for the given synonym in the given
        language """
        assert (len(lang) == 2)
        try:
            result = self.getWikipediaSearchResults(synonym, lang)
            return result[0]
        except (HTTPError, IndexError):
            return None

    def getWikipediaSearchResults(self, term, lang):
        """ returns a list of wikipedia search results for the given term
        or None if nothing was found """
        search_query = WIKIPEDIA_SEARCH_QUERY % (lang, quote(term))
        f = self.r.open(search_query)
        results = WikiPedia._parse_wikipedia_search_results(f.read())
        f.close()
        return results

    @staticmethod
    def _parse_wikipedia_search_results(text):
        result = []
        for line in text.split("\n"):
            # only consider lines containing search results
            if not "class='searchresult'" in line:
                continue
            (prefix, tmp) = line.split("title=\"", 1)
            (descriptor, suffix) = tmp.split("\"", 1)
            result.append(descriptor)
        return result
def analyze(self, text, content_type="text/txt"):
    """ Submits 'text' to OpenCalais for analysis and memorizes the
    extracted metadata. Set the content-type to 'text/html' if you are
    submitting HTML data.
    """
    externalID = self.content_id(text)
    paramsXML = PARAMS_XML % (content_type, self.allow_distro,
                              self.allow_search, externalID, self.submitter)
    param = urlencode({'licenseID': self.api_key,
                       'content': text,
                       'paramsXML': paramsXML})

    # do not fetch the data again, if a file exists in the cache
    get_calais_data = lambda x: Retrieve(Calais.__name__).open(
        OPENCALAIS_URL, x).read()
    if self.cache is None:
        xml_data = self.unpack(get_calais_data(param))
    else:
        xml_data = self.unpack(self.cache.fetch(get_calais_data, param))

    return self.parse(xml_data)
class WeblyzardDictionaries(object):

    def __init__(self, user, password, local_dir=LOCAL_DIR,
                 server_url=SERVER_URL, max_age_hours=MAX_AGE_HOURS):
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        self.max_file_age = datetime.now() - timedelta(hours=max_age_hours)
        self.local_dir = local_dir
        self.server_url = server_url
        self.retrieve = Retrieve(__file__)
        self.user = user
        self.password = password

    @staticmethod
    def is_online(server_url):
        ''' Checks whether the given url is online.

        :param server_url: the url to check
        :returns: True, if the dictionary server is online/reachable
        '''
        hostname = urlparse.urlsplit(server_url).netloc
        try:
            gethostbyname(hostname)
            return True
        except gaierror:
            return False

    def get_dictionary(self, dictionary_uri):
        ''' Tries to load the dictionary from the file system. If the
        function cannot find the file or if the file is too old (see
        MAX_AGE_HOURS), the function will load the dictionary from the
        server.

        :param dictionary_uri: URI for the dictionary,
                               e.g. people/de/titles/all.txt
        :returns: full file name of the dictionary
        '''
        if dictionary_uri.startswith('/'):
            dictionary_uri = dictionary_uri[1:]

        full_path = os.path.join(self.local_dir, dictionary_uri)

        # skip retrieval, if the server is not available
        if not self.is_online(SERVER_URL):
            return full_path

        fetch_file = True

        if os.path.isfile(full_path):
            last_mod = datetime.fromtimestamp(os.path.getmtime(full_path))
            if last_mod < self.max_file_age:
                last_mod_server = self.get_last_mod_date(dictionary_uri)
                if last_mod_server < last_mod:
                    fetch_file = False
            else:
                fetch_file = False

        if fetch_file:
            self.get_from_server(dictionary_uri, full_path)

        return full_path

    def get_last_mod_date(self, dictionary_uri):
        ''' Requests the URL with a HEAD request to retrieve the
        Last-Modified date of the file.

        :param dictionary_uri: URI for the dictionary,
                               e.g. people/de/titles/all.txt
        '''
        full_url = urlparse.urljoin(self.server_url, dictionary_uri)
        response = self.retrieve.open(full_url, user=self.user,
                                      pwd=self.password, accept_gzip=False,
                                      head_only=True)
        last_modified = response.headers.get('Last-Modified')
        if last_modified:
            return datetime.strptime(last_modified,
                                     '%a, %d %b %Y %H:%M:%S %Z')

    def get_from_server(self, dictionary_uri, target_path):
        ''' Fetches a dictionary from the server and stores it on the
        local file system.

        :param dictionary_uri: URI for the dictionary,
                               e.g. people/de/titles/all.txt
        :param target_path: destination on the local file system to store
                            the file
        :returns: target_path if the file was saved
        '''
        full_url = urlparse.urljoin(self.server_url, dictionary_uri)
        response = self.retrieve.open(full_url, user=self.user,
                                      pwd=self.password)
        if response:
            target_directory = os.path.dirname(target_path)
            if not os.path.exists(target_directory):
                os.makedirs(target_directory)
            with open(target_path, 'w') as f:
                f.write(response.read())
            return target_path
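# Hedged usage sketch for the dictionary cache above; user and password are
# placeholders, the dictionary URI follows the example in the docstrings.
dictionaries = WeblyzardDictionaries(user='dict-user', password='dict-pass')
local_file = dictionaries.get_dictionary('people/de/titles/all.txt')
with open(local_file) as f:
    titles = [line.strip() for line in f]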
class AmazonWS(object):
    """ This class provides low level amazon web service access """

    def __init__(self, location='us', key=None):
        """ init """
        assert (location in AMAZON_LOCATIONS)
        self.retrieve = Retrieve(self.__class__.__name__)
        self.wsBase = AMAZON_LOCATIONS[location]
        self.accessKey = key or AMAZON_ACCESS_KEY
        self.amazon_url = AmazonUrl()

    def generateWsUrl(self, arguments):
        """ generates a valid amazon webservice request url """
        argList = ["%s&SubscriptionId=%s" % (self.wsBase, self.accessKey)] + \
                  ["%s=%s" % (k, quote(v)) for k, v in arguments.items()]
        return "&".join(argList)

    def generateSignedWsUrl(self, **arguments):
        """ generates a valid amazon webservice request url """
        # argList = ["%s&SubscriptionId=%s" % (self.wsBase, self.accessKey)] + \
        #     ["%s=%s" % (k, quote(v)) for k, v in arguments.items()]
        # return "&".join(argList)
        return self.amazon_url.get_request_url(arguments)

    def query(self, arguments):
        """ retrieves a result from amazon webservice """
        url = self.generateWsUrl(arguments)

        done = False
        while not done:
            try:
                f = self.retrieve.open(url)
                res = f.read()
                self._write_debug_data(res)
                f.close()
                done = True
            except ValueError:
                logging.warning(
                    "Exception webservice query - waiting for %d seconds...\n"
                    % ERROR_SLEEP_TIME)
                time.sleep(ERROR_SLEEP_TIME)

        return res

    @staticmethod
    def _write_debug_data(data):
        """ writes the given data to the debug file, if specified """
        if not AMAZON_DEBUG_FILE:
            return
        d = open(AMAZON_DEBUG_FILE, "a")
        d.write(data)
        d.close()

    def searchItem(self, searchIndex='Books', **param):
        """ searches an item in the amazon product repository """
        arguments = {'Operation': 'ItemSearch',
                     'SearchIndex': searchIndex,
                     'BrowseNode': '1000',
                     'Sort': 'salesrank',
                     'ResponseGroup': 'SalesRank,Small'}
        arguments.update(param)
        return self.query(arguments)

    def queryReview(self, itemId, **param):
        """ queries customer reviews for the selected item """
        arguments = {'Operation': 'ItemLookup',
                     'ResponseGroup': 'Reviews',
                     'ItemId': itemId}
        arguments.update(param)
        return self.query(arguments)

    def newReleases(self, **param):
        """ returns a list of ASINs of new releases """
        arguments = {'Operation': 'BrowseNodeLookup',
                     'ResponseGroup': 'NewReleases',
                     'Marketplace': 'us'}
        arguments.update(param)
        return self.query(arguments)

    def itemAttributes(self, item_id, **param):
        """ returns all item attributes """
        arguments = {'Operation': 'ItemLookup',
                     'ItemId': item_id,
                     'IdType': 'ASIN',
                     'ResponseGroup': 'ItemAttributes,SalesRank'}
        arguments.update(param)
        return self.query(arguments)
class WebOfTrust(object):

    def __init__(self, api_key, service_url=SERVICE_URL):
        self.api_key = api_key
        self.service_url = service_url
        self.retrieve = Retrieve('eWRT.ws.wot')

    def get_reputation(self, hosts):
        query = {'hosts': self._encode_hosts(hosts), 'api_key': self.api_key}
        urlObj = self.retrieve.open(self.service_url % query)

        if not urlObj:
            raise Exception('got no result')

        return self._format_result(json.loads(urlObj.read()))

    @classmethod
    def _encode_hosts(cls, hosts):
        '''
        >>> WebOfTrust._encode_hosts(['http://wu.ac.at', 'https://wu.ac.at'])
        'wu.ac.at/'
        >>> WebOfTrust._encode_hosts(['wu.ac.at', 'https://modul.ac.at/'])
        'wu.ac.at/modul.ac.at/'
        '''
        if isinstance(hosts, string_types):
            hosts = [hosts]

        selected_hosts = []
        for host in hosts:
            if not host.startswith('http'):
                host = 'http://%s' % host
            netloc = '%s/' % quote(urlparse(host).netloc)
            if not netloc in selected_hosts:
                selected_hosts.append(netloc)

        assert len(hosts) <= MAX_HOSTS, 'too many hosts (max: %s)!' % MAX_HOSTS

        return ''.join(selected_hosts)

    @classmethod
    def _encode_url(cls, service_url, query):
        ''' encodes the url '''
        return service_url % query

    @classmethod
    def _format_result(cls, data):
        ''' Formats the result using MAPPING. The components for the
        reputation provide the reputation and confidence. See the WOT
        Developer API for details.
        '''
        result = {}
        for host, reputation in data.iteritems():
            r = {}
            for attr_name, new_attr_name in MAPPING.iteritems():
                if attr_name in reputation:
                    r[new_attr_name] = reputation[attr_name]
            r['wot_link'] = WOT_LINK % r['target']
            result[host] = r
        return result
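# Hedged usage sketch for the WebOfTrust client above; the API key is a
# placeholder and the host list mirrors the doctest examples.
wot = WebOfTrust(api_key='0123456789abcdef')
reputation = wot.get_reputation(['wu.ac.at', 'modul.ac.at'])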
def __init__(self):
    self.r = Retrieve(Yahoo.__name__, sleep_time=0)
def __init__(self):
    self.r = Retrieve(YahooTermExtractor.__name__)
def __init__(self, api_key, service_url=SERVICE_URL):
    self.api_key = api_key
    self.service_url = service_url
    self.retrieve = Retrieve('eWRT.ws.wot')
class FacebookWS(object):
    """ @class FacebookWS
    class for fetching and storing the data of a user

    requires that the facebook API key and the facebook secret key are
    set in the configuration file. These can be retrieved from facebook
    """
    FB_OBJECT_TYPES = ['post', 'user', 'page', 'event', 'group', 'path']

    # Expires July 8, 2017
    # (https://developers.facebook.com/docs/apps/changelog)
    GRAPH_API_VERSION = 'v2.3'

    retrieve = Retrieve('facebookWS')

    # added: class properties for storing searchTerm and searchType
    def __init__(self, term=None, objectType='all', since=None, limit=None):
        """ init """
        self.term = term
        if objectType == 'all':
            objectType = 'post'
        self.objectType = objectType
        if since and not isinstance(since, int):
            since = time.mktime(since.timetuple())
        self.since = since
        self.limit = limit

    @classmethod
    def search(cls, term, objectType="all"):
        ''' searches for the given term
        @param term: term to search
        @param objectType: objectType to search in (post, user, page,
                           event, group)
        @return: search result
        '''
        args = {}
        result = []
        args['q'] = term

        if objectType in cls.FB_OBJECT_TYPES:
            args['type'] = objectType
            result = cls.makeRequest('search', args)
        elif objectType == 'all':
            # search all object types
            for obj_type in cls.FB_OBJECT_TYPES:
                args['type'] = obj_type
                result.extend(cls.makeRequest('search', args))
        else:
            raise ValueError('Illegal Object type %s' % (objectType))

        return result

    @classmethod
    def makeRequest(cls, path, args={}, maxDoc=None, method='get'):
        ''' makes a request to the graph API
        @param path: path to query, e.g. the feed of user/group/page 122222:
                     122222/feed
        @return: fetched data
        '''
        if not 'access_token' in args:
            # args['access_token'] = "b07f413baf9650d2363f1c8813ece6da"
            # very inflexible, it's hardcoded...
            args['access_token'] = FACEBOOK_ACCESS_KEY

        if method == 'post':
            args['method'] = 'POST'

        url = "https://graph.facebook.com/%s?%s" % (
            path, urllib.parse.urlencode(args))
        result = cls._requestURL(url, maxDoc)
        return result

    def getJsonListStructure(self):
        request = None
        args = {}
        args['q'] = self.term

        if isinstance(args['q'], str):
            args['q'] = args['q'].encode('utf-8')

        if self.since:
            args['since'] = int(self.since)
        if self.limit:
            args['limit'] = self.limit

        if self.objectType == 'path':
            args_string = ''
            if 'q' in args:
                del args['q']
            if len(args):
                args_string = '?%s' % urllib.parse.urlencode(args)
            request = {'method': "GET",
                       "relative_url": '%s/%s%s' % (self.GRAPH_API_VERSION,
                                                    self.term, args_string)}
        elif self.objectType in self.FB_OBJECT_TYPES:
            args['type'] = self.objectType
            request = {'method': "GET",
                       "relative_url": "/search?%s" %
                       urllib.parse.urlencode(args)}
        elif self.objectType == 'all':
            for obj_type in self.FB_OBJECT_TYPES:
                if obj_type == 'path':
                    continue
                args['type'] = obj_type
                request = {'method': "GET",
                           "relative_url": "/search?" +
                           urllib.parse.urlencode(args)}
        return request

    @classmethod
    def _requestURL(cls, url, maxDoc=None, result=None, tried=None):
        ''' fetches the data for the given URL from the graph API
        @param url: valid graph-api-url
        @return: fetched data
        '''
        if not result:
            result = []
        if not maxDoc:
            maxDoc = 1000
        if not tried:
            tried = False

        try:
            f = cls.retrieve.open(url)
            fetched = json.loads(f.read())
            tried = True
            logging.debug('processing url %s' % url)

            if isinstance(fetched, dict):
                if 'data' in fetched:
                    if not len(fetched['data']):
                        return result
                    result.extend(fetched['data'])
                    # process paging
                    if len(result) < maxDoc:
                        if 'paging' in fetched and 'previous' in fetched['paging']:
                            result = cls._requestURL(
                                fetched['paging']['previous'], maxDoc, result)
                            print('After processing paging', len(result))
                else:
                    # profiles for example don't contain a data dictionary
                    result.append(fetched)
                    print('After appending fetched', len(result))
        except HTTPError as e:
            print('Error: Bad Request for url %s: %s' % (url, e))
            if not tried:
                result = cls._requestURL(url, maxDoc, result, True)
        except (timeout, URLError) as e:
            print('URLError for url %s: %s' % (url, e))
            if not tried:
                result = cls._requestURL(url, maxDoc, result, True)

        return result
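# Hedged usage sketch for FacebookWS.search above; the search term is a
# placeholder and a valid FACEBOOK_ACCESS_KEY is assumed to be configured.
posts = FacebookWS.search('climate change', objectType='post')
for post in posts:
    print(post.get('id'), post.get('message', '')[:80])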
def __init__(self, api_key, api_url=API_URL):
    ''' Constructor '''
    WebDataSource.__init__(self)
    self.api_key = api_key
    self.api_url = api_url
    self.retrieve = Retrieve('google-plus')
class GooglePlus(object):
    ''' client for querying the Google+ activities API '''

    def __init__(self, api_key, api_url=API_URL):
        ''' Constructor '''
        WebDataSource.__init__(self)
        self.api_key = api_key
        self.api_url = api_url
        self.retrieve = Retrieve('google-plus')

    def search(self, search_terms, max_results=DEFAULT_MAX_RESULTS):
        ''' searches Google+ for the given search_terms

        :param search_terms: search terms
        :type search_terms: list
        :param max_results: maximum number of results
        :type max_results: int
        :returns: generator with the results
        '''
        for search_term in search_terms:
            if isinstance(search_term, str):
                search_term = search_term.encode('utf-8')
            params = {'query': '"%s"' % search_term,
                      'orderBy': DEFAULT_ORDER_BY,
                      'maxResults': max_results}

            fetched = self.make_request(params, 'activities')

            for item in fetched['items']:
                try:
                    yield self.convert_item(item)
                except Exception as e:
                    logger.info('Error %s occurred' % e)
                    continue

    def get_activity(self, activity_id):
        ''' returns the activity with the given ID

        :param activity_id: GooglePlus activity ID
        :type activity_id: string
        :returns: mapped result
        :rtype: dict
        '''
        item = self.make_request(path='activities/%s' % activity_id)
        return self.convert_item(item)

    def make_request(self, params=None, path='activities'):
        ''' executes the request to GooglePlus

        :param params: parameters for the query
        :type params: dict or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus result
        :rtype: dict
        '''
        url = self.get_request_url(params, path)
        data = self.retrieve.open(url)
        return json.load(data)

    def get_request_url(self, params=None, path='activities'):
        ''' returns a correctly parsed request URL

        :param params: parameters for the query
        :type params: dict or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus request URL
        :rtype: str

        Usage:
        >>> plus = GooglePlus('abcd')
        >>> plus.get_request_url()
        'https://www.googleapis.com/plus/v1/activities?key=abcd'
        '''
        params = params if params else {}

        if not 'key' in params:
            params['key'] = self.api_key

        if 'maxResults' in params and params['maxResults'] > DEFAULT_MAX_RESULTS:
            params['maxResults'] = DEFAULT_MAX_RESULTS

        return self.api_url.format(path=path, query=urlencode(params))

    @classmethod
    def convert_item(cls, item):
        ''' applies a mapping to convert the result to the required format

        :param item: GooglePlus Activity
        :type item: dict
        :rtype: dict
        '''
        last_modified = datetime.strptime(item['updated'],
                                          '%Y-%m-%dT%H:%M:%S.%fZ')
        published = datetime.strptime(item['updated'],
                                      '%Y-%m-%dT%H:%M:%S.%fZ')
        content = cls.convert_content(item['object']['content'])

        if not item['verb'] == 'post':
            raise Exception('Skipping activity of type "%s"' % item['verb'])

        if not len(content):
            logger.info('Skipping "%s" -> content is empty' % item['title'])
            raise Exception('content is empty')

        if 'attachments' in item['object']:
            for attachment in item['object']['attachments']:
                if attachment['objectType'] == 'article':
                    if not 'content' in attachment:
                        raise Exception('no content in attachment')
                    content = '%s\n"%s" (%s)' % (
                        content,
                        cls.convert_content(attachment['content']),
                        attachment['url'])

        activity = {'content': content,
                    'title': item['actor']['displayName'],
                    'url': item['url'],
                    'last_modified': last_modified,
                    'user_id': item['actor']['id'],
                    'user_img_url': item['actor']['image']['url'],
                    'screen_name': item['actor']['displayName'],
                    'encoding': u'utf-8',
                    'user_url': item['actor']['url'],
                    'valid_from': published,
                    'reshares': item['object']['resharers']['totalItems'],
                    'plusoners': item['object']['plusoners']['totalItems'],
                    'activity_id': item['id'],
                    }

        if 'geocode' in activity:
            activity['geocode'] = item['geocode']

        return activity
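# Hedged usage sketch for the GooglePlus client above; the API key and the
# search terms are placeholders.
plus = GooglePlus(api_key='example-api-key')
for activity in plus.search(['data journalism'], max_results=10):
    print(activity['url'], activity['title'])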
class AmazonWS(object):
    """ This class provides low level amazon web service access """

    def __init__(self, location='us', key=None):
        """ init """
        assert (location in AMAZON_LOCATIONS)
        self.retrieve = Retrieve(self.__class__.__name__)
        self.wsBase = AMAZON_LOCATIONS[location]
        self.accessKey = key or AMAZON_ACCESS_KEY
        self.amazon_url = AmazonUrl()

    def generateWsUrl(self, arguments):
        """ generates a valid amazon webservice request url """
        argList = ["%s&SubscriptionId=%s" % (self.wsBase, self.accessKey)] + \
                  ["%s=%s" % (k, quote(v)) for k, v in list(arguments.items())]
        return "&".join(argList)

    def generateSignedWsUrl(self, **arguments):
        """ generates a valid amazon webservice request url """
        # argList = ["%s&SubscriptionId=%s" % (self.wsBase, self.accessKey)] + \
        #     ["%s=%s" % (k, quote(v)) for k, v in arguments.items()]
        # return "&".join(argList)
        return self.amazon_url.get_request_url(arguments)

    def query(self, arguments):
        """ retrieves a result from amazon webservice """
        url = self.generateWsUrl(arguments)

        done = False
        while not done:
            try:
                f = self.retrieve.open(url)
                res = f.read()
                self._write_debug_data(res)
                f.close()
                done = True
            except ValueError:
                logging.warning(
                    "Exception webservice query - waiting for %d seconds...\n"
                    % ERROR_SLEEP_TIME)
                time.sleep(ERROR_SLEEP_TIME)

        return res

    @staticmethod
    def _write_debug_data(data):
        """ writes the given data to the debug file, if specified """
        if not AMAZON_DEBUG_FILE:
            return
        d = open(AMAZON_DEBUG_FILE, "a")
        d.write(data)
        d.close()

    def searchItem(self, searchIndex='Books', **param):
        """ searches an item in the amazon product repository """
        arguments = {'Operation': 'ItemSearch',
                     'SearchIndex': searchIndex,
                     'BrowseNode': '1000',
                     'Sort': 'salesrank',
                     'ResponseGroup': 'SalesRank,Small'}
        arguments.update(param)
        return self.query(arguments)

    def queryReview(self, itemId, **param):
        """ queries customer reviews for the selected item """
        arguments = {'Operation': 'ItemLookup',
                     'ResponseGroup': 'Reviews',
                     'ItemId': itemId}
        arguments.update(param)
        return self.query(arguments)

    def newReleases(self, **param):
        """ returns a list of ASINs of new releases """
        arguments = {'Operation': 'BrowseNodeLookup',
                     'ResponseGroup': 'NewReleases',
                     'Marketplace': 'us'}
        arguments.update(param)
        return self.query(arguments)

    def itemAttributes(self, item_id, **param):
        """ returns all item attributes """
        arguments = {'Operation': 'ItemLookup',
                     'ItemId': item_id,
                     'IdType': 'ASIN',
                     'ResponseGroup': 'ItemAttributes,SalesRank'}
        arguments.update(param)
        return self.query(arguments)
class GooglePlus(object):
    ''' client for querying the Google+ activities API '''

    def __init__(self, api_key, api_url=API_URL):
        ''' Constructor '''
        WebDataSource.__init__(self)
        self.api_key = api_key
        self.api_url = api_url
        self.retrieve = Retrieve('google-plus')

    def search(self, search_terms, max_results=DEFAULT_MAX_RESULTS):
        ''' searches Google+ for the given search_terms

        :param search_terms: search terms
        :type search_terms: list
        :param max_results: maximum number of results
        :type max_results: int
        :returns: generator with the results
        '''
        for search_term in search_terms:
            if isinstance(search_term, unicode):
                search_term = search_term.encode('utf-8')
            params = {'query': '"%s"' % search_term,
                      'orderBy': DEFAULT_ORDER_BY,
                      'maxResults': max_results}

            fetched = self.make_request(params, 'activities')

            for item in fetched['items']:
                try:
                    yield self.convert_item(item)
                except Exception as e:
                    logger.info('Error %s occurred' % e)
                    continue

    def get_activity(self, activity_id):
        ''' returns the activity with the given ID

        :param activity_id: GooglePlus activity ID
        :type activity_id: string
        :returns: mapped result
        :rtype: dict
        '''
        item = self.make_request(path='activities/%s' % activity_id)
        return self.convert_item(item)

    def make_request(self, params=None, path='activities'):
        ''' executes the request to GooglePlus

        :param params: parameters for the query
        :type params: dict or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus result
        :rtype: dict
        '''
        url = self.get_request_url(params, path)
        data = self.retrieve.open(url)
        return json.load(data)

    def get_request_url(self, params=None, path='activities'):
        ''' returns a correctly parsed request URL

        :param params: parameters for the query
        :type params: dict or None
        :param path: path to query, e.g. activities
        :type path: string
        :returns: GooglePlus request URL
        :rtype: str

        Usage:
        >>> plus = GooglePlus('abcd')
        >>> plus.get_request_url()
        'https://www.googleapis.com/plus/v1/activities?key=abcd'
        '''
        params = params if params else {}

        if not 'key' in params:
            params['key'] = self.api_key

        if 'maxResults' in params and params['maxResults'] > DEFAULT_MAX_RESULTS:
            params['maxResults'] = DEFAULT_MAX_RESULTS

        return self.api_url.format(path=path, query=urlencode(params))

    @classmethod
    def convert_item(cls, item):
        ''' applies a mapping to convert the result to the required format

        :param item: GooglePlus Activity
        :type item: dict
        :rtype: dict
        '''
        last_modified = datetime.strptime(item['updated'],
                                          '%Y-%m-%dT%H:%M:%S.%fZ')
        published = datetime.strptime(item['updated'],
                                      '%Y-%m-%dT%H:%M:%S.%fZ')
        content = cls.convert_content(item['object']['content'])

        if not item['verb'] == 'post':
            raise Exception('Skipping activity of type "%s"' % item['verb'])

        if not len(content):
            logger.info('Skipping "%s" -> content is empty' % item['title'])
            raise Exception('content is empty')

        if 'attachments' in item['object']:
            for attachment in item['object']['attachments']:
                if attachment['objectType'] == 'article':
                    if not 'content' in attachment:
                        raise Exception('no content in attachment')
                    content = '%s\n"%s" (%s)' % (
                        content,
                        cls.convert_content(attachment['content']),
                        attachment['url'])

        activity = {'content': content,
                    'title': item['actor']['displayName'],
                    'url': item['url'],
                    'last_modified': last_modified,
                    'user_id': item['actor']['id'],
                    'user_img_url': item['actor']['image']['url'],
                    'screen_name': item['actor']['displayName'],
                    'encoding': u'utf-8',
                    'user_url': item['actor']['url'],
                    'valid_from': published,
                    'reshares': item['object']['resharers']['totalItems'],
                    'plusoners': item['object']['plusoners']['totalItems'],
                    'activity_id': item['id'],
                    }

        if 'geocode' in activity:
            activity['geocode'] = item['geocode']

        return activity
def __init__(self):
    self.r = Retrieve(WikiPedia.__name__)