def parse_xml(file):
    tree = ET.parse(file)
    root = tree.getroot()
    d = []
    for tv in root.findall('programme'):
        cat = tv.find('category').text
        if cat == "Film":
            # Movie name
            try:
                title = tv.find('title').text.encode('utf8')
            except:
                title = "N/A"
            # Movie date
            try:
                date = tv.find('date').text  # .encode('utf8')
            except:
                date = "N/A"
            # Start date
            try:
                s = tv.get('start')
                st = parser.parse(s)
                lc = locale.getdefaultlocale()
                locale.setlocale(locale.LC_ALL, lc)
                start = st.strftime('%A %C %B - %H:%M GMT+1')
            except:
                start = "N/A"
            # Channel
            try:
                c = tv.get('channel').split('.', 1)[0]
                json_data = open('./res/channels.json')
                data = json.load(json_data)
                chan = data[c]
            except:
                chan = "N/A"
            # Movie length
            try:
                length = tv.find('length').text
            except:
                length = "N/A"
            # IMDB rating and URL
            try:
                rating = ImdbRating(title).rating + "/10"
            except:
                rating = "N/A"
            try:
                url = ImdbRating(title).url
            except:
                import urllib2
                url = "http://www.imdb.com/find?q=" + urllib2.quote(title)
            else:
                import urllib2
                url = "https://duckduckgo.com/?q=" + urllib2.quote(title)
            # You can remove the sleep if you're sure: the JSON file can be
            # more than 20 MB and the scraping could take a long time.
            sleep(1)
            d.append({'title': title, 'date': date, 'start': start,
                      'chan': chan, 'length': length, 'rating': rating,
                      'url': url})
    newd = sorted(d, key=lambda k: k['rating'], reverse=True)
    return newd
def _update_request_uri_query(request):
    """Pulls the query string out of the URI and moves it into the query
    portion of the request object. If there are already query parameters
    on the request, the parameters in the URI will appear after the
    existing parameters."""
    if "?" in request.path:
        pos = request.path.find("?")
        query_string = request.path[pos + 1:]
        request.path = request.path[:pos]
        if query_string:
            query_params = query_string.split("&")
            for query in query_params:
                if "=" in query:
                    pos = query.find("=")
                    name = query[:pos]
                    value = query[pos + 1:]
                    request.query.append((name, value))

    request.path = urllib2.quote(request.path, "/()$=',")

    # Add encoded queries to request.path.
    if request.query:
        request.path += "?"
        for name, value in request.query:
            if value is not None:
                request.path += name + "=" + urllib2.quote(value, "/()$=',") + "&"
        request.path = request.path[:-1]
    return request.path, request.query
def send_message_via_kannel(self, identity, message):
    backend = PersistantBackend.objects.get(title="kannel")
    connection = PersistantConnection(backend=backend, identity=identity)
    # conf = {'kannel_host': '127.0.0.1', 'kannel_port': 13013,
    #         'kannel_password': '******', 'kannel_username': '******'}
    try:
        conf = settings.RAPIDSMS_CONF["kannel"]
        url = "http://%s:%s/cgi-bin/sendsms?to=%s&text=%s&password=%s&user=%s" % (
            conf["kannel_host"], conf["kannel_port"],
            urllib2.quote(connection.identity.strip()),
            urllib2.quote(message),
            conf['kannel_password'], conf['kannel_username'])
        f = urllib2.urlopen(url, timeout=10)
        if f.getcode() / 100 != 2:
            print "Error delivering message to URL: %s" % url
            raise RuntimeError("Got bad response from router: %d" % f.getcode())
        # do things at a reasonable pace
        time.sleep(.2)
        return True
    except KeyError:
        return settings.RAPIDSMS_CONF["kannel"]
def _update_request_uri_query(request):
    '''Pulls the query string out of the URI and moves it into the query
    portion of the request object. If there are already query parameters
    on the request, the parameters in the URI will appear after the
    existing parameters.'''
    if '?' in request.path:
        request.path, _, query_string = request.path.partition('?')
        if query_string:
            query_params = query_string.split('&')
            for query in query_params:
                if '=' in query:
                    name, _, value = query.partition('=')
                    request.query.append((name, value))

    request.path = urllib2.quote(request.path, '/()$=\',')

    # Add encoded queries to request.path.
    if request.query:
        request.path += '?'
        for name, value in request.query:
            if value is not None:
                request.path += name + '=' + urllib2.quote(value, '/()$=\',') + '&'
        request.path = request.path[:-1]
    return request.path, request.query
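# Hedged usage sketch for the helper above. The stub class is hypothetical,
# standing in for whatever request object the real caller passes in; only
# `path` and `query` are needed. Assumes a Python 2 module where urllib2 is
# importable, as in the function itself.
class _StubRequest(object):
    def __init__(self, path):
        self.path = path
        self.query = []

_req = _StubRequest('/container/list?comp=list&prefix=a b')
print(_update_request_uri_query(_req))
# ('/container/list?comp=list&prefix=a%20b', [('comp', 'list'), ('prefix', 'a b')])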
def test_import_to_shape(self):
    from gnmvidispine.vs_item import VSItem
    i = VSItem(host=self.fake_host, port=self.fake_port,
               user=self.fake_user, passwd=self.fake_passwd)
    i.name = "VX-123"
    i.sendAuthorized = MagicMock(return_value=self.MockedResponse(200, self.import_job_doc))

    with self.assertRaises(ValueError):
        i.import_to_shape()  # expect ValueError if neither uri nor file ref

    fake_uri = "file:///path/to/newmedia.mxf"
    # We are embedding a URI as a parameter within another URL, so it must
    # be double-encoded.
    quoted_uri = quote(fake_uri, "")
    i.import_to_shape(uri=fake_uri, shape_tag="shapetagname", priority="HIGH")
    i.sendAuthorized.assert_called_with(
        'POST',
        '/API/item/VX-123/shape?priority=HIGH&essence=false&tag=shapetagname&thumbnails=true&uri={0}'.format(quoted_uri),
        "", {'Accept': 'application/xml'}, rawData=False)

    fake_uri = "file:///path/to/" + quote("media with spaces.mxf", safe="/")
    quoted_uri = quote(fake_uri, "")
    i.import_to_shape(uri=fake_uri, shape_tag="shapetagname", priority="HIGH")
    i.sendAuthorized.assert_called_with(
        'POST',
        '/API/item/VX-123/shape?priority=HIGH&essence=false&tag=shapetagname&thumbnails=true&uri={0}'.format(quoted_uri),
        "", {'Accept': 'application/xml'}, rawData=False)

    fake_uri = "file:///path/to/" + quote("media+with+plusses.mxf", safe="/+")
    quoted_uri = quote(fake_uri, "")
    i.import_to_shape(uri=fake_uri, shape_tag="shapetagname", priority="HIGH")
    i.sendAuthorized.assert_called_with(
        'POST',
        '/API/item/VX-123/shape?priority=HIGH&essence=false&tag=shapetagname&thumbnails=true&uri={0}'.format(quoted_uri),
        "", {'Accept': 'application/xml'}, rawData=False)
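# A quick standalone illustration of the double-encoding the test above
# relies on (hedged sketch, not part of the test suite): a URI embedded as a
# query-string parameter must be percent-encoded a second time, so an inner
# %20 becomes %2520 in the final request path.
from urllib import quote  # urllib.parse.quote on Python 3

inner = "file:///path/to/" + quote("media with spaces.mxf", safe="/")
# -> file:///path/to/media%20with%20spaces.mxf
outer = quote(inner, "")  # safe="" also encodes ':' and '/'
# -> file%3A%2F%2F%2Fpath%2Fto%2Fmedia%2520with%2520spaces.mxf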
def fetch(show, exact=False, ep=None):
    query_string = '?show=' + quote(show)
    if exact:
        query_string = query_string + '&exact=1'
    if ep:
        query_string = query_string + '&ep=' + quote(ep)
    resp = _fetch(BASE_URL + query_string).read()
    show_info = {}
    if 'No Show Results Were Found For' in resp:
        raise ShowNotFound(show)
    else:
        data = resp.replace('<pre>', '').splitlines()
        for line in data:
            try:
                if '@@' in line:
                    line = line.replace('@@', '@')
                    k, v = line.split('@')
                    v = '@' + v
                else:
                    k, v = line.split('@')
            except ValueError, err:
                # "Ended@"
                k = line.replace('@', "")
                v = ""
def set_language(self):
    "Set the language"
    nextpage = request.params.get('next', None)
    if not nextpage:
        nextpage = request.headers.get('Referer', None)
    if not nextpage:
        nextpage = '/'
    if '://' in nextpage:
        from_url = urlparse(nextpage)
        nextpage = from_url[2]
    lang_code = request.params.get('language', None)
    if lang_code and check_language(lang_code):
        session['lang'] = lang_code
        session.save()
    params = []
    for param in request.params:
        if param not in ['language', 'amp']:
            value = request.params[param]
            if value:
                if (param == 'came_from' and '://' in urllib2.unquote(value)):
                    urlparts = urlparse(urllib2.unquote(value))
                    value = urlparts[2] or '/'
                params.append('%s=%s' % (urllib2.quote(param), urllib2.quote(value)))
    if 'lc=1' not in params:
        params.append('lc=1')
    if params:
        nextpage = "%s?%s" % (nextpage, '&'.join(params))
    redirect(nextpage)
def send_prowl(title, msg, gtype, force=False, test=None):
    """ Send message to Prowl """
    if test:
        apikey = test.get('prowl_apikey')
    else:
        apikey = sabnzbd.cfg.prowl_apikey()
    if not apikey:
        return T('Cannot send, missing required data')

    title = Tx(NOTIFICATION.get(gtype, 'other'))
    title = urllib2.quote(title.encode('utf8'))
    msg = urllib2.quote(msg.encode('utf8'))
    prio = get_prio(gtype, 'prowl')
    if force:
        prio = 0

    if prio > -3:
        url = 'https://api.prowlapp.com/publicapi/add?apikey=%s&application=SABnzbd' \
              '&event=%s&description=%s&priority=%d' % (apikey, title, msg, prio)
        try:
            urllib2.urlopen(url)
            return ''
        except:
            logging.warning(T('Failed to send Prowl message'))
            logging.info("Traceback: ", exc_info=True)
            return T('Failed to send Prowl message')
    return ''
def get(self, method, args=None):
    """ GET to DeepDetect server """
    u = self.__ddurl
    u += method
    headers = {}
    if args is not None:
        sep = "?"
        for arg, argv in args.iteritems():
            u += sep
            sep = "&"
            u += urllib2.quote(arg)
            u += '='
            if argv is not None:
                u += urllib2.quote(argv)
    LOG("GET %s" % u)
    response = None
    try:
        req = urllib2.Request(u)
        response = urllib2.urlopen(req, timeout=DD_TIMEOUT)
        jsonresponse = response.read()
    except:
        raise DDCommunicationError(u, "GET", headers, None, response)
    LOG(jsonresponse)
    try:
        return self.__return_format(jsonresponse)
    except:
        raise DDDataError(u, "GET", headers, None, jsonresponse)
def buildURL(self, params):
    """Build the URL for a REST request."""
    params["Service"] = "AWSECommerceService"
    params["AWSAccessKeyId"] = self.access_key
    if self.associate_tag is not None:
        params["AssociateTag"] = self.associate_tag
    params["Timestamp"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    sorted_params = sorted(params.items())  # expand the params dict, sorted by key

    request = []
    # print sorted_params
    for p in sorted_params:
        pair = "%s=%s" % (p[0], urllib2.quote(p[1].encode("utf-8")))
        request.append(pair)

    # Authentication has been required since 2009-08-15:
    # compute HMAC-SHA256 over the canonical request with the Secret Access Key.
    msg = "GET\nwebservices.amazon.co.jp\n/onca/xml\n%s" % ("&".join(request))
    hmac_digest = hmac.new(self.secret_access_key, msg, hashlib.sha256).digest()
    base64_encoded = base64.b64encode(hmac_digest)
    signature = urllib2.quote(base64_encoded)

    # Append the Signature to the request and assemble the URL.
    request.append("Signature=%s" % signature)
    url = self.amazonurl + "?" + "&".join(request)
    return url
def get_lyrics(entry, db):
    global errors
    global successes
    title = entry['title'].encode('utf-8')
    artist = entry['artist'].encode('utf-8')
    year = entry['year']
    artist_clean = urllib2.quote(sanitize_artist(artist).replace(" ", "_"))
    title_clean = urllib2.quote(sanitize_title(title).replace(" ", "_"))
    url = 'http://lyrics.wikia.com/' + artist_clean + ':' + title_clean
    page = requests.get(url)
    if page.status_code != 200:
        print "HTTP error " + str(page.status_code) + " getting lyrics for " + title + " by " + artist + ", " + str(year)
        errors += 1
    else:
        page_soup = BeautifulSoup(page.text)
        lyrics = page_soup.select(".lyricbox")
        if len(lyrics) == 0:
            print "Parsing error getting lyrics for " + title + " by " + artist + ", " + str(year)
            errors += 1
            return
        lyrics = lyrics[0]
        # strip embedded <script> tags before extracting the text
        [x.extract() for x in lyrics.findAll('script')]
        lyrics = lyrics.get_text(' ', strip=True).encode('utf-8')
        lyrics = santize(lyrics)
        entry['lyrics'] = lyrics
        db.entries.save(entry)
        successes += 1
        print "Successfully extracted lyrics for " + title + " by " + artist
def sb_search():
    sickbeard = {}
    params = ''
    try:
        params = '&name=%s' % (urllib2.quote(request.args['name']))
    except:
        pass
    try:
        params = '&tvdbid=%s' % (urllib2.quote(request.args['tvdbid']))
    except:
        pass
    try:
        params = '&lang=%s' % (urllib2.quote(request.args['lang']))
    except:
        pass

    if params != '':  # equality, not identity: "is not ''" is unreliable
        params = '/?cmd=sb.searchtvdb%s' % params
        try:
            sickbeard = sickbeard_api(params)
            sickbeard = sickbeard['data']['results']
        except:
            sickbeard = None
    else:
        sickbeard = None

    return render_template('sickbeard-search.html',
                           data=sickbeard,
                           sickbeard='results',
                           )
def build_query(self):
    """Builds the query used to access the cghub server."""
    parts = []
    for key, value in self.query.iteritems():
        if isinstance(value, (list, tuple)):
            value_str = '+OR+'.join([
                self.escape_query_value(key, v) for v in value])
            value_str = '(%s)' % value_str
        else:
            value_str = self.escape_query_value(key, value)
        parts.append('='.join([key, value_str]))
    if self.offset:
        parts.append('='.join(['start', str(self.offset)]))
    if self.limit:
        parts.append('='.join(['rows', str(self.limit)]))
    if self.sort_by:
        if self.sort_by[0] == '-':
            parts.append('='.join([
                'sort_by', '%s:desc' % urllib2.quote(self.sort_by[1:])]))
        else:
            parts.append('='.join([
                'sort_by', '%s:asc' % urllib2.quote(self.sort_by)]))
    return '&'.join(parts)
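# Hedged, self-contained sketch of what build_query() emits. The stub fakes
# just enough of the class to run it; the pass-through escape_query_value is
# an assumption (the real implementation is not shown in this snippet), and
# the field names/values are illustrative only.
import urllib2

class _StubQuery(object):
    query = {'study': 'phs000178'}
    offset, limit, sort_by = 20, 10, '-last_modified'
    escape_query_value = staticmethod(lambda key, v: v)
    build_query = build_query  # reuse the function defined above

print(_StubQuery().build_query())
# study=phs000178&start=20&rows=10&sort_by=last_modified:desc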
def searchBook(isbn_num):
    logText("Searching for: ", isbn_num)
    query = "AWSAccessKeyId=" + AWSAccessKeyID + "&AssociateTag=abc&Keywords="
    query += isbn_num
    query += "&Operation=ItemSearch&ResponseGroup=ItemAttributes&SearchIndex=Books&Service=AWSECommerceService"
    query += "&Timestamp=" + urllib2.quote(datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"))[:-1]
    # query += "&Version=2011-08-01"

    data = "GET\n"
    data += "ecs.amazonaws.com\n"
    data += "/onca/xml\n"
    data += query

    a = hmac.new(AWSSecret, data, hashlib.sha256)
    signature = urllib2.quote(base64.encodestring(a.digest())[:-1])
    url = "http://ecs.amazonaws.com/onca/xml?" + query + "&Signature=" + signature
    # print "URL : ", url

    url_obj = urllib2.urlopen(url)
    data = url_obj.read()
    book_info = getInfoFromXML(data)
    logText(" - Title: ", book_info[0])
    logText(" - Price: ", book_info[1])
    storeInDB((book_info[0], isbn_num, book_info[1]))
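# The AWS snippets in this section (buildURL, searchBook, and _generate_url
# further down) all share one pattern; here it is condensed into a hedged,
# generic helper. This is a sketch of the Signature Version 2 scheme they
# use, not a drop-in replacement for any of them; host, path, params and
# secret_key are all caller-supplied.
import base64
import hashlib
import hmac
from urllib import quote  # urllib.parse.quote on Python 3 (msg/key must be bytes there)

def sign_request_v2(host, path, params, secret_key):
    # Canonical form: key-sorted "k=v" pairs, RFC 3986 percent-encoded.
    canonical = "&".join(
        "%s=%s" % (k, quote(str(v), safe="-_.~"))
        for k, v in sorted(params.items()))
    string_to_sign = "GET\n%s\n%s\n%s" % (host, path, canonical)
    digest = hmac.new(secret_key, string_to_sign, hashlib.sha256).digest()
    return canonical + "&Signature=" + quote(base64.b64encode(digest))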
def get_lat_lng(address, city, state):
    c = config.load()
    # If the address is a PO Box (or empty), skip it
    if re.search(r'P(\.)?O(\.)?(\sBox\s)[0-9]+', address) is not None or address == '':
        return None
    else:
        url = 'https://api.smartystreets.com/street-address?'
        url += 'state=' + urllib2.quote(str(state))
        url += '&city=' + urllib2.quote(str(city))
        url += '&auth-id=' + c['ss_id']
        url += '&auth-token=' + c['ss_token']
        url += '&street=' + urllib2.quote(str(address))
        result = json.load(urllib2.urlopen(url))
        if len(result) == 1:
            lat_lng = {'lat': result[0]['metadata']['latitude'],
                       'lng': result[0]['metadata']['longitude']}
            return lat_lng
        elif len(result) == 0:
            # Return a generic lat/lng on zero results so we can come back
            # later and fix it
            lat_lng = {'lat': 36.0, 'lng': -76.0}
            return lat_lng
        else:
            print result
            exit(-1)
def http_get(self, url):
    MIME = '*/*'
    unquoteurl = urllib2.unquote(url.encode('utf-8'))
    scheme, netloc, url, params, query, fragment = urlparse(unquoteurl)
    netloc = urllib2.quote(netloc)
    url = urllib2.quote(url)
    url = ParseResult(scheme, netloc, url, params, query, fragment).geturl()
    retries = 30
    i = 0
    while True:
        try:
            if self.useproxy:
                print 'using proxy'
                response = self.opener.open(url, timeout=5)
                print("GET " + urllib2.unquote(response.geturl().encode()) + " " + str(response.code))
                if 'content-type' in response.headers:
                    MIME = response.headers['content-type'].split(';')[0]
                print response
                return response.read(), response.code, MIME
            else:
                response = requests.get(url)
                print("GET " + urllib2.unquote(str(response.url)) + " " + str(response.status_code))
                if 'content-type' in response.headers:
                    MIME = response.headers['content-type'].split(';')[0]
                return response.content, response.status_code, MIME
        except:
            if i > retries:
                traceback.print_exc()
                raise
            print "timeout 5000ms"
            i += 1
def _get_archived_json_results(self):
    """Download JSON file that only contains test name list from
    test-results server. This is for generating incremental JSON so the
    file generated has info for tests that failed before but pass or are
    skipped from current run.

    Returns (archived_results, error) tuple where error is None if results
    were successfully read.
    """
    results_json = {}
    old_results = None
    error = None

    if not self._test_results_server:
        return {}, None

    results_file_url = (self.URL_FOR_TEST_LIST_JSON %
                        (urllib2.quote(self._test_results_server),
                         urllib2.quote(self._builder_name),
                         self.RESULTS_FILENAME,
                         urllib2.quote(self._test_type),
                         urllib2.quote(self._master_name)))

    try:
        # FIXME: We should talk to the network via a Host object.
        results_file = urllib2.urlopen(results_file_url)
        info = results_file.info()
        old_results = results_file.read()
    except urllib2.HTTPError, http_error:
        # A non-4xx status code means the bot is hosed for some reason
        # and we can't grab the results.json file off of it.
        if http_error.code < 400 or http_error.code >= 500:
            error = http_error
def get_s3_files_table(prefix):
    '''List files from s3, to be used with table listing; return dicts.'''
    bucket_name = os.environ['BUCKET']
    try:
        ak, sk = get_env_creds()
        s3 = boto.connect_s3(aws_access_key_id=ak, aws_secret_access_key=sk)
        bucket = s3.get_bucket(bucket_name)
    except:
        logging.error('get_s3_files: Could not connect to AWS/Bucket: %s' % str(sys.exc_info()))
    files = bucket.list_versions(prefix=prefix)
    filelist = []
    for f in files:
        if type(f) is not boto.s3.key.Key:
            continue
        size_in_mb = '%.2f' % (float(f.size) / (1024 * 1024))
        key = f.name[len(prefix):]
        directory = key.partition('/')[0]
        filename = key.partition('/')[-1]
        cb64 = urllib2.quote((f.name).encode('base64').rstrip())
        vb64 = urllib2.quote(f.version_id.encode('base64').rstrip())
        dfmt = '%Y-%m-%dT%H:%M:%S.000Z'
        date = datetime.strptime(f.last_modified, dfmt)
        d = {'name': filename,
             'dir': directory,
             'v_id': f.version_id,
             'date': date,
             'size': size_in_mb,
             'cb64': cb64,
             'vb64': vb64,
             'key': key}
        filelist.append(d)
    return filelist
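# Hedged round-trip sketch for the cb64/vb64 tokens built above: the key
# name goes through base64 and then percent-encoding, so decoding reverses
# both steps. Assumes Python 2's 'base64' string codec, as in the function.
import urllib2

token = urllib2.quote('some/key name.txt'.encode('base64').rstrip())
original = urllib2.unquote(token).decode('base64')  # -> 'some/key name.txt'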
def responseMsg(request):
    rawStr = smart_str(request.body)
    # rawStr = smart_str(request.POST['XML'])
    msg = paraseMsgXml(ET.fromstring(rawStr))
    queryStr = msg.get('Content', 'You have input nothing~')
    msgType = msg.get('MsgType', 'text')
    raw_youdaoURL = "http://fanyi.youdao.com/openapi.do?keyfrom=%s&key=%s&type=data&doctype=%s&version=1.1&q=" % (
        YOUDAO_KEY_FROM, YOUDAO_KEY, YOUDAO_DOC_TYPE)
    event = msg.get('Event', '')
    if msgType == 'event':
        result = getBasicReply(msg, '欢迎使用,发送单词或者中文词语,将获得相应的解释;如果需要单词的读音,请在单词前面添加一个点,如.hello;欢迎推荐给你们de小伙伴们')
    elif queryStr.startswith('.'):
        queryStr = queryStr[1:]
        youdaoURL = "%s%s" % (raw_youdaoURL, urllib2.quote(queryStr))
        req = urllib2.Request(url=youdaoURL)
        result = urllib2.urlopen(req).read()
        replyContent = getPronounce(ET.fromstring(result))
        result = getReplyXml(msg, replyContent, queryStr)
    else:
        youdaoURL = "%s%s" % (raw_youdaoURL, urllib2.quote(queryStr))
        req = urllib2.Request(url=youdaoURL)
        result = urllib2.urlopen(req).read()
        replyContent = paraseYouDaoXml(ET.fromstring(result))
        result = getBasicReply(msg, replyContent)
    return result
def lastfm_info(tracktuple, trinfo):
    if tracktuple[0] != '':
        mbid = '&mbid=' + tracktuple[0]
    else:
        mbid = ''
    artist = urllib2.quote(tracktuple[1].encode('utf-8'))
    songtitle = urllib2.quote(tracktuple[2].encode('utf-8'))
    query = 'http://ws.audioscrobbler.com/2.0/?method=track.getInfo&api_key='\
            + LASTFM_KEY + mbid + '&artist=' + artist + '&track='\
            + songtitle + '&format=json'
    response = json.loads(urllib2.urlopen(query).read())
    result = None
    try:
        result = response['track']
    except KeyError:
        global lastfm_failed
        print '?? No result for', tracktuple, 'on last.fm'
        print '  ', response
        lastfm_failed.append(tracktuple)
    if result is not None:
        trinfo['track']['name'] = response['track']['name']
        try:
            album_response = response['track']['album']
            trinfo['track']['album'] = {}
            trinfo['track']['album']['title'] = album_response['title']
            trinfo['track']['album']['url'] = album_response['url']
            trinfo['track']['album']['artist'] = album_response['artist']
            trinfo['track']['album']['mbid'] = album_response['mbid']
        except KeyError:
            print '?? No album for', trinfo['track']['name']
        trinfo['track']['artist'] = response['track']['artist']
        trinfo['track']['toptags'] = response['track']['toptags']
        trinfo['track']['id']['musicbrainz'] = response['track']['mbid']
        trinfo['track']['duration'] = response['track']['duration']
        print trinfo['track']['name'], 'successfully appended'
    return trinfo
def baiduMusic(self, musicTitle, musicAuthor):
    baseurl = r"http://box.zhangmen.baidu.com/x?op=12&count=1&title=%s$$%s$$$$" % \
        (urllib2.quote(musicTitle.encode("utf-8")), urllib2.quote(musicAuthor.encode("utf-8")))
    resp = urllib2.urlopen(baseurl)
    xml = resp.read()
    # non-greedy .*? grabs only the data between <url> tags (standard-quality URL)
    url = re.findall('<url>.*?</url>', xml)
    # non-greedy .*? grabs only the data between <durl> tags (high-quality URL)
    durl = re.findall('<durl>.*?</durl>', xml)
    # extract the <encode>/<decode> CDATA payloads from the first url
    url1 = re.findall('<encode>.*?CDATA\[(.*?)\]].*?</encode>', url[0])
    url2 = re.findall('<decode>.*?CDATA\[(.*?)\]].*?</decode>', url[0])
    # take everything up to and including the last '/' of url1, plus
    # everything before the last '&' of url2
    urlpath = url1[0][:url1[0].rindex('/') + 1] + url2[0][:url2[0].rindex('&')]
    durlpath = ""
    if durl:
        durl1 = re.findall('<encode>.*?CDATA\[(.*?)\]].*?</encode>', durl[0])
        durl2 = re.findall('<decode>.*?CDATA\[(.*?)\]].*?</decode>', durl[0])
        durlpath = durl1[0][:durl1[0].rindex('/') + 1] + durl2[0][:durl2[0].rindex('&')]
    return urlpath, durlpath
def translate(phrase, in_lang):
    if in_lang == "en":
        out_lang = "ja"
    else:
        out_lang = "en"
    if True:  # Microsoft Translator branch; flip to use the Google branch below
        url = (
            "http://api.microsofttranslator.com/V2/Ajax.svc/GetTranslations?appId=F2926FC35C3732CEC3E9C92913745F9C28912821&from="
            + in_lang + "&to=" + out_lang + "&maxTranslations=1"
        )
        url += "&text=" + quote(phrase.encode("utf-8"))
        response = urlfetch.fetch(url=url)
        content = re.sub(u"\xEF\xBB\xBF", "", response.content)  # strip UTF-8 BOM
        data = json.loads(content)
        translated_text = data["Translations"][0]["TranslatedText"]
        time.sleep(0.1)
    else:
        url = "https://www.googleapis.com/language/translate/v2?"
        url += "&source=" + in_lang
        url += "&target=" + out_lang
        url += "&q=" + quote(phrase.encode("utf-8"))
        url += "&key=" + "AIzaSyAI3PoUAJ_uP0o33EDgUfSEUMALepQAaNA"
        content = urlfetch.fetch(url=url).content
        data = json.loads(content)
        translated_text = data["data"]["translations"][0]["translatedText"]
    return translated_text
def releaseId(self, _atExit=False):
    postData = {}
    postData['wantedId'] = self.lastId
    postData['logName'] = urllib2.quote(self.parentDB.config.projectUser)
    if self.settings.login != '' and self.settings.passw != '':
        postData['logName'] = urllib2.quote(self.settings.login)
        postData['logPass'] = urllib2.quote(self.settings.passw)
    postData['rep'] = self.settings.base
    postData['project'] = urllib2.quote(self.parentDB.config.projectName)

    req = urllib2.Request('http://' + self.settings.addr + '/?=release_task_id',
                          str.encode(urllib.urlencode(postData)))
    try:
        response = bytes.decode(urllib2.urlopen(req, None, self.timeout).read()) or 0
        self.lastId = None
    except Exception as e:
        print('TypeTodo: HTTP server error releasing todo')
        print(e)
        return False

    if str(int(response)) != response:
        print('TypeTodo: HTTP server fails releasing todo')
        response = False
    return response
def get_SIMBAD_coordinates(name):
    url = VOTABLE_OPTIONS + SIMBAD_VOTABLE_SCRIPT_START + QUERY_VOTABLE_FULLCOORDINATES \
        + SIMBAD_VOTABLE_SCRIPT_MIDDLE + name + SIMBAD_VOTABLE_SCRIPT_END

    try:
        response = urllib2.urlopen(SIMBAD_ROOT_1 + NAME_SCRIPT + urllib2.quote(url))
    except urllib2.URLError:
        try:
            response = urllib2.urlopen(SIMBAD_ROOT_2 + NAME_SCRIPT + urllib2.quote(url))
        except urllib2.URLError:
            return None

    try:
        response_votable = votable.parse(response.fp)
        first_table = response_votable.get_first_table()
    except:
        return None
    else:
        ra = float(first_table.array[0][0])
        dec = float(first_table.array[0][1])
        try:
            coords, created = AstronomicalCoordinates.objects.get_or_create(right_ascension=ra, declination=dec)
        except MultipleObjectsReturned:
            coords = AstronomicalCoordinates.objects.filter(right_ascension=ra, declination=dec).first()
        return coords
def get_SIMBAD_object_types(name):
    url = SIMBAD_BASIC_SCRIPT + QUERY_OTYPES + name

    try:
        response = urllib2.urlopen(SIMBAD_ROOT_1 + NAME_SCRIPT + urllib2.quote(url))
    except urllib2.URLError:
        try:
            response = urllib2.urlopen(SIMBAD_ROOT_2 + NAME_SCRIPT + urllib2.quote(url))
        except urllib2.URLError:
            return None

    otypes = []
    ok = False
    value_line = None
    for line in response.readlines():
        if ok and len(line.strip()) > 0:
            value_line = line.strip()
        if line.find(QUERY_DATA_DELIMITER) >= 0:
            ok = True

    if value_line is not None and len(value_line) > 0:
        values = value_line.split(",")
        for value in values:
            otype, created = ObjectType.objects.get_or_create(value=value)
            otypes.append(otype)

    return otypes
def plos_search(query, query_type=None, rows=20, more_parameters=None,
                fq='''doc_type:full AND article_type:"Research Article"''',
                output="json", verbose=False):
    '''Accesses the PLOS search API.

    query: the text of your query.
    query_type: subject, author, etc.
    rows: maximum number of results to return.
    more_parameters: an optional dictionary; key-value pairs are parameter
        names and values for the search api.
    fq: determines what kind of results are returned. Set by default to
        return only full documents that are research articles (almost
        always what you want).
    output: determines output type. Set to JSON by default; XML is also
        possible, along with a few others.
    '''
    api_key = "..."

    query_string = ""
    if query_type:
        query_string += query_type + ":"
    query_string += '"' + query + '"'

    params_string = ""
    if more_parameters:
        params_string = "&" + "&".join([key + "=" + quote(value)
                                        for key, value in more_parameters.iteritems()])

    fq_string = "&fq=" + quote(fq)

    url = ("http://api.plos.org/search?q=" + query_string + params_string
           + fq_string + "&wt=" + output + "&rows=" + str(rows)
           + "&api_key=" + api_key)
    headers = {'Content-Type': 'application/' + output}
    if verbose:
        print url
    r = requests.get(url, headers=headers)
    r.encoding = "UTF-8"  # just to be sure
    return r.json()["response"]["docs"]
def decorated_function(*args, **kwargs):
    page = int(request.args.get('page', 1))
    # must be converted to str here, otherwise a type error is raised
    _path = request.path.encode("utf-8")
    # non-ASCII URLs need to be URL-encoded
    if quote(_path).count('%25') <= 0:
        _path = quote(_path)
    _viewkey = 'mobile%s' % _path if request.MOBILE else _path
    cache_key = str(key % _viewkey)
    if page > 1:
        cache_key = '%s_%s' % (cache_key, page)
    rv = cache.get(cache_key)
    if rv is not None:
        return rv
    rv = f(*args, **kwargs)
    _suffix = u"\n<!-- cached at %s -->" % str(datetime.datetime.now())
    if hasattr(rv, "data"):
        rv.data += _suffix
    if isinstance(rv, unicode):
        rv += _suffix
    cache.set(cache_key, rv, timeout)
    return rv
def fetch(self):
    postData = {}
    postData['rep'] = self.settings.base
    postData['project'] = urllib2.quote(self.parentDB.config.projectName)
    if self.settings.login != '' and self.settings.passw != '':
        postData['logName'] = urllib2.quote(self.settings.login)
        postData['logPass'] = urllib2.quote(self.settings.passw)

    req = urllib2.Request('http://' + self.settings.addr + '/?=fetch_tasks',
                          str.encode(urllib.urlencode(postData)))
    try:
        response = bytes.decode(urllib2.urlopen(req, None, self.timeout).read())
    except Exception as e:
        print('TypeTodo: cant fetch http')
        print(e)
        return False

    todoA = {}
    for task in json.loads(response):
        __id = int(task['id'])
        if __id not in todoA:
            todoA[__id] = TodoTask(__id, self.parentDB.config.projectName, self.parentDB)

        fetchedStateName = task['namestate']
        for cState in STATE_LIST:
            if cState and cState[1] == fetchedStateName:
                break

        tags = task['nametag'].split(',')
        todoA[__id].set((cState or STATE_DEFAULT)[0], tags, task['priority'],
                        task['namefile'], task['comment'], task['nameuser'],
                        int(task['ustamp']))
    return todoA
def _generate_url(self, options):
    options['Service'] = 'AWSECommerceService'
    options['AWSAccessKeyId'] = self.access_key_id
    options['AssociateTag'] = self.associate_tag
    options['Timestamp'] = self._generate_timestamp()

    # Remove entries whose value is None.
    for k, v in options.items():
        if v is None:
            del options[k]

    # Create the (v2) signature.
    keys = sorted(options.keys())
    args = '&'.join('%s=%s' % (key, urllib2.quote(unicode(options[key])
                    .encode('utf-8'), safe='~')) for key in keys)

    msg = 'GET'
    msg += '\n' + self.uri
    msg += '\n' + self.end_point
    msg += '\n' + args

    signature = urllib2.quote(
        base64.b64encode(hmac.new(self.secret_key or '', msg, hashlib.sha256).digest()))
    url = "http://%s%s?%s&Signature=%s" % (self.uri, self.end_point, args, signature)
    return url
def __raw_search_anime(self, query):
    h = httplib2.Http()
    resp, content = h.request(self.malapiurl + '/anime/search?q=' + urllib2.quote(query))
    print self.malapiurl + '/anime/search?q=' + urllib2.quote(query)
    if int(resp['status']) != 200:
        return None
    return content
def encode(url):
    return urllib2.quote(url).replace("/", "%2F")
def urlencode(s):
    return urllib2.quote(s)
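# Side note on the two helpers above (hedged): in Python 2, urllib2.quote is
# an undocumented re-export of urllib.quote, whose default safe set is '/'.
# Escaping the slashes afterwards is therefore equivalent to passing safe=''
# up front, as this small check shows.
from urllib import quote  # urllib.parse.quote on Python 3

assert quote("a/b c").replace("/", "%2F") == quote("a/b c", safe="")  # 'a%2Fb%20c'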
def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
    '''Paginate through github's web interface and scrape summaries'''

    # repo_url - https://github.com/ansible/ansible for example
    # baseurl - an entrypoint for one-off utils to scrape specific issue
    #           query urls. NOTE: this disables writing a cache

    # get cached
    if not baseurl:
        issues = self.load_summaries(repo_url)
    else:
        issues = {}

    if not baseurl:
        url = repo_url
        url += '/issues'
        url += '?'
        url += 'q='
        url += urllib2.quote('sort:updated-desc')
    else:
        url = baseurl

    namespace = repo_url.split('/')[-2]
    reponame = repo_url.split('/')[-1]

    rr = self._request_url(url)
    soup = BeautifulSoup(rr.text, 'html.parser')
    data = self._parse_issue_summary_page(soup)
    if data['issues']:
        # send to receiver
        post_to_receiver('summaries', {'user': namespace, 'repo': reponame},
                         data['issues'])
        # update master list
        issues.update(data['issues'])

    if not baseurl:
        self.dump_summaries_tmp(repo_url, issues)

    while data['next_page']:
        rr = self._request_url(self.baseurl + data['next_page'])
        soup = BeautifulSoup(rr.text, 'html.parser')
        data = self._parse_issue_summary_page(soup)

        # send to receiver
        post_to_receiver('summaries', {'user': namespace, 'repo': reponame},
                         data['issues'])

        if not data['next_page'] or not data['issues']:
            break

        changed = []
        changes = False
        for k, v in data['issues'].iteritems():
            if not isinstance(k, unicode):
                k = u'%s' % k
            if k not in issues:
                changed.append(k)
                changes = True
            elif v != issues[k]:
                changed.append(k)
                changes = True
            issues[k] = v

        if changed:
            logging.info('changed: %s' % ','.join(x for x in changed))

        if not baseurl:
            self.dump_summaries_tmp(repo_url, issues)

        if not changes:
            break

    # get missing
    if not baseurl:
        numbers = sorted([int(x) for x in issues.keys()])
        missing = [x for x in xrange(1, numbers[-1]) if x not in numbers]
        for x in missing:
            summary = self.get_single_issue_summary(repo_url, x, force=True)
            if summary:
                post_to_receiver('summaries', {'user': namespace, 'repo': reponame},
                                 {x: summary})
                if not isinstance(x, unicode):
                    x = u'%s' % x
                issues[x] = summary

    # get missing timestamps
    if not baseurl:
        numbers = sorted([int(x) for x in issues.keys()])
        missing = [x for x in numbers
                   if str(x) not in issues or not issues[str(x)]['updated_at']]
        for x in missing:
            summary = self.get_single_issue_summary(repo_url, x, force=True)
            if summary:
                post_to_receiver('summaries', {'user': namespace, 'repo': reponame},
                                 {x: summary})
                if not isinstance(x, unicode):
                    x = u'%s' % x
                issues[x] = summary

    # save the cache
    if not baseurl:
        self.dump_summaries(repo_url, issues)

    return issues
###################################################################################
import scraperwiki
import simplejson
import urllib2

# Change QUERY to your search term of choice.
# Examples: 'newsnight', 'from:bbcnewsnight', 'to:bbcnewsnight'
QUERY = 'wish list'
RESULTS_PER_PAGE = '200'
LANGUAGE = 'en'
NUM_PAGES = 1500

for page in range(1, NUM_PAGES + 1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&lang=%s&page=%s' \
        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, LANGUAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            # print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            print data['from_user'], data['text']
            scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        break
def xiaoqu_chengjiao_spider(db_cj, xq_name=u"绿川新苑"):
    trytimes = 0
    # tryblocktimes = 0
    url = u"http://sh.lianjia.com/chengjiao/rs" + urllib2.quote(xq_name) + "/"
    while 1:
        try:
            # proxy_s = urllib2.ProxyHandler(proxys[random.randint(0, len(proxys)-1)])
            # opener = urllib2.build_opener(proxy_s)
            # urllib2.install_opener(opener)
            req = urllib2.Request(url, headers=hds[random.randint(0, len(hds) - 1)])
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor)
            source_code = opener.open(req, timeout=5).read()
            plain_text = unicode(source_code)  # , errors='ignore')
            soup = BeautifulSoup(plain_text)
        except socket.timeout as e:
            if trytimes < 5:
                # time.sleep(5)
                trytimes += 1
                continue
            else:
                print e
                exception_write(e, 'xiaoqu_chengjiao_spider', xq_name)
                return
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print e
            exception_write(e, 'xiaoqu_chengjiao_spider', xq_name)
            return
        except Exception as e:
            print e
            exception_write(e, 'xiaoqu_chengjiao_spider', xq_name)
            return

        human = soup.find('div', {'class': 'human'})
        if not human:
            break
        else:
            print "block"
            time.sleep(random.randint(900, 1200))
            trytimes = 0
            # if tryblocktimes < 5:
            #     tryblocktimes += 1
            #     continue
            # else:
            #     print "block"
            #     getProxyIp()
            #     trytimes = 0
            #     tryblocktimes = 0

    pagebox = soup.find('div', {'class': 'c-pagination'})
    if not pagebox:
        print "---no chengjiao record"
        return

    tpage = pagebox.find('a', {'gahref': 'results_totalpage'})
    npage = pagebox.find('a', {'gahref': 'results_next_page'})
    allpage = pagebox.findAll('a')
    if tpage:
        pagenum = int(tpage['gahref'].split('_d')[-1])
    else:
        if npage:
            pagenum = int(allpage[-2]['gahref'].split('_d')[-1])
        else:
            pagenum = 1

    print u"--- start crawling all records for %s" % xq_name
    print u"---total number of pages is " + str(pagenum)
    for j in range(pagenum):
        url_page = u"http://sh.lianjia.com/chengjiao/d%drs%s/" % (j + 1, xq_name)
        chengjiao_page_search(db_cj, url_page)
        # time.sleep(random.randint(1, 2))
        print u"---" + xq_name + " " + str(j + 1) + "th page has been done"
    print u"--- finished crawling all records for %s" % xq_name
def _QuoteOrNone(self, x):
    if x is None:
        return None
    else:
        return urllib2.quote(x)
import scraperwiki
import simplejson
import urllib2

# Get results from the Twitter API! Change QUERY to your search term of choice.
# Examples: 'newsnight', 'from:bbcnewsnight', 'to:bbcnewsnight'
QUERY = '#BeforeBlackPresidents'
RESULTS_PER_PAGE = '100'
LANGUAGE = 'en'
# NUM_PAGES = 5

# for page in range(1, NUM_PAGES+1):
base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&lang=%s&page=%s' % (
    urllib2.quote(QUERY), RESULTS_PER_PAGE, LANGUAGE, 1)
try:
    print simplejson.loads(scraperwiki.scrape(base_url))
except:
    print 'Oh dear, failed to scrape %s' % base_url
# To get the item log we connect to NZBGet via XML-RPC and call
# method "loadlog", which returns the log for a given nzb item.
# For more info visit http://nzbget.net/RPC_API_reference

# First we need to know connection info: host, port and password of the
# NZBGet server. NZBGet passes all configuration options to the
# post-processing script as environment variables.
host = os.environ['NZBOP_CONTROLIP']
port = os.environ['NZBOP_CONTROLPORT']
username = os.environ['NZBOP_CONTROLUSERNAME']
password = os.environ['NZBOP_CONTROLPASSWORD']
if host == '0.0.0.0':
    host = '127.0.0.1'

# Build a URL for XML-RPC requests
rpcUrl = 'http://%s:%s@%s:%s/xmlrpc' % (quote(username), quote(password), host, port)

# Create remote server object
server = ServerProxy(rpcUrl)

# Call remote method 'loadlog'
nzbid = int(os.environ['NZBPP_NZBID'])
log = server.loadlog(nzbid, 0, 10000)

# Now iterate through entries and save them to the output file
if len(log) > 0:
    f = open('%s/_nzblog.txt' % os.environ['NZBPP_DIRECTORY'], 'wb')
    for entry in log:
        f.write((u'%s\t%s\t%s\n' % (entry['Kind'],
                 datetime.datetime.fromtimestamp(int(entry['Time'])),
                 entry['Text'])).encode('utf8'))
    f.close()
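# Why the quote() calls in rpcUrl above matter (hedged illustration with a
# made-up password): a credential containing a reserved character such as '@'
# would otherwise corrupt the authority part of the URL.
from urllib import quote  # urllib.parse.quote on Python 3

print('http://%s:%s@127.0.0.1:6789/xmlrpc' % (quote('nzbget'), quote('p@ss')))
# http://nzbget:p%40ss@127.0.0.1:6789/xmlrpc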
url = 'http://www.baidu.com'
User_Agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'

response = urllib2.urlopen(url)
print response.getcode()
print len(response.read())

request = urllib2.Request(url)
request.add_header('User-Agent', User_Agent)
response = urllib2.urlopen(request)
print response.getcode()
print len(response.read())

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
response = urllib2.urlopen(url)
print response.getcode()
print len(response.read())
print cj

url = "http://music.baidu.com/artist"
response = urllib2.urlopen(url)
print response.getcode()
print len(response.read())

url = 'http://www.baidu.com'
request = urllib2.Request(url)
# add_data() expects a urlencoded string, not a set literal
request.add_data("a=1&param=" + urllib2.quote('中文'))
print request.get_full_url()  # urllib2.Request has no getUrl() method
def update(self, data):
    # Only run an update on the ticket if a review ID has been found
    if self.rid:
        logging.info("We found a review, %u" % self.rid)
        try:
            headers = {
                "Accept": "*/*",
                "Authorization": "Basic %s" % self.auth
            }
            post_data = None
            if version == 3:
                post_data = "api_format=json&ship_it=0&body_top=%s&body_bottom=&public=1" % urllib.parse.quote(data)
            else:
                post_data = "api_format=json&ship_it=0&body_top=%s&body_bottom=&public=1" % urllib2.quote(data)
            if not debug:
                logging.info("Dispatching request to reviews.apache.org")
                conn = None
                if version == 3:
                    conn = http.client.HTTPSConnection("reviews.apache.org", 443)
                else:
                    conn = httplib.HTTPSConnection("reviews.apache.org", 443)
                conn.request("POST", "/api/review-requests/%u/reviews/" % self.rid,
                             post_data, headers)
                response = conn.getresponse()
                if response.status == 201:
                    logging.info("Posted ReviewBoard update")
                else:
                    logging.warning("ReviewBoard instance returned status code %u" % response.status)
            else:
                logging.warning("Foreground mode enabled, no actual ReviewBoard update made")
        except:
            pass
import scraperwiki
import simplejson
import urllib2

# Change QUERY to your search term of choice.
# Examples: 'newsnight', 'from:bbcnewsnight', 'to:bbcnewsnight'
QUERY = 'claudioalfonso'
RESULTS_PER_PAGE = '100'
RESULT_TYPE = 'old'
NUM_PAGES = 500
ENTITIES = 'true'

for page in range(1, NUM_PAGES + 1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&page=%s&result_type=%s&include_entities=%s' \
        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, page, RESULT_TYPE, ENTITIES)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            # print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            data['geo'] = result['geo']
            data['entities'] = result['entities']
            print data['from_user'], data['text'], data['geo'], data['entities']
            scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
def quote(self, string, safe='/'):
    return urllib2.quote(string, safe)
def handleRequest(self, headers_only, channelName=None, channelIcon=None, fmt=None):
    logger = logging.getLogger('handleRequest')
    logger.debug("Headers:\n" + str(self.headers))

    self.requrl = urlparse.urlparse(self.path)
    self.reqparams = urlparse.parse_qs(self.requrl.query)
    self.path = self.requrl.path[:-1] if self.requrl.path.endswith('/') else self.requrl.path

    # Check if third parameter exists:
    #   …/pid/blablablablabla/video.mpg
    #          |_________|
    # and if it ends with a regular video extension
    try:
        if not self.path.endswith(('.3gp', '.avi', '.flv', '.mkv', '.mov',
                                   '.mp4', '.mpeg', '.mpg', '.ogv', '.ts')):
            logger.error("Request seems valid but no valid video extension was provided")
            self.dieWithError(400)
            return
    except IndexError:
        self.dieWithError(400)  # 400 Bad Request
        return

    # Limit concurrent connections
    if 0 < AceConfig.maxconns <= AceStuff.clientcounter.total:
        logger.debug("Maximum connections reached, can't serve this")
        self.dieWithError(503)  # 503 Service Unavailable
        return

    # Pretend to work fine with Fake or HEAD request.
    if headers_only or AceConfig.isFakeRequest(self.path, self.reqparams, self.headers):
        # Return 200 and exit
        if headers_only:
            logger.debug("Sending headers and closing connection")
        else:
            logger.debug("Fake request - closing connection")
        self.send_response(200)
        self.send_header("Content-Type", "video/mpeg")
        self.end_headers()
        self.closeConnection()
        return

    # Make list with parameters
    self.params = list()
    for i in xrange(3, 8):
        try:
            self.params.append(int(self.splittedpath[i]))
        except (IndexError, ValueError):
            self.params.append('0')

    self.url = None
    self.video = None
    self.path_unquoted = urllib2.unquote(self.splittedpath[2])
    contentid = self.getCid(self.reqtype, self.path_unquoted)
    cid = contentid if contentid else self.path_unquoted
    logger.debug("CID: " + cid)
    self.client = Client(cid, self, channelName, channelIcon)
    self.vlcid = urllib2.quote(cid, '')
    shouldStart = AceStuff.clientcounter.add(cid, self.client) == 1

    try:
        # Initializing AceClient
        if shouldStart:
            if contentid:
                self.client.ace.START('PID', {'content_id': contentid})
            elif self.reqtype == 'pid':
                self.client.ace.START(
                    self.reqtype, {'content_id': self.path_unquoted,
                                   'file_indexes': self.params[0]})
            elif self.reqtype == 'torrent':
                paramsdict = dict(
                    zip(aceclient.acemessages.AceConst.START_TORRENT, self.params))
                paramsdict['url'] = self.path_unquoted
                self.client.ace.START(self.reqtype, paramsdict)
            logger.debug("START done")

        # Getting URL
        self.url = self.client.ace.getUrl(AceConfig.videotimeout)
        # Rewriting host for remote Ace Stream Engine
        self.url = self.url.replace('127.0.0.1', AceConfig.acehost)
        self.errorhappened = False

        if shouldStart:
            logger.debug("Got url " + self.url)
            # If using VLC, add this url to VLC
            if AceConfig.vlcuse:
                # Force ffmpeg demuxing if set in config
                if AceConfig.vlcforceffmpeg:
                    self.vlcprefix = 'http/ffmpeg://'
                else:
                    self.vlcprefix = ''
                self.client.ace.pause()
                # Sleeping videodelay
                gevent.sleep(AceConfig.videodelay)
                self.client.ace.play()
                AceStuff.vlcclient.startBroadcast(
                    self.vlcid, self.vlcprefix + self.url,
                    AceConfig.vlcmux, AceConfig.vlcpreaccess)
                # Sleep a bit, because sometimes VLC doesn't open port in time
                gevent.sleep(0.5)

        self.hanggreenlet = gevent.spawn(self.hangDetector)
        logger.debug("hangDetector spawned")
        gevent.sleep()

        if AceConfig.vlcuse:
            # Building new VLC url
            self.url = 'http://' + AceConfig.vlchost + \
                ':' + str(AceConfig.vlcoutport) + '/' + self.vlcid
            logger.debug("VLC url " + self.url)

            # Sending client headers to videostream
            self.video = urllib2.Request(self.url)
            for key in self.headers.dict:
                self.video.add_header(key, self.headers.dict[key])
            self.video = urllib2.urlopen(self.video)

            # Sending videostream headers to client
            if not self.headerssent:
                self.send_response(self.video.getcode())
                if self.video.info().dict.has_key('connection'):
                    del self.video.info().dict['connection']
                if self.video.info().dict.has_key('server'):
                    del self.video.info().dict['server']
                if self.video.info().dict.has_key('transfer-encoding'):
                    del self.video.info().dict['transfer-encoding']
                if self.video.info().dict.has_key('keep-alive'):
                    del self.video.info().dict['keep-alive']
                for key in self.video.info().dict:
                    self.send_header(key, self.video.info().dict[key])
                # End headers. Next goes video data
                self.end_headers()
                logger.debug("Headers sent")

            # Run proxyReadWrite
            self.proxyReadWrite()
        else:
            if not fmt:
                fmt = self.reqparams.get('fmt')[0] if self.reqparams.has_key('fmt') else None
            self.client.handle(shouldStart, self.url, fmt)
    except (aceclient.AceException, vlcclient.VlcException, urllib2.URLError) as e:
        logger.error("Exception: " + repr(e))
        self.errorhappened = True
        self.dieWithError()
    except gevent.GreenletExit:
        # hangDetector told us about client disconnection
        pass
    except Exception:
        # Unknown exception
        logger.error(traceback.format_exc())
        self.errorhappened = True
        self.dieWithError()
    finally:
        if AceConfig.videodestroydelay and not self.errorhappened and \
                AceStuff.clientcounter.count(cid) == 1:
            # If no error happened and we are the only client
            try:
                logger.debug("Sleeping for " + str(AceConfig.videodestroydelay) + " seconds")
                gevent.sleep(AceConfig.videodestroydelay)
            except:
                pass
        try:
            remaining = AceStuff.clientcounter.delete(cid, self.client)
            self.client.destroy()
            self.ace = None
            self.client = None
            if AceConfig.vlcuse and remaining == 0:
                try:
                    AceStuff.vlcclient.stopBroadcast(self.vlcid)
                except:
                    pass
            logger.debug("END REQUEST")
        except:
            logger.error(traceback.format_exc())
import os
import csv
import urllib2
from selenium import webdriver  # needed for webdriver.Chrome below
from selenium.common.exceptions import TimeoutException

data = []
urls = []
urldata = []
with open('movie_budget_info.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        data.append(row)

for x in range(1, len(data)):
    title = urllib2.quote(data[x][2])
    urls.append("http://www.imdb.com/find?ref_=nv_sr_fn&q={}&s=tt".format(title))

chromedriver = "/Users/sai teja/Downloads/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
browser = webdriver.Chrome(chromedriver)
# browser.get("http://www.the-numbers.com/movie/budgets/all")
for url in range(4000, len(urls)):
    try:
        print(url)
        browser.get(urls[url])
        rows = browser.find_elements_by_xpath("//table/tbody/tr")
        if len(rows) > 0 and len(rows[0].find_elements_by_tag_name("td")) > 1:
            mov_tit = rows[0].find_elements_by_tag_name("td")[1].find_element_by_tag_name("a").text.encode('utf-8')
            mov_url = rows[0].find_elements_by_tag_name("td")[1].find_element_by_tag_name("a").get_attribute('href')
            urldata.append([mov_tit, mov_url])
    except TimeoutException:
        continue
        return
    except Exception, e:
        print e
        exception_write('xiaoqu_chengjiao_spider', xq_name)
        return

    content = soup.find('div', {'class': 'page-box house-lst-page-box'})
    total_pages = 0
    if content:
        d = "d=" + content.get('page-data')
        exec(d)
        total_pages = d['totalPage']

    threads = []
    for i in range(total_pages):
        url_page = u"http://bj.lianjia.com/chengjiao/pg%drs%s/" % (
            i + 1, urllib2.quote(xq_name))
        t = threading.Thread(target=chengjiao_spider, args=(db_cj, url_page))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()


def do_xiaoqu_chengjiao_spider(db_xq, db_cj):
    """Batch-crawl the transaction records for every xiaoqu (residential complex)."""
    count = 0
    xq_list = db_xq.fetchall()
    for xq in xq_list:
def browser_search(text=None, url="https://www.google.com/search?q=%s"):
    if not text:
        text = read_selected(True)
    url = url % quote(text)
    browser_open(url)
def get_positon(self, start_a, end_b, windows, query_file_path, page_folder_path):
    results_position_list = []
    images_position_list = []
    queries_lines = open(query_file_path, "r").readlines()
    queries = ["index"]  # start from 1
    for query in queries_lines:
        query = query.strip()
        queries.append(query)

    driver = webdriver.PhantomJS()
    for i in range(start_a, end_b):  # the range of query
        if i >= len(queries):
            break
        query = queries[i]
        file_path = page_folder_path + query + '_sogou.html'
        code_file_path = page_folder_path + urllib2.quote(query, '+') + '_sogou.html'
        try:
            fin = open(file_path, 'r')
            fin.close()
        except:
            continue
        try:
            Results_position = []
            Images_position = []
            count = 0
            driver.get(code_file_path)
            content_results = driver.find_element_by_id('main')
            divs = content_results.find_elements_by_css_selector('div')
            for div in divs:
                classes = div.get_attribute('class').split(' ')
                if 'rb' in classes or 'vrwrap' in classes or 'vrPic' in classes:
                    count += 1
                    result_position = Position("query", i, query, count,
                                               div.location['x'], div.location['y'],
                                               div.size['width'], div.size['height'])
                    Results_position.append(result_position)
                    # anchors = div.find_elements_by_css_selector("a")
                    images = div.find_elements_by_css_selector("img")
                    Images = []
                    for image in images:
                        anchor = image.find_element_by_xpath('..')
                        if anchor.size['width'] == 0 or anchor.size['height'] == 0:
                            if image.size['width'] == 0 or image.size['height'] == 0:
                                continue
                            image_position = Position("image", i, query, count,
                                                      image.location['x'], image.location['y'],
                                                      image.size['width'], image.size['height'])
                            Images.append(image_position)
                        else:
                            image_position = Position("image", i, query, count,
                                                      anchor.location['x'], anchor.location['y'],
                                                      anchor.size['width'], anchor.size['height'])
                            Images.append(image_position)
                    Images_position.append(Images)
                    if count == windows:
                        break
            results_position_list.append(Results_position)
            images_position_list.append(Images_position)
            print "Sogou " + query, i
        except:
            continue
    driver.quit()
    driver.stop_client()
    return results_position_list, images_position_list
def main():
    count = 20
    method = "searcht"
    origstring = ""
    optionstr = ""
    filelocation = "/tmp/test.txt"
    test = 0
    debug = 0
    lang = "da"
    try:
        opts, args = getopt.getopt(sys.argv[1:], "s:g:h:m:c:dta:f:i:j:w:")
        for o, a in opts:
            if o == "-s":
                origstring = a
                searchstring = urllib2.quote(a.encode('utf8'))
            elif o == "-g":
                fromdate = a
                optionstr = optionstr + "&from-date=" + fromdate
            elif o == "-h":
                todate = a
                optionstr = optionstr + "&to-date=" + todate
            elif o == "-i":
                tag = a
                optionstr = optionstr + "&tag=type/" + tag
            elif o == "-m":
                method = a
            elif o == "-c":
                count = a
                optionstr = optionstr + "&page-size=" + count
            elif o == "-j":
                pagenumber = a
                optionstr = optionstr + "&page=" + pagenumber
            elif o == "-w":
                wordcount = a
                optionstr = optionstr + "&min-wordcount=2&max-wordcount=" + wordcount
            elif o == "-t":
                test = 1
            elif o == "-d":
                debug = 1
            elif o == "-f":
                filelocation = a
            elif o == "-l":
                lang = a
            else:
                assert False, "unhandled option"
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)

    baseurl = starturl + "/search?q=" + searchstring + optionstr
    endurl = "&show-fields=all&show-tags=all&show-factboxes=all&show-elements=all&show-references=all&show-snippets=all&api-key=" + key
    # endurl = "&show-fields=all&show-tags=all&show-factboxes=all&show-elements=all&show-references=all&show-snippets=all&api-key=mediahackdays2014"
    # http://content.guardianapis.com/search?q=cameron&tag=type%2Farticle&show-tags=all&api-key=mediahackdays2014
    # http://content.guardianapis.com/search?q=maria+miller&tag=type%2Fvideo&show-tags=all
    url = baseurl + endurl
    print url
    r = requests.get(url=url)
    # print json.dumps(input, sort_keys=False, indent=4)
    # print json.dumps(r, sort_keys=False, indent=4)
    newdata = json.loads(r.text)
    # wanted = {u'id', u'webTitle', u'newspaperPageNumber'}
    # [i for i in newdata[u'response'] if any(w in newdata for w in i[u'results'])]
    if debug:
        with open(filelocation, "w") as fh:
            json.dumps(r.text, fh)
    count = 0
    # print len(newdata)
    if method == "timeline":
        for row in newdata:
            fh.write(newdata[count]['text'])
    elif method == "searcht":
        for k in newdata['response']['results']:
            data = {}
            res = 0
            tmpID = k['id'].encode('utf8')
            myID = tmpID.replace('/', '_')
            # results = list(stories.find({'_id': myID}))
            cursor = stories.find({'_id': myID})
            obj = next(cursor, None)
            # pdb.set_trace()
            if obj:
                print "OK " + str(obj)
                continue
            else:
                print "NOT found " + myID
            data['_id'] = tmpID.replace('/', '_')
            print "ID:" + k['id'].encode('utf8') + myID
            print "sectionId:" + k['sectionId']
            mysectionId = k['sectionId']
            data['sectionId'] = mysectionId
            print "sectionName:" + k['sectionName']
            mysectionName = k['sectionName']
            data['sectionName'] = mysectionName
            print "WebPublicationDate:" + k['webPublicationDate']
            mytmpWebPublicationDate = k['webPublicationDate']
            myWebPublicationDate = mytmpWebPublicationDate.split("T")[0]
            myStoryDate = myWebPublicationDate.split("-")
            myDate = myStoryDate[2] + " " + monthToNum(myStoryDate[1]) + " " + myStoryDate[0]
            data['webPublicationDate'] = myWebPublicationDate
            data['date'] = myDate
            data['displaydate'] = myDate
            print "WebTitle:" + k['webTitle']
            myTitle = k['webTitle']
            data['title'] = myTitle
            print "readmoreurl:" + k['webUrl']
            myreadmoreurl = k['webUrl']
            data['readmoreurl'] = myreadmoreurl
            print "TRAIL:" + k['fields']['trailText']
            myTrail = k['fields']['trailText']
            data['Trail'] = myTrail
            print "headline:" + k['fields']['headline']
            myheadline = k['fields']['headline']
            data['headline'] = myheadline
            try:
                print "byline:" + k['fields']['byline']
                mycaption = k['fields']['byline']
                # data['byline'] = mycaption
                data['byline'] = mycaption
                data['caption'] = origstring
            except:
                print "ups on caption .."
                # data['caption'] = "lorem ipsum"
            try:
                print "wordcount:" + k['fields']['wordcount']
                mywordcount = k['fields']['wordcount']
                data['wordcount'] = mywordcount
            except:
                print "ups on wordcount .."
                data['wordcount'] = 0
            try:
                print "photourl:" + k['fields']['thumbnail']
                mythumbnail = k['fields']['thumbnail']
                data['photourl'] = mythumbnail
            except:
                print "No photo on wordcount .."
                data['photourl'] = ""
            # print "BODY:" + k['fields']['body']
            myBody = k['fields']['body']
            # clean up for the timeline
            timeBody = doClean(myBody)
            data['FullBody'] = myBody
            data['body'] = timeBody
            try:
                print "PAGENO:" + k['fields']['newspaperPageNumber']
                myPageNo = k['fields']['newspaperPageNumber']
                data['newspaperPageNumber'] = myPageNo
            except:
                print "ups on newpaper .."
                data['newspaperPageNumber'] = 0
            try:
                print "DATE:" + k['fields']['newspaperEditionDate']
                myDate = k['fields']['newspaperEditionDate']
                data['newspaperEditionDate'] = myDate
                # data['date'] = myDate
            except:
                print "ups on eddate .."
                data['newspaperEditionDate'] = "1970-01-01"
                # data['date'] = "1970-01-01"
            tmpStr = ""
            for j in k['tags']:
                tmpStr = tmpStr + "," + j['webTitle']
            print "--> " + tmpStr
            data['Tags'] = tmpStr
            if test:
                print "Just testing ..."
                # stories.update(data, upsert=True)
            else:
                print "inserting data ..."
                stories.insert(data)
    except HTTPError, error:
        geocode_status_code = error.code

    if geocode_status_code == 200:
        geocode_json = json.load(geocode_service_call)
        lat = geocode_json["lat"]
        long = geocode_json["lng"]
    else:
        return index(errors=["Invalid Address - Please check, re-enter and try again."])

    title = "Weather App"
    header = "Weather for {2} ({0}, {1})".format(lat, long, address)

    # Build weather url and make call
    weather_service_url = quote(
        "https://arcuschallenge-getweather.appspot.com/getweather/{0},{1}".format(lat, long),
        ':/?&=,')
    try:
        weather_service_call = urlopen(weather_service_url)
        weather_status_code = weather_service_call.getcode()
    except HTTPError, error:
        weather_status_code = error.code
        error = error.reason

    # Check weather call status code
    if weather_status_code == 200:
        weather_json = json.load(weather_service_call)
        content = []
        # Loop through each day's weather and take the variables we want to
        # display
        for each_day in weather_json:
def set_online_followed(value):
    window.setProperty(key='%s-online_followers' % kodi.get_id(),
                       value=quote(str(value)))
def test_valid_queries(self):
    # Set protector to unsafe mode
    protector = Protector(["prevent_delete"], [], False)
    self.assertTrue(
        protector.check(quote("select * from bla where x=y")).is_ok())
import scraperwiki
import simplejson
import urllib2
import sys

# Needs to be in lower case
SCREENNAME = 'easternamigo'

# API help: https://dev.twitter.com/docs/api/1/get/followers/ids
url = 'http://api.twitter.com/1/followers/ids.json?screen_name=%s' % (
    urllib2.quote(SCREENNAME))
print url
followers_json = simplejson.loads(scraperwiki.scrape(url))
print "Found %d followers of %s" % (len(followers_json), SCREENNAME)
followers_json = followers_json['ids']  # get earliest followers first for batching
followers_json.reverse()


# Groups a list in chunks of a given size
def group(lst, n):
    for i in range(0, len(lst), n):
        val = lst[i:i + n]
        if len(val) == n:
            yield tuple(val)


# Where to start? Overlap one batch to increase hit rate if people unfollow etc.
batchdone = scraperwiki.sqlite.get_var('batchdone', 1)
batchstart = batchdone - 1
def _fetch_builder_page(self, builder):
    buildbot_url = config_urls.chromium_buildbot_url('chromium.webkit')
    builder_page_url = "%s/builders/%s?numbuilds=100" % (
        buildbot_url, urllib2.quote(builder.name()))
    return urllib2.urlopen(builder_page_url)
def search(self, owner=None, reviewer=None, base=None, closed=None,
           private=None, commit=None, created_before=None, created_after=None,
           modified_before=None, modified_after=None, per_request=None,
           keys_only=False, with_messages=False):
    """Yields search results."""
    # These are expected to be strings.
    string_keys = {
        'owner': owner,
        'reviewer': reviewer,
        'base': base,
        'created_before': created_before,
        'created_after': created_after,
        'modified_before': modified_before,
        'modified_after': modified_after,
    }
    # These are either None, False or True.
    three_state_keys = {
        'closed': closed,
        'private': private,
        'commit': commit,
    }

    url = '/search?format=json'
    # Sort the keys mainly to ease testing.
    for key in sorted(string_keys):
        value = string_keys[key]
        if value:
            url += '&%s=%s' % (key, urllib2.quote(value))
    for key in sorted(three_state_keys):
        value = three_state_keys[key]
        if value is not None:
            url += '&%s=%s' % (key, value)

    if keys_only:
        url += '&keys_only=True'
    if with_messages:
        url += '&with_messages=True'
    if per_request:
        url += '&limit=%d' % per_request

    cursor = ''
    while True:
        output = self.get(url + cursor)
        if output.startswith('<'):
            # It's an error message. Return as no result.
            break
        data = json.loads(output) or {}
        if not data.get('results'):
            break
        for i in data['results']:
            yield i
        cursor = '&cursor=%s' % data['cursor']
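# Hedged usage sketch for the generator above: `rietveld` stands in for an
# instance of whatever class defines search(), and the owner address is made
# up. The generator transparently follows the server-side cursor, so the
# caller just iterates.
open_issues = []
for issue in rietveld.search(owner='dev@example.com', closed=False,
                             per_request=100, with_messages=True):
    open_issues.append(issue)  # each item is one decoded result dict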
def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
    '''Paginate through github's web interface and scrape summaries'''

    # repo_url - https://github.com/ansible/ansible for example
    # baseurl - an entrypoint for one-off utils to scrape specific issue
    #           query urls. NOTE: this disables writing a cache

    # get cached
    if not baseurl:
        issues = self.load_summaries(repo_url)
    else:
        issues = {}

    if not baseurl:
        url = repo_url
        url += '/issues'
        url += '?'
        url += 'q='
        url += urllib2.quote('sort:updated-desc')
    else:
        url = baseurl

    rr = self._request_url(url)
    soup = BeautifulSoup(rr.text, 'html.parser')
    data = self._parse_issue_summary_page(soup)
    if data['issues']:
        issues.update(data['issues'])

    if not baseurl:
        self.dump_summaries_tmp(repo_url, issues)

    while data['next_page']:
        rr = self._request_url(self.baseurl + data['next_page'])
        soup = BeautifulSoup(rr.text, 'html.parser')
        data = self._parse_issue_summary_page(soup)

        if not data['next_page'] or not data['issues']:
            break

        changed = []
        changes = False
        for k, v in data['issues'].iteritems():
            # v['href'] = self.baseurl + v['href']
            if str(k) not in issues:
                changed.append(str(v['number']))
                changes = True
            elif v != issues[str(k)]:
                changed.append(str(v['number']))
                changes = True
            issues[str(k)] = v

        if changed:
            logging.info('changed: %s' % ','.join(x for x in changed))

        if not baseurl:
            self.dump_summaries_tmp(repo_url, issues)

        if not changes:
            break

    # save the cache
    if not baseurl:
        self.dump_summaries(repo_url, issues)

    return issues
def search(request):
    title = '搜索结果'  # page title: "Search results"
    param = request.GET
    print param
    keys = param.keys()
    if 'page' not in keys:
        page = 1
    else:
        page = int(param['page'])
    if 'num' not in keys:
        num = 10
    else:
        num = int(param['num'])
    if 'q' not in keys:
        return render_to_response("search_index.html")
    else:
        q = param['q'].encode('utf-8')
    # print q, page, num

    start = str((page - 1) * num)
    rows = str(num)
    q = urllib2.quote(q)
    uri = ("http://127.0.0.1:8983/solr/article_core/select?q=" + q +
           "&start=" + start + "&rows=" + rows + "&wt=json&indent=true")
    req = urllib2.Request(uri)
    res = urllib2.urlopen(req).read()
    res = json.loads(res)
    responseHeader = res['responseHeader']
    response = res['response']
    num_found = response['numFound']
    articles = response['docs']
    if num_found > 250:
        num_found = 250
        articles = articles[:250]

    # map article types to categories
    for article in articles:
        if article['type'] == 1:
            article['category'] = 'zcfb'
            article['category_name'] = webConfig.TOPLABEL1
        elif article['type'] == 2:
            article['category'] = 'gsgg'
            article['category_name'] = webConfig.TOPLABEL2
        elif article['type'] == 3:
            article['category'] = 'lddt'
            article['category_name'] = webConfig.TOPLABEL3
        elif article['type'] == 4:
            article['category'] = 'hydt'
            article['category_name'] = webConfig.TOPLABEL4
        elif article['type'] == 5:
            article['category'] = 'dfdt'
            article['category_name'] = webConfig.TOPLABEL5
        elif article['type'] == 0:
            article['category'] = 'qtwz'
            article['category_name'] = webConfig.TOPLABEL6
        else:
            article['category'] = 'index'
            article['category_name'] = webConfig.TOPLABEL0
        if len(article['content']) > 200:
            article['desc'] = article['content'][:200] + "......"
        else:
            article['desc'] = article['content']
    # print articles

    # page navigation
    pages = []
    page_num = (int(num_found) - 1) / num + 1
    temp = page - page % 5
    if page_num <= 5:
        for i in range(page_num):
            pages.append(i + 1)
    elif page > page_num - page_num % 5:
        for i in range(page_num - page_num % 5, page_num):
            pages.append(i + 1)
    else:
        for i in range(5):
            pages.append(i + 1 + temp)

    # previous page, next page
    if page == 1:
        pre_page = page
        next_page = page + 1
    elif page == page_num:
        pre_page = page - 1
        next_page = page_num
    else:
        pre_page = page - 1
        next_page = page + 1

    return render_to_response("search.html", {
        "title": title,
        'articles': articles,
        'project_name': webConfig.PROJECTNAME,
        'toplabel0': webConfig.TOPLABEL0,
        'toplabel1': webConfig.TOPLABEL1,
        'toplabel2': webConfig.TOPLABEL2,
        'toplabel3': webConfig.TOPLABEL3,
        'toplabel4': webConfig.TOPLABEL4,
        'toplabel5': webConfig.TOPLABEL5,
        'toplabel6': webConfig.TOPLABEL6,
        'page': page,
        'num': num,
        'query': q,
        'num_found': num_found,
        'responseHeader': responseHeader,
        'page_num': page_num,
        'pages': pages,
        'pre_page': pre_page,
        'next_page': next_page,
    })
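# A standalone check of the five-page navigation window computed in the
# view above (the helper name page_window is mine, for illustration only):
def page_window(page, page_num):
    temp = page - page % 5
    if page_num <= 5:
        return [i + 1 for i in range(page_num)]
    elif page > page_num - page_num % 5:
        return [i + 1 for i in range(page_num - page_num % 5, page_num)]
    return [i + 1 + temp for i in range(5)]

assert page_window(1, 3) == [1, 2, 3]
assert page_window(7, 20) == [6, 7, 8, 9, 10]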
def fetch_mldata(dataname, target_name='label', data_name='data',
                 transpose_data=True, data_home=None):
    """Fetch an mldata.org data set

    If the file does not exist yet, it is downloaded from mldata.org.

    mldata.org does not have an enforced convention for storing data or
    naming the columns in a data set. The default behavior of this function
    works well with the most common cases:

      1) data values are stored in the column 'data', and target values in
         the column 'label'
      2) alternatively, the first column stores target values, and the
         second data values
      3) the data array is stored as `n_features x n_samples`, and thus
         needs to be transposed to match the `sklearn` standard

    Keyword arguments allow adapting these defaults to specific data sets
    (see parameters `target_name`, `data_name`, `transpose_data`, and
    the examples below).

    mldata.org data sets may have multiple columns, which are stored in
    the Bunch object with their original name.

    Parameters
    ----------
    dataname :
        Name of the data set on mldata.org, e.g.: "leukemia",
        "Whistler Daily Snowfall", etc. The raw name is automatically
        converted to an mldata.org URL.

    target_name : optional, default: 'label'
        Name or index of the column containing the target values.

    data_name : optional, default: 'data'
        Name or index of the column containing the data.

    transpose_data : optional, default: True
        If True, transpose the downloaded data array.

    data_home : optional, default: None
        Specify another download and cache folder for the data sets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are: 'data',
        the data to learn, 'target', the classification labels, 'DESCR',
        the full description of the dataset, and 'COL_NAMES', the original
        names of the dataset columns.

    Examples
    --------
    Load the 'iris' dataset from mldata.org:

    >>> from sklearn.datasets.mldata import fetch_mldata
    >>> import tempfile
    >>> test_data_home = tempfile.mkdtemp()

    >>> iris = fetch_mldata('iris', data_home=test_data_home)
    >>> iris.target.shape
    (150,)
    >>> iris.data.shape
    (150, 4)

    Load the 'leukemia' dataset from mldata.org, which needs to be
    transposed to respect the scikit-learn axes convention:

    >>> leuk = fetch_mldata('leukemia', transpose_data=True,
    ...                     data_home=test_data_home)
    >>> leuk.data.shape
    (72, 7129)

    Load an alternative 'iris' dataset, which has different names for the
    columns:

    >>> iris2 = fetch_mldata('datasets-UCI iris', target_name=1,
    ...                      data_name=0, data_home=test_data_home)
    >>> iris3 = fetch_mldata('datasets-UCI iris',
    ...                      target_name='class', data_name='double0',
    ...                      data_home=test_data_home)

    >>> import shutil
    >>> shutil.rmtree(test_data_home)
    """
    # normalize dataset name
    dataname = mldata_filename(dataname)

    # check if this data set has been already downloaded
    data_home = get_data_home(data_home=data_home)
    data_home = join(data_home, 'mldata')
    if not exists(data_home):
        os.makedirs(data_home)

    matlab_name = dataname + '.mat'
    filename = join(data_home, matlab_name)

    # if the file does not exist, download it
    if not exists(filename):
        urlname = MLDATA_BASE_URL % quote(dataname)
        try:
            mldata_url = urlopen(urlname)
        except HTTPError as e:
            if e.code == 404:
                e.msg = "Dataset '%s' not found on mldata.org." % dataname
            raise
        # store Matlab file
        try:
            with open(filename, 'w+b') as matlab_file:
                copyfileobj(mldata_url, matlab_file)
        except:
            os.remove(filename)
            raise
        mldata_url.close()

    # load dataset matlab file
    with open(filename, 'rb') as matlab_file:
        matlab_dict = io.loadmat(matlab_file, struct_as_record=True)

    # -- extract data from matlab_dict

    # flatten column names
    col_names = [str(descr[0])
                 for descr in matlab_dict['mldata_descr_ordering'][0]]

    # if target or data names are indices, transform them into names
    if isinstance(target_name, numbers.Integral):
        target_name = col_names[target_name]
    if isinstance(data_name, numbers.Integral):
        data_name = col_names[data_name]

    # rules for making sense of the mldata.org data format
    # (earlier ones have priority):
    # 1) there is only one array => it is "data"
    # 2) there are multiple arrays
    #    a) copy all columns in the bunch, using their column name
    #    b) if there is a column called `target_name`, set "target" to it,
    #       otherwise set "target" to first column
    #    c) if there is a column called `data_name`, set "data" to it,
    #       otherwise set "data" to second column
    dataset = {'DESCR': 'mldata.org dataset: %s' % dataname,
               'COL_NAMES': col_names}

    # 1) there is only one array => it is considered data
    if len(col_names) == 1:
        data_name = col_names[0]
        dataset['data'] = matlab_dict[data_name]
    # 2) there are multiple arrays
    else:
        for name in col_names:
            dataset[name] = matlab_dict[name]

        if target_name in col_names:
            del dataset[target_name]
            dataset['target'] = matlab_dict[target_name]
        else:
            del dataset[col_names[0]]
            dataset['target'] = matlab_dict[col_names[0]]

        if data_name in col_names:
            del dataset[data_name]
            dataset['data'] = matlab_dict[data_name]
        else:
            del dataset[col_names[1]]
            dataset['data'] = matlab_dict[col_names[1]]

    # set axes to scikit-learn conventions
    if transpose_data:
        dataset['data'] = dataset['data'].T
    if 'target' in dataset:
        if not sp.sparse.issparse(dataset['target']):
            dataset['target'] = dataset['target'].squeeze()

    return Bunch(**dataset)
def get_acquire_url(self):
    site = self._get_site()
    offering_id = urllib2.quote(self.owner_organization.name + '/' +
                                self.name + '/' + self.version)
    return urljoin(site, 'offering/' + offering_id)
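# For illustration: quote() leaves '/' unescaped by default, so the
# organization/name/version components above remain separate URL path
# segments; passing safe='' would percent-encode the slashes as well.
# A standalone example (the values are made up):
import urllib2
print urllib2.quote('org/my offering/1.0')            # org/my%20offering/1.0
print urllib2.quote('org/my offering/1.0', safe='')   # org%2Fmy%20offering%2F1.0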
def updateship(cmdr, shipid, shiptype, props=[]):
    if shipid is not None and shiptype:
        args = '&shipId=%d&type=%s' % (shipid, shiptype)
        for (slot, thing) in props:
            args += '&%s=%s' % (slot, urllib2.quote(unicode(thing)))
        call(cmdr, 'api-commander-v1/update-ship', args)
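# Caveat worth noting: in Python 2, quote() on a unicode string that
# contains non-ASCII characters raises a KeyError; encoding to UTF-8 first
# avoids that. A standalone illustration (the value is made up):
import urllib2
thing = u'Andr\xe9'
print urllib2.quote(thing.encode('utf-8'))  # Andr%C3%A9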
    global all_filter_totals
    all_filter_totals += 1

    print("========================== page %s scraping finished ================\n" % (page_num))
    if savefile == 1:
        #logfile.write("========================== page " + page_num + " scraping finished ================\n")
        logfile.close()


if __name__ == '__main__':
    # Get the start time
    starttime = datetime.datetime.now()
    show_logo()
    key = raw_input('\033[1;33;40mplease input keyword:')
    key = key.encode('utf-8')
    key = urllib2.quote(key)
    page = int(raw_input("Number of pages to search:"))
    for i in range(page):
        page_pn = (i * baidu_page_size)
        baidu_search(key, page_pn)
    # Get the end time
    endtime = datetime.datetime.now()
    runtime = (endtime - starttime).seconds
    print("\033[1;36;40m%d found | %d checked | %d filtered | %d deleted | "
          "the program ran in %s seconds\033[1;37;40m"
          % (all_totals, all_checked_totals, all_filter_totals,
             all_delete_totals, runtime))