class MetricServiceRequest(object):
    """
    A convenience class for fetching metrics from CentralQuery that can be
    used by twisted daemons.
    """
    # use a shared cookie jar so all Metric requests can share the same session
    cookieJar = CookieJar()

    def __init__(self, userAgent):
        self._aggMapping = AGGREGATION_MAPPING
        urlstart = getGlobalConfiguration().get('metric-url', 'http://localhost:8080')
        self._metric_url = '%s/%s' % (urlstart, METRIC_URL_PATH)
        self._metric_url_v2 = '%s/%s' % (urlstart, WILDCARD_URL_PATH)
        creds = IAuthorizationTool(None).extractGlobalConfCredentials()
        auth = base64.b64encode('{login}:{password}'.format(**creds))
        self.agent = CookieAgent(
            Agent(reactor, pool=getPool(), connectTimeout=30), self.cookieJar)
        self._headers = Headers({
            'Authorization': ['basic %s' % auth],
            'content-type': ['application/json'],
            'User-Agent': ['Zenoss: %s' % userAgent]
        })
        self.onMetricsFetched = None

    def getMetrics(self, uuid, dpNames, cf='AVERAGE', rate=False,
                   downsample="1h-avg", start=None, end=None, deviceId=None,
                   returnSet="EXACT"):
        metrics = []
        if isinstance(dpNames, basestring):
            dpNames = [dpNames]
        for dpName in dpNames:
            # TODO find callers
            name = ensure_prefix(deviceId, dpName)
            metrics.append(
                dict(metric=name,
                     aggregator=self._aggMapping.get(cf.lower(), cf.lower()),
                     rpn='',
                     rate=rate,
                     format='%.2lf',
                     tags=dict(contextUUID=[uuid]),
                     name='%s_%s' % (uuid, dpName)))
        request = dict(returnset=returnSet,
                       start=start,
                       end=end,
                       downsample=downsample,
                       metrics=metrics)
        body = FileBodyProducer(StringIO(json.dumps(request)))
        d = self.agent.request('POST', self._metric_url, self._headers, body)
        return d

    def fetchMetrics(self, metrics, start="1h-ago", end=None, returnSet="EXACT"):
        """
        Uses the CentralQuery V2 api to fetch metrics. Mainly that means wild
        cards can be used to fetch all metrics with the same name grouped by a
        tag. Usually used to retrieve a specific metric for all components on
        a device.
        :param metrics: iterable of dictionaries, each with required keys
            metricName and tags; optional keys: rpn (defaults to empty),
            cf (defaults to average), rate (defaults to False),
            downsample (defaults to 5m-avg)
        :param start:
        :param end:
        :param returnSet:
        :return: deferred
        """
        metricQueries = []
        for metric in metrics:
            log.info("fetchMetrics metrics %s", metric)
            cf = metric.get('cf', 'average')
            rpn = metric.get('rpn', '')
            rate = metric.get('rate', False)
            tags = metric['tags']
            downsample = metric.get('downsample', '5m-avg')
            metricName = metric['metricName']
            metricQueries.append(
                dict(metric=metricName,
                     downsample=downsample,
                     aggregator=self._aggMapping.get(cf.lower(), cf.lower()),
                     rpn=rpn,
                     rate=rate,
                     format='%.2lf',
                     tags=tags,
                     name=metricName))
        request = dict(returnset=returnSet,
                       start=start,
                       end=end,
                       downsample=downsample,
                       queries=metricQueries)
        body = FileBodyProducer(StringIO(json.dumps(request)))
        log.info("POST %s %s %s", self._metric_url_v2, self._headers,
                 json.dumps(request))
        d = self.agent.request('POST', self._metric_url_v2, self._headers, body)
        return d
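# A minimal usage sketch for MetricServiceRequest (not part of the original
# source). It assumes a running Twisted reactor; the UUID, datapoint name and
# device id are placeholders, and reading the body with
# twisted.web.client.readBody is our assumption about how callers consume the
# deferred response.
from twisted.web.client import readBody

def _on_response(response):
    # the deferred fires with an IResponse; the JSON body is read separately
    return readBody(response)

svc = MetricServiceRequest(userAgent='example-daemon')
d = svc.getMetrics('device-uuid', ['sysUpTime'], deviceId='mydevice',
                   start='1h-ago')
d.addCallback(_on_response)
d.addCallback(lambda body: json.loads(body))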
                                 count=200)
if len(results) > 0:
    for tweet in results:
        if len(tweet['entities']['urls']) >= 1:
            url = tweet['entities']['urls'][0]['url']
            txt = spliter.splitCleanTweet2Sents(tweet['text'])
            original_sentence = ''
            for sentence in txt:
                original_sentence = original_sentence + sentence + ' '
            if len(original_sentence.split()) <= 6:
                continue
            #print tweet['text']
            #print url
            try:
                time.sleep(5)
                cj = CookieJar()
                opener = urllib2.build_opener(
                    urllib2.HTTPCookieProcessor(cj))
                result = opener.open(url)
                real_url = result.geturl()
                if real_url.find('.html') != -1:
                    real_url = real_url[:real_url.find('.html') + 5]
                #elif real_url.find('&')!=-1:
                #    real_url=real_url[:real_url.find('&')]
                #elif real_url.find('?')!=-1:
                #    real_url=real_url[:real_url.find('?')]
                #print real_url
                if real_url in real_urls:
                    continue
                query = twitter.search.tweets(q=real_url, lang="en",
def request(self):
    self.response_text = self._DEFAULT_RESPONSE
    if not self.exceptions:
        self.exceptions = Exception
    if not self.wait:
        self.wait = time.sleep
    if not self.headers:
        self.headers = {}
    for i in xrange(self.tries):
        self.current_tries = i + 1
        if self.before_request:
            self.before_request(self)
        if self.cancel_operation and self.cancel_operation():
            break
        request_report = 'Request URL: ' + self.get_url_for_report(self.url)
        request_report += '\nRequest data: ' + Utils.str(self.data)
        request_report += '\nRequest headers: ' + Utils.str(self.get_headers_for_report(self.headers))
        response_report = '<response_not_set>'
        response = None
        rex = None
        try:
            Logger.debug(request_report)
            req = urllib2.Request(self.url, self.data, self.headers)
            response = urllib2.urlopen(req)
            self.response_code = response.getcode()
            self.response_info = response.info()
            self.response_url = response.geturl()
            cookiejar = CookieJar()
            cookiejar._policy._now = cookiejar._now = int(time.time())
            self.response_cookies = cookiejar.make_cookies(response, req)
            if self.read_content:
                self.response_text = response.read()
            content_length = self.response_info.getheader('content-length', -1)
            response_report = '\nResponse Headers:\n%s' % Utils.str(self.response_info)
            response_report += '\nResponse (%d) content-length=%s, len=<%s>:\n%s' % (
                self.response_code, content_length, len(self.response_text), self.response_text)
            self.success = True
            break
        except self.exceptions as e:
            Logger.debug('Exception...')
            root_exception = e
            response_report = '\nResponse <Exception>: '
            if isinstance(e, urllib2.HTTPError):
                self.response_text = Utils.str(e.read())
                response_report += self.response_text
            else:
                response_report += Utils.str(e)
            rex = RequestException(Utils.str(e), root_exception, request_report, response_report)
        finally:
            Logger.debug(response_report)
            if response:
                response.close()
        if rex:
            if self.on_exception:
                Logger.debug('calling self.on_exception...')
                self.on_exception(self, rex)
            if self.cancel_operation and self.cancel_operation():
                break
            Logger.debug('current_tries: ' + str(self.current_tries) +
                         ' maximum tries: ' + str(self.tries) + ' i: ' + str(i))
            if self.current_tries == self.tries:
                Logger.debug('max retries reached')
                if self.on_failure:
                    self.on_failure(self)
                if self.on_complete:
                    self.on_complete(self)
                Logger.debug('Raising exception...')
                raise rex
            current_time = time.time()
            max_waiting_time = current_time + self.current_delay
            Logger.debug('current_delay: ' + str(self.current_delay) + ' seconds. Waiting...')
            while (not self.cancel_operation or not self.cancel_operation()) and max_waiting_time > current_time:
                remaining = round(max_waiting_time - current_time)
                if self.waiting_retry:
                    Logger.debug('calling self.waiting_retry...')
                    self.waiting_retry(self, remaining)
                self.wait(1)
                current_time = time.time()
            Logger.debug('Done waiting.')
            self.current_delay *= self.backoff
    if self.success and self.on_success:
        self.on_success(self)
    if self.on_complete:
        self.on_complete(self)
    return self.response_text
def __init__(self, uuid, token, cert_file):
    agent = Agent(uuid, token, cert_file)
    jar = CookieJar()
    self._agent = CookieAgent(agent, jar)
    super(self.__class__, self).__init__(self._agent)
def test_cookiejar():
    with wsgiserver(set_cookie()):
        useragent = UserAgent(cookiejar=CookieJar())
        assert b"" == useragent.urlopen('http://127.0.0.1:54323/').read()
#coding:utf8
import urllib2
import random
import json
import re
from cookielib import CookieJar
from pyquery import PyQuery as pq

# cookiejar to help deal with cookie
cj_iut = CookieJar()
opener_iut = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj_iut))

# result file
website_result_file = '../../result/iresearch/iut_service_traffic.txt'
domain_result_file = '../../result/iresearch/iut_service_domain_traffic.txt'

# regular expressions to analyse response
p_month = re.compile(r'dtListM\[\d+]\[3]="(\d{4}-\d{2})"')
p_category_traffic = re.compile(r'iut_data = (\[.*]);')
p_category_title = re.compile(r'iut_title =(\[[\S\s]*?]);')
p_main_category = re.compile(r'selected >(.*?)</option>')
p_page = re.compile(r'</select> /(\d*)(?=</td>)')
p_category = re.compile(r'<option value="(\d*)" \s*>.*</option>')
p_login = re.compile(r'您目前尚未登录或者登录已超时')

# category and date to scrape
month_period = []
categories = []

# to keep track of service traffic running state
def __init__(self, email, password):
    self.email = email
    self.password = password
    self.cj = CookieJar()
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

from os import name
from cookielib import CookieJar

from lib.settings import W, BW, R, G, O, B, P, C, GR

cookie_handler = CookieJar()


def color(text, color=GR):
    """
    Sets the text to a given color if not running under Windows.
    """
    if name == "nt":
        return text
    else:
        return "%s%s%s" % (color, text, W)
def send_request(url, method, data, args, params, headers, cookies, timeout,
                 is_json, verify_cert):
    """
    Forge and send HTTP request.
    """
    ## Parse url args
    for p in args:
        url = url.replace(':' + p, str(args[p]))

    try:
        if data:
            if is_json:
                headers['Content-Type'] = 'application/json'
                data = json.dumps(data)
            request = requests.Request(method.upper(), url, data=data,
                                       params=params, headers=headers,
                                       cookies=cookies)
        else:
            request = requests.Request(method.upper(), url, params=params,
                                       headers=headers, cookies=cookies)

        ## Prepare and send HTTP request.
        session = requests.Session()
        session.verify = verify_cert
        r = session.send(request.prepare(), timeout=timeout)
        session.close()
    except requests.exceptions.Timeout:
        return {
            'data': {},
            'cookies': CookieJar(),
            'content_type': '',
            'status': 0,
            'is_json': False,
            'timeout': True
        }

    try:
        content_type = r.headers.get('Content-Type', 'application/json')
        response = r.json()
        isjson = True
    except json.decoder.JSONDecodeError:
        content_type = r.headers.get('Content-Type', 'text/html')
        response = r.text
        isjson = False

    return {
        'data': response,
        'cookies': r.cookies,
        'content_type': content_type,
        'status': r.status_code,
        'is_json': isjson,
        'timeout': False
    }
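# Illustrative call of send_request above (not from the original source):
# the URL, the ':id' path parameter and the payload are placeholders only.
result = send_request(url='https://api.example.com/users/:id',
                      method='post',
                      data={'name': 'alice'},
                      args={'id': 42},      # substituted for ':id' in the url
                      params={},
                      headers={},
                      cookies={},
                      timeout=10,
                      is_json=True,
                      verify_cert=True)
if not result['timeout'] and result['is_json']:
    print(result['data'])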
RATING_PROPER = 1
RATING_NUKED = 2

CODEC_UNKNOWN = 0
CODEC_XVID = 1
CODEC_H264 = 2
CODEC_MP3 = 3
CODEC_AAC = 4
CODEC_AC3 = 5
CODEC_DTS = 6
CODEC_DTSHD = 7
CODEC_DTSHDMA = 8

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.66 Safari/537.36"

COOKIE_JAR = CookieJar()
urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor(COOKIE_JAR)))


class closing(object):
    def __init__(self, thing):
        self.thing = thing

    def __enter__(self):
        return self.thing

    def __exit__(self, *exc_info):
        self.thing.close()


def parse_json(data):
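# Small sketch (not from the original source) of how the installed opener might
# be used: once install_opener has run, plain urllib2.urlopen calls share the
# module-level cookie jar. The URL is a placeholder and we assume the response
# body is JSON so parse_json applies.
req = urllib2.Request("http://example.com/api/list",
                      headers={"User-Agent": USER_AGENT})
with closing(urllib2.urlopen(req)) as response:
    payload = parse_json(response.read())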
def init(self):
    self._client = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(CookieJar()))
    self.url = self.resolve_file_url(self._resolver_class, self._url)
    if not self.url:
        raise HTTPLoader.Error('Url was not resolved to file link')
def wsgi_app(monkeypatch, recreate_openapi_spec):
    wsgi_callable = make_app()
    cookies = CookieJar()
    return WebTestAppForCMK(wsgi_callable, cookiejar=cookies)
def module_run(self, domains):
    base_url = 'https://www.bing.com/search'
    cnt = 0
    new = 0
    for domain in domains:
        self.heading(domain, level=0)
        base_query = 'domain:' + domain
        pattern = '"b_algo"><h2><a href="(?:\w*://)*(\S+?)\.%s[^"]*"' % (domain)
        subs = []
        # control variables
        new = True
        page = 0
        nr = 50
        cookiejar = CookieJar()
        cookiejar.set_cookie(
            self.make_cookie('SRCHHPGUSR',
                             'NEWWND=0&NRSLT=%d&SRCHLANG=&AS=1' % (nr),
                             '.bing.com'))
        # execute search engine queries and scrape results storing subdomains in a list
        # loop until no new subdomains are found
        while new == True:
            content = None
            query = ''
            # build query based on results of previous results
            for sub in subs:
                query += ' -domain:%s.%s' % (sub, domain)
            full_query = base_query + query
            url = '%s?first=%d&q=%s' % (base_url, (page * nr),
                                        urllib.quote_plus(full_query))
            # bing errors out at > 2059 characters not including the protocol
            if len(url) > 2066:
                url = url[:2066]
            self.verbose('URL: %s' % (url))
            # send query to search engine
            resp = self.request(url, cookiejar=cookiejar)
            if resp.status_code != 200:
                self.alert(
                    'Bing has encountered an error. Please submit an issue for debugging.')
                break
            content = resp.text
            sites = re.findall(pattern, content)
            # create a unique list
            sites = list(set(sites))
            new = False
            # add subdomain to list if not already exists
            for site in sites:
                if site not in subs:
                    subs.append(site)
                    new = True
                    host = '%s.%s' % (site, domain)
                    self.output('%s' % (host))
                    new += self.add_hosts(host)
            if not new:
                # exit if all subdomains have been found
                if not '>Next</a>' in content:
                    break
                else:
                    page += 1
                    self.verbose(
                        'No New Subdomains Found on the Current Page. Jumping to Result %d.'
                        % ((page * nr) + 1))
                    new = True
            # sleep script to avoid lock-out
            self.verbose('Sleeping to avoid lockout...')
            time.sleep(random.randint(5, 15))
        cnt += len(subs)
    self.summarize(new, cnt)
def reset(self):
    self._cookie_jar = CookieJar()
    self._opener = build_opener(NoRedirectionProcessor,
                                HTTPCookieProcessor(self._cookie_jar))
        return False
    else:
        return True


# Import user created settings. This will override built-in settings if defined.
if module_exists("config"):
    import config
else:
    print(
        "Please set up the config.py file. Copy 'sample.config.py' to 'config.py' and set up options"
    )
    sys.exit(2)

# Initialize cookie jar and session
cookies = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))

print("Load login page")
### Load the login page. This will initialize some cookies. Save them.
login_page = opener.open(
    'https://disneyworld.disney.go.com/login/?returnUrl=https://mydisneyphotopass.disney.go.com/'
)
# cookies are automatically saved.

# grab the unique CSRF key. parse it.
csrf_key = re.search('id="pep_csrf" value=".*"', login_page.read())
csrf_key = csrf_key.group(0)
csrf_key = string.split(csrf_key, "\"")  # split on double quote. easiest way.
csrf_key = csrf_key[
def wang(self, value):
    try:
        print '\n***** ' + self.baseUrl + ' *****'
        # abort if we have been asked to stop opening links
        if self.isClose:
            return
        # proxy = urllib2.ProxyHandler({'http': '' + ip + ''})
        # opener = urllib2.build_opener(proxy)
        cj = CookieJar()
        cookieHandle = urllib2.HTTPCookieProcessor(cj)
        opener = urllib2.build_opener(cookieHandle)
        opener.addheaders = [
            # ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
            # ('Accept-Encoding', 'gzip, deflate, sdch'),
            # ('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6'),
            # ('Cache-Control', 'max-age=0'),
            # ('Connection', 'keep-alive'),
            # ('DNT', '1'),
            # ('Upgrade-Insecure-Requests', '1'),
            # ('Host', 'www.laifudao.com'),
            # ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36')
        ]
        url = self.baseUrl + str(value) + self.baseUrl2
        print url
        o = opener.open(url, timeout=10)
        d = o.read()
        pattern = re.compile(
            '<header class="post-header">(.*?)</a></h1>.*?title="(.*?)".*?<time>(.*?)</time>.*?<span class="cats">.*?>(.*?)</a>.*?"article-content">(.*?)</section>',
            re.S)
        results = re.findall(pattern, d)
        for i in results:
            # abort if we have been asked to stop opening links
            if self.isClose:
                return
            title = self.tool.replace(i[0])
            author = self.tool.replace(i[1])
            online = self.tool.replace(i[2])
            type = self.tool.replace(i[3])
            content = self.tool.replace(i[4])
            # compare with the newest row in the database; if it matches, stop
            for old in self.oldDatas:
                # the database query returns unicode, convert it to utf-8
                o = old[0].encode('utf-8')
                if title == o:
                    # signal the interruption
                    self.isClose = True
                    return
            # insert the new row
            time.sleep(0.1)
            sql = "insert into `lf_wangwen` (`pid`, `title`, `content`, `online_time`, `author`, `type`, `create_by`, `update_by`, `create_time`, `update_time`, `status`) values (uuid(), '" + title + "','" + content + "','" + online + "','" + author + "','" + type + "','admin','admin',now(),now(),0);"
            print sql
            self.db.insertDB(sql)
    except urllib2.HTTPError, e:
        print 'HTTPError: ' + str(e.code)
        return False
def gesdisc_merra_sync(DIRECTORY, YEARS, USER='', PASSWORD='', LOG=False,
                       LIST=False, MODE=None, CLOBBER=False):
    #-- recursively create directory if non-existent
    os.makedirs(DIRECTORY, MODE) if not os.path.exists(DIRECTORY) else None

    #-- create log file with list of synchronized files (or print to terminal)
    if LOG:
        #-- format: NASA_GESDISC_MERRA2_sync_2002-04-01.log
        today = time.strftime('%Y-%m-%d', time.localtime())
        LOGFILE = 'NASA_GESDISC_MERRA2_sync_{0}.log'.format(today)
        fid = open(os.path.join(DIRECTORY, LOGFILE), 'w')
        print('NASA MERRA-2 Sync Log ({0})'.format(today), file=fid)
    else:
        #-- standard output (terminal output)
        fid = sys.stdout

    #-- https://docs.python.org/3/howto/urllib2.html#id5
    #-- create a password manager
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    #-- Add the username and password for NASA Earthdata Login system
    password_mgr.add_password(None, 'https://urs.earthdata.nasa.gov',
                              USER, PASSWORD)
    #-- compile HTML parser for lxml
    parser = lxml.etree.HTMLParser()
    #-- Create cookie jar for storing cookies. This is used to store and return
    #-- the session cookie given to us by the data server (otherwise it will
    #-- just keep sending us back to Earthdata Login to authenticate).
    cookie_jar = CookieJar()
    #-- create "opener" (OpenerDirector instance)
    opener = urllib2.build_opener(
        urllib2.HTTPBasicAuthHandler(password_mgr),
        urllib2.HTTPSHandler(context=ssl.SSLContext()),
        urllib2.HTTPCookieProcessor(cookie_jar))
    #-- Now all calls to urllib2.urlopen use our opener.
    urllib2.install_opener(opener)
    #-- All calls to urllib2.urlopen will now use handler
    #-- Make sure not to include the protocol in with the URL, or
    #-- HTTPPasswordMgrWithDefaultRealm will be confused.

    #-- MERRA-2 data remote base directory
    HOST = posixpath.join('http://goldsmr4.gesdisc.eosdis.nasa.gov', 'data',
                          'MERRA2_MONTHLY')
    #-- compile regular expression operator for years to sync
    regex_pattern = '|'.join('{0:d}'.format(y) for y in YEARS)
    R1 = re.compile('({0})'.format(regex_pattern), re.VERBOSE)
    #-- compile regular expression operator to find MERRA2 files
    R2 = re.compile('MERRA2_(.*?).nc4(.xml)?', re.VERBOSE)
    #-- for each MERRA-2 product to sync
    for PRODUCT in ['M2TMNXINT.5.12.4', 'M2TMNXGLC.5.12.4']:
        print('PRODUCT={0}'.format(PRODUCT), file=fid)
        #-- open connection with GESDISC server at remote directory
        req = urllib2.Request(url=posixpath.join(HOST, PRODUCT))
        #-- read and parse request for subdirectories (find column names)
        tree = lxml.etree.parse(urllib2.urlopen(req), parser)
        colnames = tree.xpath('//tr/td[not(@*)]//a/@href')
        #-- find remote yearly directories for PRODUCT
        remote_sub = [sd for sd in colnames if R1.match(sd)]
        for Y in remote_sub:
            #-- check if local directory exists and recursively create if not
            if (not os.access(os.path.join(DIRECTORY, PRODUCT, Y), os.F_OK)):
                os.makedirs(os.path.join(DIRECTORY, PRODUCT, Y), MODE)
            #-- open connection with GESDISC server at remote directory
            req = urllib2.Request(url=posixpath.join(HOST, PRODUCT, Y))
            #-- read and parse request for files (find names and modified dates)
            tree = lxml.etree.parse(urllib2.urlopen(req), parser)
            colnames = tree.xpath('//tr/td[not(@*)]//a/@href')
            collastmod = tree.xpath('//tr/td[@align="right"][1]/text()')
            #-- find remote files for PRODUCT and YEAR
            remote_file_lines = [i for i, f in enumerate(colnames) if R2.match(f)]
            for i in remote_file_lines:
                #-- local and remote versions of the file
                FILE = colnames[i]
                local_file = os.path.join(DIRECTORY, PRODUCT, Y, FILE)
                remote_file = posixpath.join(HOST, PRODUCT, Y, FILE)
                #-- get last modified date of file and convert into unix time
                file_date = time.strptime(collastmod[i].rstrip(), '%d-%b-%Y %H:%M')
                remote_mtime = calendar.timegm(file_date)
                #-- copy file from remote directory checking modification times
                http_pull_file(fid, remote_file, remote_mtime, local_file,
                               LIST, CLOBBER, MODE)
        #-- close request
        req = None

    #-- close log file and set permissions level to MODE
    if LOG:
        fid.close()
        os.chmod(os.path.join(DIRECTORY, LOGFILE), MODE)
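#-- Illustrative invocation of gesdisc_merra_sync (not from the original
#-- source): the directory, year range and Earthdata credentials are
#-- placeholders; MODE is the octal permission applied to created directories.
gesdisc_merra_sync('/data/MERRA2', range(2000, 2005),
                   USER='earthdata_user', PASSWORD='********',
                   LOG=True, LIST=False, MODE=0o775, CLOBBER=False)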
def nsidc_icesat2_sync(ddir, PRODUCTS, RELEASE, VERSIONS, GRANULES, TRACKS,
                       USER='', PASSWORD='', YEARS=None, SUBDIRECTORY=None,
                       AUXILIARY=False, FLATTEN=False, LOG=False, LIST=False,
                       MODE=None, CLOBBER=False):
    #-- check if directory exists and recursively create if not
    os.makedirs(ddir, MODE) if not os.path.exists(ddir) else None

    #-- output of synchronized files
    if LOG:
        #-- format: NSIDC_IceBridge_sync_2002-04-01.log
        today = time.strftime('%Y-%m-%d', time.localtime())
        LOGFILE = 'NSIDC_IceSat-2_sync_{0}.log'.format(today)
        fid = open(os.path.join(ddir, LOGFILE), 'w')
        print('ICESat-2 Data Sync Log ({0})'.format(today), file=fid)
    else:
        #-- standard output (terminal output)
        fid = sys.stdout

    #-- https://docs.python.org/3/howto/urllib2.html#id5
    #-- create a password manager
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    #-- Add the username and password for NASA Earthdata Login system
    password_mgr.add_password(None, 'https://urs.earthdata.nasa.gov',
                              USER, PASSWORD)
    #-- Encode username/password for request authorization headers
    base64_string = base64.b64encode('{0}:{1}'.format(USER, PASSWORD).encode())
    #-- compile HTML parser for lxml
    parser = lxml.etree.HTMLParser()
    #-- Create cookie jar for storing cookies. This is used to store and return
    #-- the session cookie given to us by the data server (otherwise it will
    #-- just keep sending us back to Earthdata Login to authenticate).
    cookie_jar = CookieJar()
    #-- create "opener" (OpenerDirector instance)
    opener = urllib2.build_opener(
        urllib2.HTTPBasicAuthHandler(password_mgr),
        urllib2.HTTPSHandler(context=ssl.SSLContext()),
        urllib2.HTTPCookieProcessor(cookie_jar))
    #-- add Authorization header to opener
    authorization_header = "Basic {0}".format(base64_string.decode())
    opener.addheaders = [("Authorization", authorization_header)]
    #-- Now all calls to urllib2.urlopen use our opener.
    urllib2.install_opener(opener)
    #-- All calls to urllib2.urlopen will now use handler
    #-- Make sure not to include the protocol in with the URL, or
    #-- HTTPPasswordMgrWithDefaultRealm will be confused.

    #-- remote https server for ICESat-2 Data
    HOST = 'https://n5eil01u.ecs.nsidc.org'
    #-- regular expression operator for finding files of a particular granule
    #-- find ICESat-2 HDF5 files in the subdirectory for product and release
    regex_track = '|'.join(['{0:04d}'.format(T) for T in TRACKS])
    regex_granule = '|'.join(['{0:02d}'.format(G) for G in GRANULES])
    regex_version = '|'.join(['{0:02d}'.format(V) for V in VERSIONS])
    regex_suffix = '(.*?)' if AUXILIARY else '(h5)'
    remote_regex_pattern = (
        '{0}(-\d{{2}})?_(\d{{4}})(\d{{2}})(\d{{2}})(\d{{2}})'
        '(\d{{2}})(\d{{2}})_({1})(\d{{2}})({2})_({3})_({4})(.*?).{5}$')

    #-- regular expression operator for finding subdirectories
    if SUBDIRECTORY:
        #-- Sync particular subdirectories for product
        R2 = re.compile('(' + '|'.join(SUBDIRECTORY) + ')', re.VERBOSE)
    elif YEARS:
        #-- Sync particular years for product
        regex_pattern = '|'.join('{0:d}'.format(y) for y in YEARS)
        R2 = re.compile('({0}).(\d+).(\d+)'.format(regex_pattern), re.VERBOSE)
    else:
        #-- Sync all available subdirectories for product
        R2 = re.compile('(\d+).(\d+).(\d+)', re.VERBOSE)

    #-- for each icesat2 product listed
    for p in PRODUCTS:
        print('PRODUCT={0}'.format(p), file=fid)
        #-- get directories from remote directory (* splat operator)
        remote_directories = ['ATLAS', '{0}.{1}'.format(p, RELEASE)]
        d = posixpath.join(HOST, *remote_directories)
        req = urllib2.Request(url=d)
        #-- compile regular expression operator for product, release and version
        args = (p, regex_track, regex_granule, RELEASE, regex_version, regex_suffix)
        R1 = re.compile(remote_regex_pattern.format(*args), re.VERBOSE)
        #-- read and parse request for subdirectories (find column names)
        tree = lxml.etree.parse(urllib2.urlopen(req), parser)
        colnames = tree.xpath('//td[@class="indexcolname"]//a/@href')
        remote_sub = [sd for sd in colnames if R2.match(sd)]
        #-- for each remote subdirectory
        for sd in remote_sub:
            #-- local directory for product and subdirectory
            if FLATTEN:
                local_dir = os.path.expanduser(ddir)
            else:
                local_dir = os.path.join(ddir, '{0}.{1}'.format(p, RELEASE), sd)
            #-- check if data directory exists and recursively create if not
            os.makedirs(local_dir, MODE) if not os.path.exists(local_dir) else None
            #-- find ICESat-2 data files
            req = urllib2.Request(url=posixpath.join(d, sd))
            #-- read and parse request for remote files (columns and dates)
            tree = lxml.etree.parse(urllib2.urlopen(req), parser)
            colnames = tree.xpath('//td[@class="indexcolname"]//a/@href')
            collastmod = tree.xpath('//td[@class="indexcollastmod"]/text()')
            #-- find matching files (for granule, release, version, track)
            remote_file_lines = [i for i, f in enumerate(colnames) if R1.match(f)]
            #-- sync each ICESat-2 data file
            for i in remote_file_lines:
                #-- remote and local versions of the file
                remote_file = posixpath.join(d, sd, colnames[i])
                local_file = os.path.join(local_dir, colnames[i])
                #-- get last modified date and convert into unix time
                LMD = time.strptime(collastmod[i].rstrip(), '%Y-%m-%d %H:%M')
                remote_mtime = calendar.timegm(LMD)
                #-- sync ICESat-2 files with NSIDC server
                http_pull_file(fid, remote_file, remote_mtime, local_file,
                               LIST, CLOBBER, MODE)
        #-- close request
        req = None

    #-- close log file and set permissions level to MODE
    if LOG:
        fid.close()
        os.chmod(os.path.join(ddir, LOGFILE), MODE)
def play_link(chn, src):
    item = xbmcgui.ListItem(chn)
    d_progress = xbmcgui.DialogProgress()
    d_progress.create("", addon.getLocalizedString(30009))

    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

    # login if required
    if src in data['sources'] and 'login' in data['sources'][src] and data['sources'][src]['login'] == "true":
        url = data['sources'][src]['url']
        values = data['sources'][src]['post']
        post_data = urllib.urlencode(values)
        response = opener.open(url, post_data)
        the_page = response.read()

    # m3u8 url from fpt
    if data['channels'][chn]['src']['playpath'] == "m3u8_fpt":
        url = 'http://fptplay.net/show/getlinklivetv'
        page_id = data['channels'][chn]['src']['page_id']
        page_q = data['channels'][chn]['src']['page_q']
        values = {
            'id': page_id,
            'type': 'newchannel',
            'quality': page_q,
            'mobile': 'web'
        }
        post_data = urllib.urlencode(values)
        header = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'fptplay.net',
            'Origin': 'http://fptplay.net',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'http://fptplay.net/livetv'
        }
        req = urllib2.Request(url, post_data, header)
        response = urllib2.urlopen(req)
        the_page = response.read()
        the_data = json.loads(the_page)
        full_url = the_data['stream']
    # m3u8 url from tvnet
    elif data['channels'][chn]['src']['playpath'] == "m3u8_tvnet":
        url = 'http://118.107.85.21:1337/get-stream.json?p=smil:' + data['channels'][chn]['src']['page_id'] + '.smil&t=l'
        stringA = opener.open(url).read().decode('utf-8')
        stringB = '"url": "'
        stringC = '"'
        full_url_BC = re.search(stringB + "(.*?)" + re.escape(stringC), stringA).group(1)
        full_url = full_url_BC
        print full_url
    # m3u8 url using before & after marker
    elif data['channels'][chn]['src']['playpath'] == "m3u8_bau":
        if data['channels'][chn]['src'].get('header'):
            header = (data['channels'][chn]['src']['header'])
        else:
            header = None
        if data['channels'][chn]['src'].get('post'):
            post = data['channels'][chn]['src']['post']
        else:
            post = None
        if header == None:
            req = urllib2.Request(data['channels'][chn]['src']['page_url'], post)
        else:
            req = urllib2.Request(data['channels'][chn]['src']['page_url'], post, header)
        response = urllib2.urlopen(req)
        #print(response)
        the_page = response.read()
        #print(the_page)
        stringA = the_page
        stringB = (data['channels'][chn]['src']['url_before'])
        stringC = (data['channels'][chn]['src']['url_after'])
        full_url = re.search(re.escape(stringB) + "(.*?)" + re.escape(stringC), stringA).group(1)
        print(full_url)
    # traditional rtmp(e)
    else:
        videoUrl = data['sources'][src]['url']
        playpath = data['channels'][chn]['src']['playpath']
        if (playpath != ''):
            videoUrl = videoUrl + "/" + playpath
        url_protocol = videoUrl.split(':')[0]
        if (url_protocol == "http"):
            full_url = videoUrl
        elif (url_protocol in ["rtmp", "rtmpe"]):
            swfUrl = data['sources'][src]['swfurl']
            pageUrl = data['sources'][src]['pageurl']
            if (data['channels'][chn]['src']['referer'] != ''):
                pageUrl = pageUrl + "/" + data['channels'][chn]['src']['referer']
            flashVer = 'LNX_11,2,202,233'
            token = data['sources'][src]['token']
            app = data['sources'][src]['app']
            full_url = videoUrl + ' swfVfy=1 live=1 token=' + token + ' playpath=' + playpath + ' flashVer=' + flashVer + ' pageUrl=' + pageUrl + ' tcUrl=' + videoUrl + ' swfUrl=' + swfUrl

    d_progress.close()
    xbmc.Player().play(full_url)
    return
def __init__(self):
    super(WebService, self).__init__()
    self._cookie = CookieJar()
    self._opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(self._cookie))
    self.query_interval = 1.0
def downNASAEarthdata(productname, **kwargs):
    from cookielib import CookieJar
    TRMM_DAILY = False
    TRMM_MONTH = False
    if StringMatch(productname, "TRMM_3B42_Daily"):
        TRMM_DAILY = True
    elif StringMatch(productname, "TRMM_3B43"):
        TRMM_MONTH = True
    usrname = ''
    pwd = ''
    startdate = datetime.datetime.today()
    enddate = datetime.datetime.today()
    outpath = ''
    # try to get the required key-values, or throw exception
    try:
        usrname = kwargs["usrname"]
        pwd = kwargs["pwd"]
        startdate = kwargs["startdate"]
        enddate = kwargs["enddate"]
        outpath = kwargs["workspace"]
    except KeyError:
        print ("downNASAEarthdata function must have the usrname, pwd, startdate, enddate, and workspace args.")
    # try to get optional key-values
    logfile = None
    if 'log' in kwargs.keys():
        logfile = kwargs['log']
        delfile(logfile)

    authorizeUrl = "https://urs.earthdata.nasa.gov"
    # Create a password manager to deal with the 401 response that is returned from authorizeUrl
    password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_manager.add_password(None, authorizeUrl, usrname, pwd)

    # Create a cookie jar for storing cookies. This is used to store and return
    # the session cookie given to us by the data server (otherwise it will just
    # keep sending us back to Earthdata Login to authenticate). Ideally, we
    # should use a file based cookie jar to preserve cookies between runs. This
    # will make it much more efficient.
    cookie_jar = CookieJar()

    # Install all the handlers.
    opener = urllib2.build_opener(
        urllib2.HTTPBasicAuthHandler(password_manager),
        # urllib2.HTTPHandler(debuglevel=1),   # Uncomment these two lines to see
        # urllib2.HTTPSHandler(debuglevel=1),  # details of the requests/responses
        urllib2.HTTPCookieProcessor(cookie_jar))
    urllib2.install_opener(opener)

    downUrl = "http://disc2.gesdisc.eosdis.nasa.gov/"
    if TRMM_DAILY:
        downUrl += "data//TRMM_L3/TRMM_3B42_Daily.7/%s/%s/3B42_Daily.%s.7.nc4"
    elif TRMM_MONTH:
        downUrl += "opendap/TRMM_L3/TRMM_3B43.7/%s/%s/3B43.%s.7.HDF.nc"
    tmpdate = startdate
    while tmpdate <= enddate:
        if TRMM_DAILY:
            tmpUrl = downUrl % (tmpdate.strftime('%Y'), tmpdate.strftime('%m'),
                                tmpdate.strftime('%Y%m%d'))
            deltadays = 1
        elif TRMM_MONTH:
            # get the first day of current month
            tmpdate = tmpdate.replace(day=1)
            tmpUrl = downUrl % (tmpdate.strftime('%Y'), str(doy(tmpdate)).zfill(3),
                                tmpdate.strftime('%Y%m%d'))
            deltadays = GetDayNumber(tmpdate.year, tmpdate.month)
        saveName = tmpUrl.split("/")[-1]
        tmpfile = outpath + os.sep + saveName
        print2log(" -- %s, saved as %s\n" % (tmpdate.strftime('%Y%m%d'), saveName),
                  logfile=logfile)
        if isfileexist(tmpfile):
            tmpdate += datetime.timedelta(days=deltadays)
            continue
        while True:
            # Create and submit the request.
            try:
                print2log(tmpUrl, logfile=logfile)
                request = urllib2.Request(tmpUrl)
                response = urllib2.urlopen(request)
                chunk_read(response, savepath=tmpfile, report_hook=chunk_report)
                break
            except (urllib2.HTTPError, urllib2.URLError), e:
                # print e.code
                if getattr(e, 'code', None) == 404 and TRMM_MONTH:
                    tmpUrl = tmpUrl.replace('7.HDF.nc', '7A.HDF.nc')
                    continue
                else:
                    break
        tmpdate += datetime.timedelta(days=deltadays)
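# Illustrative call of downNASAEarthdata (not from the original source):
# the credentials, date range and workspace path are placeholders;
# startdate/enddate are datetime.datetime objects as the function expects.
downNASAEarthdata('TRMM_3B42_Daily',
                  usrname='earthdata_user', pwd='********',
                  startdate=datetime.datetime(2010, 1, 1),
                  enddate=datetime.datetime(2010, 1, 31),
                  workspace='/tmp/trmm')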
import urllib2
from urllib2 import urlopen
import re
import cookielib
from cookielib import CookieJar
import time

ob = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(ob))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]


def main():
    try:
        page = "https://www.huffingtonpost.com/section/taste/feed"
        sourcecode = opener.open(page).read()  # full sc
        #print sourcecode
        try:
            titles = re.findall(r'<title>(.*?)</title> ', sourcecode)
            links = re.findall(r'<link>(.*?)</link>', sourcecode)
            #for title in titles:
            #    print title
            for link in links:
                print 'Visiting', link
                linksource = opener.open(link).read()
                #print linksource
                content = re.findall(r'<div>(.*?)</div>', linksource)
                for theContent in content:
                    print theContent
from idasix import QtCore

import urllib
import urllib2
from cookielib import CookieJar
from json import loads, dumps

import exceptions

from . import config, logger

# building opener
cookiejar = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))

_threadpool = QtCore.QThreadPool()
_threadpool.setMaxThreadCount(config['network']['threadcount'])


class WorkerSignals(QtCore.QObject):
    result_dict = QtCore.pyqtSignal(dict)
    result_list = QtCore.pyqtSignal(list)
    result_str = QtCore.pyqtSignal(str)
    result_exception = QtCore.pyqtSignal(Exception)


class QueryWorker(QtCore.QRunnable):
    def __init__(self, method, url, server=None, token=None,
def main():
    try:
        ssl._create_default_https_context = ssl._create_unverified_context
        opener = wdf_urllib.build_opener(
            wdf_urllib.HTTPCookieProcessor(CookieJar()))
        wdf_urllib.install_opener(opener)
    except:
        pass

    if not getUUID():
        print('获取uuid失败')
        return

    showQRImage()
    time.sleep(1)

    while waitForLogin() != '200':
        pass

    os.remove(QRImagePath)

    if not login():
        print('登录失败')
        return

    if not webwxinit():
        print('初始化失败')
        return

    MemberList = webwxgetcontact()

    MemberCount = len(MemberList)
    print('通讯录共%s位好友' % MemberCount)

    ChatRoomName = ''
    result = []
    d = {}
    for Member in MemberList:
        d[Member['UserName']] = (Member['NickName'].encode('utf-8'),
                                 Member['RemarkName'].encode('utf-8'))
    print('开始查找...')
    group_num = int(math.ceil(MemberCount / float(MAX_GROUP_NUM)))
    for i in range(0, group_num):
        UserNames = []
        for j in range(0, MAX_GROUP_NUM):
            if i * MAX_GROUP_NUM + j >= MemberCount:
                break
            Member = MemberList[i * MAX_GROUP_NUM + j]
            UserNames.append(Member['UserName'])

        # create the group chat / add members
        if ChatRoomName == '':
            (ChatRoomName, DeletedList) = createChatroom(UserNames)
        else:
            DeletedList = addMember(ChatRoomName, UserNames)

        DeletedCount = len(DeletedList)
        if DeletedCount > 0:
            result += DeletedList

        # remove the members again
        deleteMember(ChatRoomName, UserNames)

        # progress bar
        progress_len = MAX_PROGRESS_LEN
        progress = '-' * progress_len
        progress_str = '%s' % ''.join(
            map(lambda x: '#', progress[:(progress_len * (i + 1)) / group_num]))
        print(''.join([
            '[', progress_str,
            ''.join('-' * (progress_len - len(progress_str))), ']'
        ]))
        print('新发现你被%d人删除' % DeletedCount)
        for i in range(DeletedCount):
            if d[DeletedList[i]][1] != '':
                print(d[DeletedList[i]][0] + '(%s)' % d[DeletedList[i]][1])
            else:
                print(d[DeletedList[i]][0])

        if i != group_num - 1:
            print('正在继续查找,请耐心等待...')
            # wait before the next round of API calls
            time.sleep(INTERFACE_CALLING_INTERVAL)

    # todo: delete the group chat
    print('\n结果汇总完毕,20s后可重试...')
    resultNames = []
    for r in result:
        if d[r][1] != '':
            resultNames.append(d[r][0] + '(%s)' % d[r][1])
        else:
            resultNames.append(d[r][0])

    print('---------- 被删除的好友列表(共%d人) ----------' % len(result))
    # filter out emoji
    resultNames = map(lambda x: re.sub(r'<span.+/span>', '', x), resultNames)
    if len(resultNames):
        print('\n'.join(resultNames))
    else:
        print("无")
    print('---------------------------------------------')
def __init__(self, url, **kwargs):
    self.request = None
    self.response = None
    self.code = -1
    self.info = {}
    self.cookieJar = None
    self.reason = ''

    data = kwargs.get('data', None)
    if data:
        if isinstance(data, dict):
            data = urlencode(data)
        if not isinstance(data, basestring):
            raise ValueError('data must be string or dict')

    request_type = kwargs.get('type', 'POST')
    if data and isinstance(request_type, basestring) and request_type.upper() != 'POST':
        url = '{}?{}'.format(url, data)
        data = None  # GET data must be None

    self.request = urlRequest(url, data)

    # referer
    referer = kwargs.get('referer', None)
    if referer:
        self.request.add_header('referer', referer)

    # user-agent
    user_agent = kwargs.get('user_agent', None)
    if user_agent:
        self.request.add_header('User-Agent', user_agent)

    # auth
    auth = kwargs.get('auth', None)
    if auth and isinstance(auth, dict) and 'usr' in auth:
        auth_string = base64.b64encode('{}:{}'.format(
            auth.get('usr', ''), auth.get('pwd', '')))
        self.request.add_header('Authorization', 'Basic {}'.format(auth_string))

    # cookie
    cookie = kwargs.get('cookie', None)
    cj = None
    if cookie:
        if isinstance(cookie, CookieJar):
            cj = cookie
        elif isinstance(cookie, dict):
            result = []
            for k, v in cookie.items():
                result.append('{}={}'.format(k, v))
            cookie = '; '.join(result)
        elif isinstance(cookie, Cookie.BaseCookie):
            cookie = cookie.output(header='')
        if isinstance(cookie, basestring):
            self.request.add_header('Cookie', cookie)
    if cj is None:
        cj = CookieJar()

    #! TODO: proxy

    # build opener
    debuglevel = 1 if kwargs.get('debug', False) else 0
    opener = build_opener(HTTPHandler(debuglevel=debuglevel),
                          HTTPSHandler(debuglevel=debuglevel),
                          HTTPCookieProcessor(cj))

    # timeout
    timeout = kwargs.get('timeout')
    if not isinstance(timeout, int):
        timeout = _DEFAULT_TIMEOUT

    try:
        self.response = opener.open(self.request, timeout=timeout)
        self.code = self.response.getcode()
        self.header = self.response.info().dict
        self.cookieJar = cj
    except HTTPError as e:
        self.code = e.code
        self.reason = '{}'.format(e)
        raise e
    except URLError as e:
        self.code = -1
        self.reason = e.reason
        raise e
    except Exception as e:
        self.code = -1
        self.reason = '{}'.format(e)
        raise e
def main():
    try:
        ssl._create_default_https_context = ssl._create_unverified_context
        opener = wdf_urllib.build_opener(
            wdf_urllib.HTTPCookieProcessor(CookieJar()))
        opener.addheaders = [
            ('User-agent',
             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36')]
        wdf_urllib.install_opener(opener)
    except:
        pass

    if not getUUID():
        print('获取uuid失败')
        return

    print('正在获取二维码图片...')
    showQRImage()
    time.sleep(1)

    while waitForLogin() != '200':
        pass

    os.remove(QRImagePath)

    if not login():
        print('登录失败')
        return

    if not webwxinit():
        print('初始化失败')
        return

    MemberList = webwxgetcontact()

    print('开启心跳线程')
    thread.start_new_thread(heartBeatLoop, ())

    MemberCount = len(MemberList)
    print('通讯录共%s位好友' % MemberCount)

    ChatRoomName = ''
    result = []
    d = {}
    for Member in MemberList:
        d[Member['UserName']] = (Member['NickName'].encode('utf-8'),
                                 Member['RemarkName'].encode('utf-8'))
    print('开始查找...')
    group_num = int(math.ceil(MemberCount / float(MAX_GROUP_NUM)))
    for i in range(0, group_num):
        UserNames = []
        for j in range(0, MAX_GROUP_NUM):
            if i * MAX_GROUP_NUM + j >= MemberCount:
                break
            Member = MemberList[i * MAX_GROUP_NUM + j]
            UserNames.append(Member['UserName'])

        # create the group chat / add members
        if ChatRoomName == '':
            (ChatRoomName, DeletedList, BlockedList) = createChatroom(UserNames)
        else:
            (DeletedList, BlockedList) = addMember(ChatRoomName, UserNames)

        # todo: BlockedList is the list of contacts who blocked you
        DeletedCount = len(DeletedList)
        if DeletedCount > 0:
            result += DeletedList

        # remove the members again
        deleteMember(ChatRoomName, UserNames)

        # progress bar
        progress = MAX_PROGRESS_LEN * (i + 1) / group_num
        print('[', '#' * progress, '-' * (MAX_PROGRESS_LEN - progress), ']', end=' ')
        print('新发现你被%d人删除' % DeletedCount)
        for i in range(DeletedCount):
            if d[DeletedList[i]][1] != '':
                print(d[DeletedList[i]][0] + '(%s)' % d[DeletedList[i]][1])
            else:
                print(d[DeletedList[i]][0])

        if i != group_num - 1:
            print('正在继续查找,请耐心等待...')
            # wait before the next round of API calls
            time.sleep(INTERFACE_CALLING_INTERVAL)

    # todo: delete the group chat
    print('\n结果汇总完毕,20s后可重试...')
    resultNames = []
    for r in result:
        if d[r][1] != '':
            resultNames.append(d[r][0] + '(%s)' % d[r][1])
        else:
            resultNames.append(d[r][0])

    print('---------- 被删除的好友列表(共%d人) ----------' % len(result))
    # filter out emoji
    resultNames = map(lambda x: re.sub(r'<span.+/span>', '', x), resultNames)
    if len(resultNames):
        print('\n'.join(resultNames))
    else:
        print("无")
    print('---------------------------------------------')


# fix for console encoding issues on Windows
# http://blog.csdn.net/heyuxuanzee/article/details/8442718
class UnicodeStreamFilter:

    def __init__(self, target):
        self.target = target
        self.encoding = 'utf-8'
        self.errors = 'replace'
        self.encode_to = self.target.encoding

    def write(self, s):
        if type(s) == str:
            s = s.decode('utf-8')
        s = s.encode(self.encode_to, self.errors).decode(self.encode_to)
        self.target.write(s)


if sys.stdout.encoding == 'cp936':
    sys.stdout = UnicodeStreamFilter(sys.stdout)

if __name__ == '__main__':
    print('本程序的查询结果可能会引起一些心理上的不适,请小心使用...')
    main()
    print('回车键退出...')
def main(year):
    # The user credentials that will be used to authenticate access to the data
    username = "******"
    password = ""

    # The FULL url of the directory which contains the files you would like to bulk download
    #url = "https://daacdata.apps.nsidc.org/pub/DATASETS/nsidc0079_gsfc_bootstrap_seaice_v3/final-gsfc/north/daily/" + str(year) + '/'  # Example URL
    url = "https://daacdata.apps.nsidc.org/pub/DATASETS/nsidc0116_icemotion_vectors_v3/data/north/grid/" + str(year) + "/"

    # Create a password manager to deal with the 401 response that is returned from
    # Earthdata Login
    password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_manager.add_password(None, "https://urs.earthdata.nasa.gov",
                                  username, password)

    # Create a cookie jar for storing cookies. This is used to store and return
    # the session cookie given to us by the data server (otherwise it will just
    # keep sending us back to Earthdata Login to authenticate). Ideally, we
    # should use a file based cookie jar to preserve cookies between runs. This
    # will make it much more efficient.
    cookie_jar = CookieJar()

    # Install all the handlers.
    opener = urllib2.build_opener(
        urllib2.HTTPBasicAuthHandler(password_manager),
        urllib2.HTTPHandler(debuglevel=1),    # Uncomment these two lines to see
        #urllib2.HTTPSHandler(debuglevel=1),  # details of the requests/responses
        urllib2.HTTPCookieProcessor(cookie_jar))
    urllib2.install_opener(opener)

    # Create and submit the requests. There are a wide range of exceptions that
    # can be thrown here, including HTTPError and URLError. These should be
    # caught and handled.

    #===========================================================================
    # Open a request to grab filenames within a directory. Print optional
    #===========================================================================
    DirRequest = urllib2.Request(url)
    DirResponse = urllib2.urlopen(DirRequest)

    # Get the redirect url and append 'app_type=401'
    # to do basic http auth
    DirRedirect_url = DirResponse.geturl()
    DirRedirect_url += '&app_type=401'

    # Request the resource at the modified redirect url
    DirRequest = urllib2.Request(DirRedirect_url)
    DirResponse = urllib2.urlopen(DirRequest)

    DirBody = DirResponse.read()

    # Uses the HTML parser defined above to print the content of the directory containing data
    parser = MyHTMLParser()
    parser.feed(DirBody)
    Files = parser.dataList

    # Display the contents of the python list declared in the HTMLParser class
    # print Files  # Uncomment to print a list of the files

    #===========================================================================
    # Call the function to download all files in url
    #===========================================================================
    BatchJob(Files, cookie_jar, year, url)  # Comment out to prevent downloading to your working directory
def __init__(self):
    super(Bing, self).__init__()
    self.cj = CookieJar()
    self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
Created on 2014. 9. 17.

@author: a141890
'''
from cookielib import CookieJar
import codecs
import csv
import json
import urllib
import urllib2
import urlparse

# Global Settings
URL = "http://apis.daum.net/local/geo/transcoord"
CJ = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(CJ))


def start():
    file = codecs.open('wifi.csv', 'r', encoding="EUC-KR")
    csv_file = csv.reader(file)
    response = []
    for line in csv_file:
        x, y = line[3], line[4]
        try:
            x, y = float(x), float(y)
        except:
            continue
#!/usr/bin/env python
# coding=utf8
from re import compile, DOTALL
import json
import tempfile
import os
from urllib2 import urlopen, HTTPCookieProcessor, build_opener
from cookielib import CookieJar
from urllib import urlencode
import sys

reload(sys)
sys.setdefaultencoding('utf8')

cookie = CookieJar()
opener = build_opener(HTTPCookieProcessor(cookie))


def std_write(thing):
    sys.stdout.write("{}\r\n".format(thing))


# `try_url` can be almost any page, but it must not be https and should not redirect
try_url = 'http://www.baidu.com'

# ip and port of the url
ip_port = ''


def downloader(url):