def save_file(self, source_path, key):
    """
    :param source_path: path relative to the local file system.
    :param key: key relative to the current prefix.
    :return: None
    """
    if self.local_server:
        if not source_path.startswith('/'):
            source_path = "/" + source_path
        return self.local_server.copy(source_path, key)

    # proxy = os.environ.get('HTTP_PROXY')
    # c.setopt(c.PROXY, proxy)
    # logger.print('proxy:', proxy)
    from pycurl import Curl

    c = Curl()
    c.setopt(c.URL, self.url)
    c.setopt(c.TIMEOUT, 3600)
    c.setopt(c.HTTPPOST, [
        ('file', (
            c.FORM_FILE, source_path,
            c.FORM_FILENAME, key,
            c.FORM_CONTENTTYPE, 'plain/text',
        )),
    ])
    c.perform()
    c.close()
def save_buffer(self, buffer, key):
    # proxy = os.environ.get('HTTP_PROXY')
    # c.setopt(c.PROXY, proxy)
    # logger.print('proxy:', proxy)
    if isinstance(buffer, BytesIO):
        from requests_toolbelt import MultipartEncoder

        encoder = MultipartEncoder({'file': (key, buffer), 'canary': 'true'})
        self.session.post(self.url, data=encoder,
                          headers={'Content-Type': encoder.content_type})
    elif isinstance(buffer, StringIO):
        from pycurl import Curl

        c = Curl()
        c.setopt(c.URL, self.url)
        c.setopt(c.TIMEOUT, 3600)
        c.setopt(c.HTTPPOST, [
            ('file', (
                c.FORM_BUFFER, key,
                c.FORM_BUFFERPTR, buffer.read(),
                c.FORM_CONTENTTYPE, 'plain/text',
            )),
        ])
        c.perform()
        c.close()
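# A minimal, self-contained sketch of the streaming-upload pattern used in the
# BytesIO branch above (UPLOAD_URL and the payload are assumptions, not part of
# the original code): MultipartEncoder wraps the buffer so requests can stream
# the multipart body instead of materializing it all in memory.
from io import BytesIO

import requests
from requests_toolbelt import MultipartEncoder

UPLOAD_URL = 'http://localhost:8080/upload'  # hypothetical endpoint

payload = BytesIO(b'example payload')
encoder = MultipartEncoder({'file': ('example.bin', payload)})
# The encoder computes the multipart boundary, so its content_type must be
# forwarded verbatim in the request headers.
response = requests.post(UPLOAD_URL, data=encoder,
                         headers={'Content-Type': encoder.content_type})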
def curl_ix(content=None):
    # Post file content (a list of lines) to generate an ix.io paste link.
    # Import necessary classes and functions.
    global ERROR
    from pycurl import Curl
    from io import BytesIO
    from urllib.parse import urlencode

    curl = Curl()
    buf = BytesIO()
    curl.setopt(curl.URL, "ix.io")
    curl.setopt(curl.WRITEDATA, buf)
    if not content:
        try:
            with open(LOGFILE, 'r') as f:
                content = f.readlines()
        except FileNotFoundError:
            ERROR(f"{LOGFILE} not found.")
        except Exception as e:
            ERROR(f"Error occurred:\n{str(e)}")
    curl.setopt(curl.POSTFIELDS, urlencode({"f:1": '\n'.join(content)}))
    try:
        curl.perform()
    except Exception as e:
        ERROR(f"Error occurred:\n{str(e)}")
    curl.close()
    return buf.getvalue().decode().strip()
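# curl_ix() assumes a module-level LOGFILE path and an ERROR handler defined
# elsewhere. A minimal sketch of plausible stand-ins (both names and the log
# path are hypothetical, for reading the snippet in isolation):
import sys

LOGFILE = "run.log"  # hypothetical default log path

def ERROR(message):
    # Report the error and abort, mirroring how the helper is used above.
    print(message, file=sys.stderr)
    sys.exit(1)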
def curl(url, file_ids, log):
    log.info('\tstarting curl fetch of gdc files')
    params = {'ids': file_ids}
    c = None
    with open('gdc_curl_download.tar.gz', 'wb') as f:
        try:
            c = Curl()
            c.setopt(c.URL, url)
            c.setopt(c.WRITEDATA, f)
            c.setopt(c.HTTPHEADER, ["Content-Type: application/json"])
            c.setopt(pycurl.CUSTOMREQUEST, "POST")
            c.setopt(pycurl.POSTFIELDS, json.dumps(params))
            # TODO: set up using a local certificate
            c.setopt(pycurl.SSL_VERIFYPEER, 0)
            c.setopt(pycurl.SSL_VERIFYHOST, 0)
            c.perform()
        except Exception:
            log.exception('problem with curl')
            raise
        finally:
            if c is not None:
                if int(c.getinfo(pycurl.RESPONSE_CODE)) != 200:
                    f.close()
                    with open('gdc_curl_download.tar.gz') as e:
                        err = e.read()
                    log.error('\tbad status on curl call (%s):\n%s' % (c.getinfo(pycurl.RESPONSE_CODE), err))
                c.close()
def getc(url):
    buf = BytesIO()
    c = Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, buf)
    c.perform()
    c.close()
    return buf
def build_thread(gitpath, ref, buildid, cburl=None, submodules=False):
    tmpdir = os.path.join(conf('buildbot.buildpath'), buildid)
    repo = GitRepository(tmpdir)

    output, retcode = repo.clone(gitpath)
    if retcode:
        buildlog(buildid, 'Unable to clone %s. %s\n' % (gitpath, '\n'.join(output)))
        return

    output, retcode = repo.checkout(ref)
    if retcode:
        buildlog(buildid, 'Unable to checkout %s. %s\n' % (ref, '\n'.join(output)))
        return

    if submodules:
        output, retcode = repo.submodule_init()
        buildlog(buildid, output[0])
        buildlog(buildid, output[1])
        output, retcode = repo.submodule_update()
        buildlog(buildid, output[0])
        buildlog(buildid, output[1])

    resultsdir = os.path.join(tmpdir, '.build_results')
    os.makedirs(resultsdir)
    output, retcode = repo.build(conf('buildbot.signkey'), conf('buildbot.pbuilderrc'), resultsdir)
    buildlog(buildid, output[0])
    buildlog(buildid, output[1])
    #logging.debug(output[0])
    #logging.debug(output[1])

    os.chdir(resultsdir)
    if not os.listdir(resultsdir) or retcode != 0:
        buildlog(buildid, 'Nothing in results directory. Giving up.')
        return

    tarpath = os.path.join(tmpdir, 'package.tar.gz')
    tar = tarfile.open(tarpath, 'w:gz')
    for name in os.listdir(resultsdir):
        tar.add(name)
    tar.close()

    buildlog(buildid, 'Build complete. Results in %s\n' % tarpath)
    data = open(tarpath, 'rb').read()
    buildlog(buildid, 'Built %i byte tarball' % len(data))

    if cburl:
        buildlog(buildid, 'Performing callback: %s' % cburl)
        req = Curl()
        req.setopt(req.POST, 1)
        req.setopt(req.URL, str(cburl))
        req.setopt(req.HTTPPOST, [('package', (req.FORM_FILE, str(tarpath)))])
        req.setopt(req.WRITEDATA, open('%s/build.log' % tmpdir, 'ab'))
        req.perform()
        req.close()
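# build_thread() depends on two project helpers that are not shown here.
# A rough, hypothetical sketch of stand-ins so the snippet reads in isolation
# (paths, keys, and values are invented for illustration):
_SETTINGS = {
    'buildbot.buildpath': '/tmp/builds',
    'buildbot.signkey': 'DEADBEEF',
    'buildbot.pbuilderrc': '/etc/pbuilderrc',
}

def conf(key):
    # Look up a configuration value by dotted key.
    return _SETTINGS[key]

def buildlog(buildid, message):
    # Append one message to the per-build log file.
    with open('%s.log' % buildid, 'a') as f:
        f.write(message if message.endswith('\n') else message + '\n')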
def blocking_io(num):
    # Fetch xkcd comic metadata as JSON; `load` here is presumably json.load.
    buf = BytesIO()
    c = Curl()
    c.setopt(c.URL, f'https://xkcd.com/{num}/info.0.json')
    c.setopt(c.WRITEDATA, buf)
    c.setopt(c.CAINFO, certifi.where())
    c.perform()
    c.close()
    buf.seek(0)
    return load(buf)
def load_url(self, url):
    buffer = BytesIO()
    c = Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()
    return CSVFile(buffer)
def curl(url):
    io = BytesIO()
    c = Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, io)
    c.perform()
    c.close()
    res = io.getvalue()
    io.close()
    return res
def get(name):
    base = 'https://www1.ncdc.noaa.gov/pub/data/igra/data/data-por/{}-data.txt.zip'
    buf = BytesIO()
    c = Curl()
    c.setopt(c.URL, base.format(name))
    c.setopt(c.WRITEDATA, buf)
    c.perform()
    c.close()
    z = ZipFile(buf)
    out = z.open(z.infolist()[0]).read()
    z.close()
    return out.decode()
def performSubmission(submissionFileName, POST_DATA):
    logging.info('Performing submission of ' + submissionFileName + '\n')
    logging.info('POST Data:\n' + str(POST_DATA) + '\n')

    if (str(getConfigurationValue('test_submission')) == '0'):
        logging.info('THIS IS A LIVE SUBMISSION AT ENA.')
        requestURL = str(getConfigurationValue('ena_rest_address_prod')) + '?auth=ENA%20' + str(getConfigurationValue('ena_username')) + '%20' + str(getConfigurationValue('ena_password'))
    else:
        logging.info('THIS IS A TEST SUBMISSION AT ENA.')
        requestURL = str(getConfigurationValue('ena_rest_address_test')) + '?auth=ENA%20' + str(getConfigurationValue('ena_username')) + '%20' + str(getConfigurationValue('ena_password'))

    # StringIO doesn't work with pycurl in Python 3.6; use BytesIO instead.
    curlResponseBuffer = BytesIO()
    try:
        curlObject = Curl()
        curlObject.setopt(curlObject.URL, requestURL)
        curlObject.setopt(curlObject.POST, 1)
        curlObject.setopt(curlObject.HTTPPOST, POST_DATA)
        curlObject.setopt(curlObject.USERAGENT, 'Curl')
        curlObject.setopt(curlObject.WRITEFUNCTION, curlResponseBuffer.write)
        curlObject.setopt(curlObject.HTTPHEADER, ['Accept:application/xml'])
        # Insecure: certificate verification is disabled.
        curlObject.setopt(curlObject.SSL_VERIFYHOST, 0)
        curlObject.setopt(curlObject.SSL_VERIFYPEER, 0)
        curlObject.perform()
        curlObject.close()
    except Exception:
        logging.error('Exception when performing CURL.\n')
        logging.error('URL:' + str(requestURL))
        raise

    responseText = curlResponseBuffer.getvalue()

    # Write the XML response to file.
    projectSubResultsFileName = submissionFileName.replace('.xml', '_results.xml')
    resultsFile = createOutputFile(projectSubResultsFileName)
    resultsFile.write(responseText.decode('utf-8'))
    resultsFile.close()

    return responseText
def performSubmission(submissionFileName, POST_DATA, enaUserName, enaPassword):
    logging.info('Performing submission of ' + submissionFileName + '\n')
    logging.info('POST Data:\n' + str(POST_DATA) + '\n')

    if (str(getConfigurationValue('test_submission')) == '0'):
        logging.info('THIS IS A LIVE SUBMISSION AT ENA.')
        requestURL = str(getConfigurationValue('ena_rest_address_prod')) + '?auth=ENA%20' + str(enaUserName) + '%20' + str(enaPassword)
    else:
        logging.info('THIS IS A TEST SUBMISSION AT ENA.')
        requestURL = str(getConfigurationValue('ena_rest_address_test')) + '?auth=ENA%20' + str(enaUserName) + '%20' + str(enaPassword)

    # StringIO doesn't work with pycurl in Python 3.6; use BytesIO instead.
    curlResponseBuffer = BytesIO()
    try:
        curlObject = Curl()
        curlObject.setopt(curlObject.URL, requestURL)
        curlObject.setopt(curlObject.POST, 1)
        curlObject.setopt(curlObject.HTTPPOST, POST_DATA)
        curlObject.setopt(curlObject.USERAGENT, 'Curl')
        curlObject.setopt(curlObject.WRITEFUNCTION, curlResponseBuffer.write)
        curlObject.setopt(curlObject.HTTPHEADER, ['Accept:application/xml'])
        # Insecure: certificate verification is disabled.
        curlObject.setopt(curlObject.SSL_VERIFYHOST, 0)
        curlObject.setopt(curlObject.SSL_VERIFYPEER, 0)
        curlObject.perform()
        curlObject.close()
    except Exception:
        logging.error('Exception when performing CURL.\n')
        logging.error('URL:' + str(requestURL))
        raise

    responseText = curlResponseBuffer.getvalue()

    # Write the XML response to file.
    projectSubResultsFileName = submissionFileName.replace('.xml', '_results.xml')
    resultsFile = createOutputFile(projectSubResultsFileName)
    resultsFile.write(responseText.decode('utf-8'))
    resultsFile.close()

    return responseText
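# Both performSubmission variants depend on helpers defined elsewhere in their
# project; a minimal hypothetical sketch so the snippets read standalone:
import os

CONFIGURATION = {}  # hypothetical in-memory settings store, loaded elsewhere

def getConfigurationValue(key):
    # Look up a setting by name.
    return CONFIGURATION.get(key)

def createOutputFile(path):
    # Ensure the parent directory exists, then open the file for writing.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    return open(path, 'w')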
def fetchAnnotationJson(self, rawRequestURL=None):
    try:
        postData = {'sequence': self.rawSequence}
        # Using configuration here causes a circular dependency, so the URL is passed in.
        if rawRequestURL is None:
            logging.error('You must pass a rawRequestURL to fetchAnnotationJson.')
            return
        else:
            requestURL = rawRequestURL + '?' + urlencode(postData)

        resultsIoObject = BytesIO()
        curlObject = Curl()
        curlObject.setopt(curlObject.URL, requestURL)
        curlObject.setopt(curlObject.WRITEDATA, resultsIoObject)
        curlObject.perform()
        curlObject.close()
        getBody = resultsIoObject.getvalue().decode('utf8')
        logging.debug('JSON Request Body:\n' + getBody)

        # TODO:
        # Detect error <head><title>414 Request-URI Too Large</title></head>
        # For larger DRB alleles the webserver fails.
        # Detect error if the result is not JSON.
        # Maybe this error detection happens in parseExons, but server errors may need detection here.

        # Simple case: an empty string.
        if getBody is None or len(getBody) < 1:
            logging.error('The JSON results were an empty string. Is there a problem with the ACT server?:' + str(requestURL))
            showInfoBox('Problem Accessing Annotation Service', 'The JSON results were an empty string. Is there a problem with the ACT server?')
            return None

        # If it's an HTML error page we can respond nicely.
        if getBody.startswith('<html>'):
            # TODO: this might not work for other kinds of HTML.
            errorCode = getBody[getBody.find('<title>'):getBody.find('</title>')]
            logging.error('The annotation JSON results are HTML, which probably indicates an issue with the annotation webserver:\n' + str(requestURL))
            showInfoBox('Problem Accessing Annotation Service', 'The annotation results are HTML, not JSON, probably an issue with the ACT webserver:\n' + str(errorCode))
            return None

        return getBody

    except Exception:
        logging.error('Exception when performing CURL:\n')
        logging.error(str(exc_info()))
        logging.error('URL:' + str(requestURL))
        raise
def sendTelegramAlert(self, telegram_chat_id, telegram_bot_token, message):
    if len(message) > 4096:
        message = "The size of the message in Telegram (4096) has been exceeded. Overall size: " + str(len(message))
    c = Curl()
    url = 'https://api.telegram.org/bot' + str(telegram_bot_token) + '/sendMessage'
    c.setopt(c.URL, url)
    data = {'chat_id': telegram_chat_id, 'text': message}
    pf = urlencode(data)
    c.setopt(c.POSTFIELDS, pf)
    c.perform_rs()
    status_code = c.getinfo(c.HTTP_CODE)
    c.close()
    self.getStatusByTelegramCode(status_code)
def _fetch(self, url, query, on_progress=None):
    logging.debug('query={query}'.format(query=query))
    from pycurl import Curl, POST, POSTFIELDS
    from io import BytesIO

    c = Curl()
    c.setopt(c.URL, url)
    c.setopt(POST, 1)
    c.setopt(POSTFIELDS, query)
    if on_progress:
        c.setopt(c.HEADERFUNCTION, self._on_header(on_progress))
    buffer = BytesIO()
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()
    return buffer.getvalue().decode('UTF-8')
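# _fetch() relies on a self._on_header(on_progress) factory that is not shown.
# A plausible method sketch (hypothetical, not the original implementation):
# pycurl invokes HEADERFUNCTION once per raw header line (as bytes), so the
# callback can watch for Content-Length and report the expected total size.
def _on_header(self, on_progress):
    def header_callback(line: bytes):
        # HTTP headers are conventionally decoded as latin-1.
        text = line.decode('iso-8859-1').strip()
        if text.lower().startswith('content-length:'):
            total = int(text.split(':', 1)[1])
            on_progress(total)
    return header_callback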
def _curl_a_link(self, target_url, post_target, commit_date=None):
    '''
    Fetch a URL and return a dict parsed from the JSON response, much like a
    `curl` GET. If the result of this request is already in today's cache,
    read it from the cache instead of querying Elasticsearch again.
    '''
    buffer = BytesIO()  # pycurl writes bytes, so BytesIO rather than StringIO
    c = Curl()
    c.setopt(c.URL, target_url)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()

    load_target = json.loads(buffer.getvalue())
    return load_target
def torch_upload():
    from ml_logger import logger
    import numpy as np

    logger.configure(root_dir="http://54.71.92.65:9080", prefix="geyang/ml_logger-debug/test-1",
                     register_experiment=True)
    logger.log_params(args={})

    with logger.Sync():
        import os
        import torch
        from pycurl import Curl
        from tempfile import NamedTemporaryFile

        logger.remove('upload/example.pt')

        with NamedTemporaryFile(delete=True) as f:
            torch.save(np.ones([10_000_000]), f)
            # torch.save(np.ones([1000_000]), f)
            logger.print(f.name)

            c = Curl()
            c.setopt(c.URL, logger.root_dir)
            # proxy = os.environ.get('HTTP_PROXY')
            # c.setopt(c.PROXY, proxy)
            # logger.print('proxy:', proxy)
            c.setopt(c.TIMEOUT, 100000)
            c.setopt(c.HTTPPOST, [
                ('file', (
                    c.FORM_FILE, f.name,
                    c.FORM_FILENAME, logger.prefix + '/upload/example.pt',
                    c.FORM_CONTENTTYPE, 'plain/text',
                )),
            ])
            c.perform()
            c.close()
            logger.print('done')

    # logger.remove(".")
    # a = np.ones([1, 1, 100_000_000 // 4])
    # logger.print(f"the size of the tensor is {a.size}")
    # data = dict(key="ok", large=a)
    # logger.torch_save(data, f"save/data-{logger.now('%H.%M.%S')}.pkl")
    logger.print('done')
def curlGet(url: str) -> bytes:
    # Generate objects.
    data = BytesIO()
    curl = Curl()

    # Setup curl.
    curl.setopt(curl.URL, url)
    curl.setopt(curl.WRITEDATA, data)
    curl.setopt(curl.FOLLOWLOCATION, True)
    curl.setopt(curl.HTTPHEADER, ['User-Agent: curl/7.68.0'])

    # Send curl request.
    curl.perform()
    curl.close()

    return data.getvalue()
def __fetch_page(self, url):
    useragent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'
    encoding = 'gzip, deflate, sdch'
    httpheader = [
        'Accept: text/html, application/xhtml+xml, application/xml; q=0.9, image/webp, */*; q=0.8',
        'Accept-Language: it-IT, it; q=0.8, en-US; q=0.6, en; q=0.4',
        'Host: uniparthenope.esse3.cineca.it'
    ]
    cookiefile = 'cookiefile'

    # First request: anonymous, just to obtain the session cookies.
    page = BytesIO()  # pycurl writes bytes, so BytesIO rather than StringIO
    c = Curl()
    c.setopt(c.FOLLOWLOCATION, True)
    c.setopt(c.WRITEFUNCTION, page.write)
    c.setopt(c.COOKIEJAR, cookiefile)
    c.setopt(c.URL, url)
    c.perform()
    c.close()
    page.close()

    # Second request: authenticated, reusing the saved cookies.
    page = BytesIO()
    c = Curl()
    c.setopt(c.USERPWD, self.__username + ':' + self.__password)
    c.setopt(c.FOLLOWLOCATION, 1)
    c.setopt(c.WRITEFUNCTION, page.write)
    c.setopt(c.COOKIEFILE, cookiefile)
    c.setopt(c.ENCODING, encoding)
    c.setopt(c.HTTPHEADER, httpheader)
    c.setopt(c.REFERER, url)
    c.setopt(c.USERAGENT, useragent)
    c.setopt(c.URL, url)
    c.perform()

    if c.getinfo(pycurl.HTTP_CODE) != 200:
        c.close()
        return None
    c.close()

    page_str = page.getvalue().decode('utf-8', errors='replace')
    page.close()

    p = re.compile('\\s+')
    page_str = p.sub(" ", page_str)
    return page_str
def post_progress(progress, slotX, slotY, exp_name, message=""):
    from pycurl import Curl
    from io import BytesIO
    from socket import gethostname

    response = BytesIO()  # pycurl writes bytes in Python 3, so BytesIO replaces cStringIO
    address = 'www.doc.ic.ac.uk/~zf509/' + exp_name + '/ip.php?name=' + gethostname() + \
              '-' + message + '&slot=' + str(slotX) + '-' + str(slotY) + \
              '&stage=' + str(progress)
    c = Curl()
    c.setopt(c.WRITEFUNCTION, response.write)
    c.setopt(c.URL, address)
    c.perform()
    c.close()

    server_res = response.getvalue().decode()
    print("Server replied:", server_res)
    return not server_res.startswith("TER")
def _perform(self, url: str, curl_obj: pycurl.Curl = None, headers: dict = None, postfields: dict = None,
             skip_auth=False) -> bytes:
    if not skip_auth:
        self._wait_authenticated()

    if not curl_obj:
        curl_obj = pycurl.Curl()

    if postfields:
        postfields = urlencode(postfields)
        _set_postfields(curl_obj, postfields)

    logger.debug("url={url}, headers={headers}", url=url, headers=headers)
    if not headers:
        headers = self.BASE_HEADERS.copy()
    headers = self._headers_to_list(headers)

    logger.debug("prepared headers={h}", h=headers)

    buffer = BytesIO()

    curl_obj.setopt(pycurl.WRITEFUNCTION, buffer.write)
    curl_obj.setopt(pycurl.HEADERFUNCTION, self._header_function)
    curl_obj.setopt(pycurl.BUFFERSIZE, 102400)
    curl_obj.setopt(pycurl.URL, url)
    curl_obj.setopt(pycurl.HTTPHEADER, headers)
    curl_obj.setopt(pycurl.USERAGENT, CURL_USERAGENT)
    curl_obj.setopt(pycurl.MAXREDIRS, 50)
    curl_obj.setopt(pycurl.ACCEPT_ENCODING, "")
    curl_obj.setopt(pycurl.TCP_KEEPALIVE, 1)
    curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
    curl_obj.setopt(pycurl.ENCODING, "gzip, deflate")

    try:
        curl_obj.perform()
    except pycurl.error as e:
        logger.debug(e, exc_info=True)
        logger.warning(e)
        return b""

    status = curl_obj.getinfo(pycurl.HTTP_CODE)
    logger.debug("HTTP status: {s}", s=status)
    curl_obj.close()

    if status != HTTPStatus.OK:
        hdrs = None
        try:
            hdrs = {k: v[-1] for k, v in self._headers.items()}
        except (IndexError, KeyError):
            pass

        phrase = "error"
        try:
            phrase = http.client.responses[status]
            logger.error("HTTP status error: {s}", s=status)
        except KeyError:
            pass

        raise HTTPError(url=url, msg=phrase, code=status, hdrs=hdrs, fp=None)

    # Server changing maps will trigger a sessionid change; keep track of the
    # latest sessionid from the response headers.
    sessionid = self._find_sessionid()
    if sessionid and self._auth_data:
        self._auth_data.sessionid = sessionid

    return buffer.getvalue()
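# _perform() calls a module-level _set_postfields() helper that is not shown.
# A minimal sketch of what it presumably does (hypothetical): attach the
# urlencoded body and its length so libcurl issues a POST request.
def _set_postfields(curl_obj: pycurl.Curl, postfields: str):
    curl_obj.setopt(pycurl.POSTFIELDS, postfields)
    curl_obj.setopt(pycurl.POSTFIELDSIZE, len(postfields))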
def _execute(curl: Curl, close_connection: bool) -> int:
    curl.perform()
    status_code = curl.getinfo(curl.HTTP_CODE)
    if close_connection:
        curl.close()
    return status_code
def new_session(self) -> Iterator[Curl]:
    # Generator-based session factory (e.g. a pytest fixture): yields a live
    # handle, then closes it once the consumer resumes the generator.
    session = Curl()
    yield session
    session.close()
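# A small usage sketch for _execute() above (the URL and wiring are
# assumptions): configure a handle for a HEAD-style request and let
# _execute() report the HTTP status code, closing the connection afterwards.
from pycurl import Curl

def fetch_status(url: str) -> int:
    curl = Curl()
    curl.setopt(curl.URL, url)
    curl.setopt(curl.NOBODY, 1)  # headers only; skip the response body
    return _execute(curl, close_connection=True)

# Example: fetch_status('https://example.com/') -> 200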
import re
import os
from io import BytesIO
from urllib.parse import urlparse

from pycurl import Curl

buffer = BytesIO()
c = Curl()
c.setopt(c.URL, 'http://www.xiachufang.com/')
c.setopt(c.WRITEDATA, buffer)
c.perform()
c.close()

body = buffer.getvalue()
text = body.decode('utf-8')
print(text)

img_list = re.findall(r'src=\"(http://i2\.chuimg\.com/\w+\.jpg)', text)

# Initialize the download directory.
image_dir = os.path.join(os.curdir, 'images')
# if not os.path.isdir(image_dir):
#     os.mkdir(image_dir)

for img in img_list[::-1]:
    o = urlparse(img)
    filename = o.path[1:]
    filepath = os.path.join(image_dir, filename)
    if not os.path.isdir(os.path.dirname(filepath)):
        os.mkdir(os.path.dirname(filepath))
    url = '%s://%s/%s' % (o.scheme, o.netloc, filename)
    print(url)
print('')

### CONNECTION REUSE TESTS FOLLOW ###

LIBRARY = "pycurl (saving response body by BytesIO BUT MAKING A NEW HANDLE EVERY TIME) "
print("Testing {0} performance with {1} cycles".format(LIBRARY, CYCLES))
start = time.perf_counter()
for i in range(1, CYCLES):
    mycurl = Curl()
    mycurl.setopt(mycurl.URL, URL)
    body = BytesIO()
    mycurl.setopt(mycurl.WRITEDATA, body)
    mycurl.perform()
    output = body.getvalue()
    body.close()
    mycurl.close()
end = time.perf_counter()
print('{0}: ran {1} HTTP GET requests in {2} seconds'.format(LIBRARY, CYCLES, (end - start)))
print('')

LIBRARY = "pycurl (saving response body by BytesIO) "
print("Testing {0} CONNECTION REUSE performance with {1} cycles".format(LIBRARY, CYCLES))
mycurl = Curl()
mycurl.setopt(mycurl.URL, URL)
start = time.perf_counter()
for i in range(1, CYCLES):
    body = BytesIO()
    mycurl.setopt(mycurl.WRITEDATA, body)
def processVideo(self, vID, number):
    for _attempt in range(self.retryCount):
        title = ''
        download = None
        for i in count():
            try:
                self.goTo(vID)
                title = self.getElement('h1[itemprop=name]').text.strip().rstrip('.')
                self.driver.find_element_by_class_name('iconify_down_b').click()
                download = self.getElement('#download')
                break
            except NoSuchElementException as e:
                self.logger.warning(e.msg)
                if i >= self.retryCount:
                    self.logger.error("Page load failed")
                    self.errors += 1
                    break

        # Parse download links
        link = linkSize = localSize = downloadOK = downloadSkip = None
        if download:
            for preference in FILE_PREFERENCES:
                try:
                    link = download.find_element_by_partial_link_text(preference)
                    break
                except NoSuchElementException:
                    pass
        if link:  # Parse chosen download link
            userAgent = str(self.driver.execute_script('return window.navigator.userAgent'))
            cookies = self.driver.get_cookies()
            extension = link.get_attribute('download').split('.')[-1]
            description = '%s/%s' % (link.text, extension.upper())
            link = str(link.get_attribute('href'))
            if self.getFileSizes:
                try:
                    request = requests.get(link, stream=True, headers={'user-agent': userAgent},
                                           cookies=dict((str(cookie['name']), str(cookie['value'])) for cookie in cookies))
                    request.close()
                    linkSize = int(request.headers['content-length'])
                    self.totalFileSize += linkSize
                    description += ', %s' % readableSize(linkSize)
                except Exception as e:
                    self.logger.warning(e)
        else:
            description = extension = 'NONE'

        # Prepare file information
        prefix = ' '.join((title, '(%s)' % description))
        suffix = ' '.join((('%d/%d %d%%' % (number, len(self.vIDs), int(number * 100.0 / len(self.vIDs)))),)
                          + ((readableSize(self.totalFileSize),) if self.totalFileSize else ()))
        self.logger.info(' '.join((prefix, suffix)))
        fileName = cleanupFileName('%s.%s' % (' '.join(((title,) if title else ()) + (str(vID),)), extension.lower()))
        targetFileName = join(self.targetDirectory, fileName)

        if self.setLanguage:
            try:
                self.driver.find_element_by_id('change_settings').click()
                languages = self.driver.find_elements_by_css_selector('select[name=language] option')
                currentLanguage = ([l for l in languages if l.is_selected()] or [None,])[0]
                if currentLanguage is None or currentLanguage is languages[0]:
                    ls = [l for l in languages if l.text.capitalize().startswith(self.setLanguage)]
                    if len(ls) != 1:
                        ls = [l for l in languages if l.get_attribute('value').capitalize().startswith(self.setLanguage)]
                    if len(ls) == 1:
                        self.logger.info("Language not set, setting to %s", ls[0].text)
                        ls[0].click()
                        self.driver.find_element_by_css_selector('#settings_form input[type=submit]').click()
                    else:
                        self.logger.error("Unsupported language: %s", self.setLanguage)
                        self.setLanguage = None
                else:
                    self.logger.info("Language already set to %s / %s",
                                     currentLanguage.get_attribute('value').upper(), currentLanguage.text)
            except NoSuchElementException:
                self.logger.warning("Failed to set language to %s, settings not available", self.setLanguage)

        if link:  # Downloading file
            if linkSize:
                localSize = getFileSize(targetFileName)
                if localSize == linkSize:
                    downloadOK = True
                elif localSize and localSize > linkSize:
                    self.errors += 1
                    self.logger.error("Local file is larger (%d) than remote file (%d)", localSize, linkSize)
                    downloadSkip = True
                    #remove(targetFileName)
                    #localSize = None
            if self.doDownload and not downloadOK:
                class ProgressIndicator(object):
                    QUANTUM = 10 * 1024 * 1024  # 10 megabytes
                    ACTION = r'--\\||//'  # update() often gets called in pairs, this smoothes things up
                    action = len(ACTION) - 1

                    def __init__(self, timeout):
                        self.timeout = timeout
                        self.started = False
                        self.totalRead = 0
                        self.lastData = time()
                        self.count = 0
                        self.action = len(self.ACTION) - 1
                        self.progress("Downloading: ")

                    def progress(self, s, suffix=''):
                        self.action = (self.action + 1) % len(self.ACTION)
                        print('\b%s%s' % (s, suffix + '\n' if suffix else self.ACTION[self.action]),
                              end='', flush=True)

                    def update(self, _length, totalRead, *_args):
                        if totalRead <= self.totalRead:
                            if time() > self.lastData + self.timeout:
                                raise curlError("Download seems stalled")
                        else:
                            self.totalRead = totalRead
                            self.lastData = time()
                        oldCount = self.count
                        self.count = int(totalRead // self.QUANTUM) + 1
                        self.progress(('=' if self.started else '+') * max(0, self.count - oldCount))
                        self.started = True

                    def end(self):
                        self.progress("OK")

                progressIndicator = ProgressIndicator(self.timeout)
                curl = Curl()
                curl.setopt(curl.CAINFO, certifi.where())
                curl.setopt(curl.COOKIE, '; '.join('%s=%s' % (cookie['name'], cookie['value']) for cookie in cookies))
                curl.setopt(curl.TIMEOUT, self.timeout)
                curl.setopt(curl.USERAGENT, userAgent)
                curl.setopt(curl.FOLLOWLOCATION, True)
                curl.setopt(curl.URL, link)
                curl.setopt(curl.NOPROGRESS, 0)  # progress callbacks only fire with NOPROGRESS disabled
                curl.setopt(curl.PROGRESSFUNCTION, progressIndicator.update)
                try:
                    with open(targetFileName, 'wb') as f:
                        curl.setopt(curl.WRITEDATA, f)
                        curl.perform()
                        curl.close()
                    progressIndicator.end()
                    downloadOK = True
                except curlError as e:
                    self.errors += 1
                    self.logger.error("Download failed: %s", e)
                except KeyboardInterrupt:
                    self.errors += 1
                    self.logger.error("Download interrupted")
                if downloadOK:
                    localSize = getFileSize(targetFileName)
                    if not localSize:
                        self.errors += 1
                        downloadOK = False
                        self.logger.error("Downloaded file seems corrupt")
                    elif linkSize:
                        if localSize > linkSize:
                            self.errors += 1
                            downloadOK = False
                            self.logger.error("Downloaded file larger (%d) than remote file (%d)", localSize, linkSize)
                        elif localSize < linkSize:
                            self.errors += 1
                            downloadOK = False
                            self.logger.error("Downloaded file smaller (%d) than remote file (%d)", localSize, linkSize)

        if downloadOK:
            self.logger.info("OK")
            break
        elif downloadSkip or not self.doDownload:
            self.logger.info("Downloading SKIPPED")
            break
    else:
        self.logger.info("Download ultimately failed after %d retries", self.retryCount)

    # Creating symbolic links, if enabled
    for dirName in (dirName for (dirName, vIDs) in self.folders if vID in vIDs):
        linkFileName = join(dirName, fileName)
        try:
            if lexists(linkFileName):
                remove(linkFileName)
        except Exception:
            pass
        try:
            (hardlink if self.useHardLinks else symlink)(join('..', fileName), linkFileName)
        except Exception as e:
            self.logger.warning("Can't create link at %s: %s", linkFileName, e)
            self.errors += 1
def _complete_request(curl: pycurl.Curl, buffer: BytesIO, response: Response):
    curl.perform()
    response.status = curl.getinfo(curl.RESPONSE_CODE)
    response.body = buffer.getvalue().decode(_CHAR_ENCODING)
    curl.close()
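# _complete_request() assumes a mutable Response container and a module-level
# _CHAR_ENCODING constant; a minimal hypothetical sketch of both:
from dataclasses import dataclass

_CHAR_ENCODING = "utf-8"

@dataclass
class Response:
    status: int = 0
    body: str = ""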
class HttpDirectory(RemoteDirectory):

    SCHEMES = (
        "http", "https",
    )
    HEADERS = config.HEADERS
    BLACK_LIST = (
        "?C=N&O=D", "?C=M&O=A", "?C=S&O=A", "?C=D&O=A",
        "?C=N;O=D", "?C=M;O=A", "?C=M&O=D", "?C=S;O=A",
        "?C=S&O=D", "?C=D;O=A", "?MA", "?SA", "?DA",
        "?ND", "?C=N&O=A", "?M=A", "?N=D", "?S=A", "?D=A",
    )
    FILE_NAME_BLACKLIST = (
        "Parent Directory",
        " Parent Directory",
        "../",
    )
    MAX_RETRIES = 2
    TIMEOUT = 25

    def __init__(self, url):
        super().__init__(url)
        self.curl = None
        self.curl_head = None
        self.init_curl()

    def init_curl(self):
        self.curl = Curl()
        self.curl.setopt(self.curl.SSL_VERIFYPEER, 0)
        self.curl.setopt(self.curl.SSL_VERIFYHOST, 0)
        self.curl.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
        self.curl_head = self._curl_handle()

    @staticmethod
    def _curl_handle():
        curl_head = Curl()
        curl_head.setopt(pycurl.SSL_VERIFYPEER, 0)
        curl_head.setopt(pycurl.SSL_VERIFYHOST, 0)
        curl_head.setopt(pycurl.NOBODY, 1)
        curl_head.setopt(pycurl.TIMEOUT, HttpDirectory.TIMEOUT)
        return curl_head

    def list_dir(self, path):
        current_dir_name = path[path.rstrip("/").rfind("/") + 1:-1]
        path_identifier = hashlib.md5(current_dir_name.encode())
        path_url = urljoin(self.base_url, path, "")
        body = self._fetch_body(path_url)
        anchors = self._parse_links(body)

        urls_to_request = []
        files = []

        for anchor in anchors:
            if self._should_ignore(self.base_url, path, anchor):
                continue

            if self._isdir(anchor):
                directory = File(
                    name=anchor.href,  # todo handle external links here
                    mtime=0,
                    size=0,
                    path=path,
                    is_dir=True)
                path_identifier.update(bytes(directory))
                files.append(directory)
            else:
                urls_to_request.append(urljoin(path_url, anchor.href))

        for file in self.request_files(urls_to_request):
            path_identifier.update(bytes(file))
            files.append(file)

        return path_identifier.hexdigest(), files

    def request_files(self, urls_to_request: list) -> list:
        if len(urls_to_request) > 150:
            # Many urls, use multi-threaded solution
            pool = ThreadPool(processes=10)
            files = pool.starmap(self._request_file, zip(urls_to_request, repeat(self.base_url)))
            pool.close()
            for file in files:
                if file:
                    yield file
        else:
            # Too few urls to create thread pool
            for url in urls_to_request:
                file = self._request_file(url, self.base_url)
                if file:
                    yield file

    @staticmethod
    def _request_file(url, base_url):
        retries = HttpDirectory.MAX_RETRIES
        while retries > 0:
            try:
                curl = HttpDirectory._curl_handle()
                raw_headers = BytesIO()
                curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
                curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
                curl.perform()

                stripped_url = url[len(base_url) - 1:]
                headers = HttpDirectory._parse_dict_header(raw_headers.getvalue().decode("utf-8", errors="ignore"))
                raw_headers.close()

                path, name = os.path.split(stripped_url)
                date = headers.get("Last-Modified", "1970-01-01")
                curl.close()
                return File(path=unquote(path).strip("/"),
                            name=unquote(name),
                            size=int(headers.get("Content-Length", -1)),
                            mtime=int(parse_date(date).timestamp()),
                            is_dir=False)
            except pycurl.error:
                retries -= 1

        logger.debug("TimeoutError - _request_file")
        raise TimeoutError

    def _fetch_body(self, url: str):
        retries = HttpDirectory.MAX_RETRIES
        while retries > 0:
            try:
                content = BytesIO()
                self.curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
                self.curl.setopt(pycurl.WRITEDATA, content)
                self.curl.perform()
                return content.getvalue().decode("utf-8", errors="ignore")
            except pycurl.error:
                self.close()
                retries -= 1

        logger.debug("TimeoutError - _fetch_body")
        raise TimeoutError

    @staticmethod
    def _parse_links(body):
        parser = HTMLAnchorParser()
        parser.feed(body)
        return parser.anchors
    @staticmethod
    def _isdir(link: Anchor):
        return link.href.endswith("/")

    @staticmethod
    def _should_ignore(base_url, current_path, link: Anchor):
        full_url = urljoin(base_url, link.href)
        if full_url == urljoin(urljoin(base_url, current_path), "../") or full_url == base_url:
            return True

        if link.href.endswith(HttpDirectory.BLACK_LIST):
            return True

        # Ignore external links
        if not full_url.startswith(base_url):
            return True

        # Ignore parameters in url
        if "?" in link.href:
            return True

    @staticmethod
    def _parse_dict_header(raw):
        headers = dict()
        for line in raw.split("\r\n")[1:]:  # ignore the first 'HTTP/1.0 200 OK' line
            if line:
                k, v = line.split(":", maxsplit=1)
                headers[k.strip()] = v.strip()
        return headers

    def close(self):
        self.curl.close()
        self.init_curl()
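# HttpDirectory relies on an Anchor record and an HTMLAnchorParser that are
# defined elsewhere in its project; a minimal hypothetical sketch using the
# standard-library html.parser:
from collections import namedtuple
from html.parser import HTMLParser

Anchor = namedtuple("Anchor", ["text", "href"])

class HTMLAnchorParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        # Collect every <a href="..."> as an Anchor (text left empty here).
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.anchors.append(Anchor(text="", href=value))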
m = CurlMulti()
c = Curl()
try:
    m.remove_handle(c)
except pycurl.error:
    pass
else:
    assert 0, "internal error"
del m, c

# remove an invalid but closed handle
if 1:
    m = CurlMulti()
    c = Curl()
    c.close()
    m.remove_handle(c)
    del m, c

# add a closed handle: this should fail
if 1:
    m = CurlMulti()
    c = Curl()
    c.close()
    try:
        m.add_handle(c)
    except pycurl.error:
        pass
    else:
        assert 0, "internal error"