def test_ftp(directory):
    path, port = directory
    ftp = FTP(path, host="127.0.0.1", listen_host="127.0.0.1", listen_port=port)
    assert (ftp.uri == "ftp://%s:%s/" % ("127.0.0.1", port))
    # Test that the service starts (and refuses to start twice)
    ftp.start()
    with pytest.raises(OPVDMException):
        ftp.start()
    # Check service is running
    assert (ftp.is_running())
    # Connect to server
    requests_ftp.monkeypatch_session()
    s = requests.Session()
    r = s.retr("%s%s" % (ftp.uri, "toto.txt"))
    assert (r.status_code == 226)
    assert (r.text == CONTENT_TEST)
    # Stop the FTP test service
    ftp.stop()
    assert (ftp.is_running() is False)
def ftp_fetch(context, data):
    url = data.get('url')
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get('username', 'Anonymous')
    password = context.get('password', 'anonymous@ftp')
    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule='pass', data=cached)
        return
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update({
            'status_code': resp.status_code,
            'retrieved_at': datetime.utcnow().isoformat(),
            'content_hash': context.store_data(data=resp.content)
        })
        context.set_tag(url, data)
        context.emit(rule='pass', data=data)
    else:
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines():
            child_data = data.copy()
            child_data['url'] = os.path.join(url, child)
            # context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule='child', data=child_data)
def make_http_request(url, **kwargs):
    """
    Makes an http request using the requests library
    :param url: URL to query
    :return: request Object
    """
    if 'requests_session' in kwargs:
        s = kwargs['requests_session']
    else:
        s = requests
    r = []
    try:
        r = s.get(url)  # requests.get(url)
        # determine if requests.post is required to set cookies
        if 'set_cookies' in kwargs:
            if kwargs['set_cookies'] == 'yes':
                cookies = dict(r.cookies)
                r = s.post(url, verify=True, cookies=cookies)
    except requests.exceptions.InvalidSchema:
        # if url is ftp rather than http
        requests_ftp.monkeypatch_session()
        r = requests.Session().get(url)
    except requests.exceptions.ConnectionError:
        log.error("URL Connection Error for %s", url)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error('Error in URL request!')
    return r
def fetch_url(url: str) -> bytes:
    logger = logging.getLogger(__name__)
    if url.startswith("http"):
        logger.debug("fetching url (http) \"%s\"", url)
        response = requests.get(url)
    elif url.startswith("ftp"):
        logger.debug("fetching url (ftp) \"%s\"", url)
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        response = s.get(url)
        s.close()
    else:
        logger.exception("invalid url \"%s\"", url)
        raise requests.HTTPError("invalid url \"%s\"", url)

    if response.ok:
        # fix for URLs that return 200 instead of a 404
        if int(response.headers.get("Content-Length", 0)) < 1000:
            logger.exception("Content-Length is very small "
                             "(url is most likely not a valid file)")
            raise requests.HTTPError("Content-Length is very small "
                                     "(url is most likely not a valid file)")
        return response.content
    else:
        logger.exception("failed to fetch url \"%s\" (%i)", url, response.status_code)
        raise requests.HTTPError("failed to fetch url \"%s\" (%i)", url, response.status_code)
def download_data(file_name):
    requests_ftp.monkeypatch_session()
    resp = requests.Session().list('ftp://ftp.dd-wrt.com/betas/2020/')
    file_path = open(file_name, 'w')
    file_path.write((resp.content).decode('utf-8'))
    file_path.close()
    if (file_name == new_path):
        compare_files()
def scars(r0):
    try:
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        # really f*****g slow?
        r1 = s.get(r0)
        s.close()
        return r1.content
    except:
        return
    return
def ftp():
    proxies = {'https': p_list[random.randint(0, len(p_list) - 1)]}
    print(proxies)
    requests_ftp.monkeypatch_session()
    with requests.Session() as s:
        s.proxies.update(proxies)
        # r = s.get(r'http://jsonip.com', headers=headers)
        # ip = r.json()['ip']
        # print('Your IP is', ip)
        resp = s.list('ftp://90.130.70.73/', auth=('anonymous', 'anonymous'))
        print(resp)
def download_a_file(self, file_name, file_address):
    """Download a specific file"""
    requests_ftp.monkeypatch_session()
    session_requests = requests.Session()
    try:
        res = session_requests.get(file_address, stream=True)
        with open(file_name, 'wb') as output:
            shutil.copyfileobj(res.raw, output)
        session_requests.close()
    except Exception as e:
        logging.exception('Failed to download {}.'.format(file_name))
        return False
    return True
def __init__(self, url, port=21, remote=None, filename=None, local='.',
             user='******', password='******'):
    super(self.__class__, self).__init__(url, port, remote, filename, local,
                                         user, password)
    if self.port is None:
        self.port = 21
    if self.user is None:
        self.user = '******'
    if self.password is None:
        self.password = '******'
    # self.ftp = ftplib.FTP()
    requests_ftp.monkeypatch_session()
    self.s = requests.Session()
def latest(self):
    """Download latest available copy of clinvar database in vcf format."""
    import requests
    import requests_ftp
    import gzip
    requests_ftp.monkeypatch_session()
    with requests.Session() as sess:
        resp = sess.get('{}/clinvar.vcf.gz'.format(self.base_url))
        self.rawdata = gzip.decompress(resp.content)
    return self.rawdata
def confirm_urls(request):
    if request.method == 'POST':
        confirm_result = {'R1': False, 'R2': False, 'Rf': False, 'Lr': False,
                          'R1_err': '', 'R2_err': '', 'Rf_err': '', 'Lr_err': ''}
        for key, val in request.POST.items():
            is_status_ok = False
            is_format_ok = False
            if key[:3] == 'url':  # e.g., key = url_R1
                # m1 = re.search('google\.com.+id=(.+)\&*', val)  ## Retired
                m1 = re.search('google\.com\/file\/d\/(.+)\/', val)
                if m1:
                    # use GoogleDriveDownloader module
                    id = m1.group(1)
                    response = GoogleDriveDownloader.get_response(id)
                else:
                    # direct download
                    if not re.match(r'^(http|https|ftp)://', val):
                        val = 'http://' + val
                    requests_ftp.monkeypatch_session()
                    session = requests.Session()
                    response = session.get(val, stream=True)
                # Check that the file exists
                is_status_ok = response.ok
                if is_status_ok == False:
                    confirm_result[key[-2:] + '_err'] = 'File Not Found'
                else:
                    # Check file format
                    if m1:
                        m2 = re.search('filename="(.+)"',
                                       response.headers['Content-Disposition'])
                        file_name = m2.group(1)
                    else:
                        file_name = response.url.split('/')[-1]
                    if key[-2:] == 'Rf':
                        if re.search('\.(fasta|fa|fna)+(\.gz)*$', file_name):
                            is_format_ok = True
                        else:
                            is_format_ok = False
                            confirm_result[key[-2:] + '_err'] = 'Unknown File Format'
                    else:  # R1/R2/Lr
                        if re.search('\.f(ast)*q(\.gz)*$', file_name):
                            is_format_ok = True
                        else:
                            is_format_ok = False
                            confirm_result[key[-2:] + '_err'] = 'Unknown File Format'
                confirm_result[key[-2:]] = is_status_ok and is_format_ok
        return HttpResponse(json.dumps(confirm_result))
def make_http_request(url):
    r = []
    try:
        r = requests.get(url)
    except requests.exceptions.InvalidSchema:
        # if url is ftp rather than http
        requests_ftp.monkeypatch_session()
        r = requests.Session().get(url)
    except requests.exceptions.ConnectionError:
        log.error("URL Connection Error for " + url)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error('Error in URL request!')
    return r
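A brief usage sketch of the fallback above (the ftp:// address is a placeholder, not from the original code): the same call serves HTTP URLs directly and, when requests raises InvalidSchema for an FTP URL, retries through a requests-ftp patched Session.

# Hypothetical call; the URL is illustrative only.
resp = make_http_request("ftp://ftp.example.org/pub/README")
if getattr(resp, "ok", False):  # resp may still be [] if the connection failed
    print(len(resp.content), "bytes fetched")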
def _crawl_urls_ftp(self, url, provider):
    """
    Check if a file is present on an FTP server and return the appropriate
    status code.
    """
    # We need to be able to mock this for testing and requests-mock doesn't
    # work with requests-ftp, so this is our workaround. We'll just bypass
    # this method like so (the real method returns either an int or None):
    test_ftp_status = self.source_config.get('test_ftp_status')
    if test_ftp_status == 'ok':
        return 10000
    elif test_ftp_status == 'error':
        return None

    # And now here's the real method:
    timeout = self.source_config['timeout']
    username = self.source_config['username']
    password = self.source_config['password']

    # Make a request to the website
    timestamp = str(datetime.utcnow())
    log_message = '{:<12} | {} | {} | {}s'

    try:
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        r = s.size(url, auth=HTTPBasicAuth(username, password), timeout=timeout)
        status_code = r.status_code
        elapsed = r.elapsed.total_seconds()
    except (ConnectTimeout, ReadTimeout) as e:
        self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
        status_code = 408
        elapsed = 9999

    if status_code == 213:
        size = int(r.text)
    else:
        size = None

    if status_code not in {213, 408}:
        self._save_gather_error('{} error: {}'.format(status_code, r.text), self.job)

    self.provider_logger.info(
        log_message.format(provider, timestamp, status_code, elapsed))

    return size
def get_cds_url(self):
    """cds"""
    TOPLEVEL = re.compile('{0}.*?.cds.all.fa.gz'.format(self.species.capitalize()))
    url_1st = "ftp://ftp.ensemblgenomes.org/"
    url_2nd = "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/fasta/"
    url_3rd_page = url_2nd + self.species.lower() + "/cds/"
    requests_ftp.monkeypatch_session()
    s = requests.Session()
    url_page = s.list(
        "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/fasta/{0}/cds".format(self.species.lower()))
    url_page.encoding = 'utf-8'
    download_url = re.findall(TOPLEVEL, url_page.text)
    download_url = "".join(download_url)
    url = url_3rd_page + download_url
    print(url)
    return url
def download_cat(data_path, ebi_download):
    """ download the data from the ebi main site and ftp"""
    try:
        r = requests.get(ebi_download + 'studies_alternative')
        if r.status_code == 200:
            catstud_name = r.headers['Content-Disposition'].split('=')[1]
            with open(os.path.join(data_path, 'catalog', 'raw', 'Cat_Stud.tsv'),
                      'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catstud_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_Stud file...')
        r = requests.get(ebi_download + 'ancestry')
        if r.status_code == 200:
            catanc_name = r.headers['Content-Disposition'].split('=')[1]
            with open(os.path.join(data_path, 'catalog', 'raw', 'Cat_Anc.tsv'),
                      'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catanc_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_Anc file...')
        r = requests.get(ebi_download + 'full')
        if r.status_code == 200:
            catfull_name = r.headers['Content-Disposition'].split('=')[1]
            with open(os.path.join(data_path, 'catalog', 'raw', 'Cat_Full.tsv'),
                      'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catfull_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_full file...')
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        ftpsite = 'ftp://ftp.ebi.ac.uk/'
        subdom = '/pub/databases/gwas/releases/latest/'
        file = 'gwas-efo-trait-mappings.tsv'
        r = s.get(ftpsite + subdom + file)
        if r.status_code == 200:
            with open(os.path.join(data_path, 'catalog', 'raw', 'Cat_Map.tsv'),
                      'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded efo-trait-mapping!')
        else:
            diversity_logger.debug('Problem downloading efo-trait-mappings...')
    except Exception as e:
        diversity_logger.debug('Problem downloading Catalog data!' + str(e))
def get_gff3_url(self):
    """gff"""
    TOPLEVEL = re.compile('{0}.*?.gff3.gz'.format(self.species.capitalize()))
    url_1st = "ftp://ftp.ensemblgenomes.org/"
    url_2nd = "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/gff3/"
    url_3rd_page = url_2nd + self.species.lower()
    requests_ftp.monkeypatch_session()
    s = requests.Session()
    url_page = s.list(
        "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/gff3/{0}".format(self.species.lower()))
    url_page.encoding = 'utf-8'
    download_url = re.findall(TOPLEVEL, url_page.text)[-1]
    print(download_url)
    download_url = "".join(download_url)
    url = url_3rd_page + "/" + download_url
    print(url)
    return url
def createForecastCSV(folderEntry):
    if folderEntry.get() != '' and os.path.exists(os.path.dirname(folderEntry.get())):
        dateTimeObj = datetime.now()
        dateTimeObj = dateTimeObj - timedelta(hours=int(dateTimeObj.strftime("%H")))
        dayString = dateTimeObj.strftime("%d")
        monthString = dateTimeObj.strftime("%m")
        yearString = dateTimeObj.strftime("%Y")
        url = 'http://ftp1.cptec.inpe.br/modelos/tempo/WRF/ams_05km/recortes/grh/json/' + yearString + '/' + monthString + '/' + dayString + '/00/225.json'
        requests_ftp.monkeypatch_session()
        response = requests.get(url)
        print(response)
        data = response.text
        print(data)
        weather = json.loads(data)
        hora = int(dateTimeObj.strftime("%H"))
        print(str(hora))
        print(str(dateTimeObj))
        timestampStr = dateTimeObj.strftime("%d%b%Y %H")
        print('Current Timestamp : ', timestampStr)
        fileOutput = folderEntry.get() + '/forecast.csv'
        outputFile = open(fileOutput, 'w')
        # load csv file
        with open(fileOutput, 'w', newline='') as outputFile:
            output = csv.writer(outputFile, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            datasets = weather["datasets"][0]
            data = datasets["data"]
            # load json content
            outputFile.write("B,,PROSA\n")
            outputFile.write("C,UTC,PRECIP-INC\n")
            outputFile.write("E,,1HOUR\n")
            outputFile.write("F,,OBS\n")
            outputFile.write("Units,,MM\n")
            outputFile.write("Type,,PER-CUM\n")
            for i, row in enumerate(data):
                print(str(hora + i))
                outputFile.write(str(i + 1) + "," + timestampStr + "00" + ', ' + str(row["prec"]))
                outputFile.write('\n')
                dateTimeObj = dateTimeObj + timedelta(hours=1)
                timestampStr = dateTimeObj.strftime("%d%b%Y %H")
    elif folderEntry.get() == '':
        messagebox.showinfo('Error', 'Please Select the Destination Folder!')
    elif not os.path.exists(os.path.dirname(folderEntry.get())):
        messagebox.showinfo('Error', 'Destination Folder Doesn\'t Exist!')
def download_cat(path, ebi_download):
    """ download the data from the ebi main site and ftp"""
    r = requests.get(ebi_download + 'studies_alternative')
    with open(os.path.join(path, 'Cat_Stud.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    r = requests.get(ebi_download + 'ancestry')
    with open(os.path.join(path, 'Cat_Anc.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    r = requests.get(ebi_download + 'full')
    with open(os.path.join(path, 'Cat_Full.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    requests_ftp.monkeypatch_session()
    s = requests.Session()
    ftpsite = 'ftp://ftp.ebi.ac.uk/'
    subdom = '/pub/databases/gwas/releases/latest/'
    file = 'gwas-efo-trait-mappings.tsv'
    r = s.get(ftpsite + subdom + file)
    with open(os.path.join(path, 'Cat_Map.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
def getftphtmlcontent(url):
    try:
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        with s.list(url) as r:
            print(datetime.datetime.now(), 'network status code:', r.status_code)
            if r.status_code == 226:
                content = r.content.decode(encoding='utf-8')
                print(datetime.datetime.now(), "content ", content)
                with open('requeste_data.txt', 'a', encoding='utf-8') as f:
                    data = json.dumps(dict(url=url, content=content)).strip()
                    f.write(data + ',')
                if content == None:
                    content = ''
                return content
            else:
                return None
    except Exception as e:
        logging.exception(e)
        print('error:', e)
        return None
def download():
    # url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000006665.1_ASM666v1/"
    frame = pd.read_csv("type_vi_organisms.csv")
    requests_ftp.monkeypatch_session()
    s = requests.Session()
    for index, row in frame.iterrows():
        if index < 150:
            continue
        response = s.list(row.ncbi)
        for line in response.text.split("\r\n"):
            if line:
                for entry in line.split(" "):
                    if entry.endswith(".gbff.gz"):
                        result = s.get(row.ncbi + "/" + entry)
                        # with gzip.open('/home/joe/file.txt.gz', 'wb') as f_out:
                        #     shutil.copyfileobj(f_in, f_out)
                        with open("C:\\Users\\ag3r\\Downloads\\genomes\\" + entry, "wb") as output:
                            output.write(result.content)
        print("%s organism %s downloaded" % (index, row.organism))
        sleep(5)
def url_to_local_path(url, path, rename=None):
    """
    Copies a file from an http url to a local destination provided in path.
    Performs file-to-folder conversion
    :param url:
    :param path:
    :return:
    """
    if isdir(path) and '.zip' not in url and '.tar' not in url:
        new_path = join(path, url.split('/')[-1])
        if rename is not None:
            new_path = join(path, rename)
        if not url[:3] == 'ftp':
            r = requests.get(url, stream=True)
        else:
            # print 'debug firing'
            requests_ftp.monkeypatch_session()
            s = requests.Session()
            r = s.get(url)
            # print r.status_code
            # print r.content
        if r.status_code in ['226', 200, 226, '200']:
            if not url[:3] == 'ftp':
                with open(new_path, 'wb') as f:
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
            else:
                with open(new_path, 'wb') as f:
                    f.write(r.content)
        else:
            print(r.status_code)
            raise Exception(
                "Something is wrong with the url provided: %s.\n Please attempt downloading files manually" % url)
def read_file(url):
    if not os.path.exists('./data'):
        os.makedirs('./data')
    file_name = url.split('/')[-1]
    file_path = './data/{}'.format(file_name)
    if not read_local or not os.path.isfile(file_path):
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        if url.startswith('ftp://'):
            reply = s.retr(url, stream=True)
        else:
            reply = s.get(url, stream=True)
        with open(file_path, 'wb') as f:
            for chunk in reply.iter_content(chunk_size=2048):
                if chunk:
                    f.write(chunk)
                    f.flush()
    if file_name.endswith('.gz'):
        f = gzip.open(file_path, 'rt')
    else:
        f = open(file_path, 'rt')
    cnt = 0
    while f:
        line = f.readline()
        if line is None or line == '':
            break
        cnt += 1
        if cnt % 100000 == 0:
            print('count: ', cnt)
        yield line
def ftp_fetch(context, data):
    url = data.get("url")
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get("username", "Anonymous")
    password = context.get("password", "anonymous@ftp")
    resource = urlparse(url).netloc or url
    # a bit weird to have a http rate limit while using ftp
    limit = context.get("http_rate_limit", settings.HTTP_RATE_LIMIT)
    limit = limit / 60  # per minute to per second for stricter enforcement
    rate_limit = get_rate_limit(resource, limit=limit, interval=1, unit=1)
    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule="pass", data=cached)
        return
    context.enforce_rate_limit(rate_limit)
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update(
            {
                "status_code": resp.status_code,
                "retrieved_at": datetime.utcnow().isoformat(),
                "content_hash": context.store_data(data=resp.content),
            }
        )
        context.set_tag(url, data)
        context.emit(rule="pass", data=data)
    else:
        context.enforce_rate_limit(rate_limit)
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data["url"] = os.path.join(url, child)
            context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule="child", data=child_data)
def url_to_local_p_gz(url, path):
    """
    Copies a file from an http or ftp url to a local destination provided in path
    :param url:
    :param path:
    :return:
    """
    if url[:3] == 'ftp':
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        r = s.retr(url)
    else:
        r = requests.get(url, stream=True)
    if r.status_code in ['226', 200, 226, '200']:
        r.raw.decode_content = True
        f_out = open(path, 'wb')
        f_in = gzip.GzipFile(fileobj=StringIO.StringIO((r.content)))
        f_out.writelines(f_in)
        f_out.close()
        f_in.close()
    else:
        raise Exception(
            "Something is wrong with the url provided: %s.\n Please attempt downloading files manually" % url)
def ftp_fetch(context, data):
    url = data.get('url')
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get('username', 'Anonymous')
    password = context.get('password', 'anonymous@ftp')
    resource = urlparse(url).netloc or url
    # a bit weird to have a http rate limit while using ftp
    limit = context.get('http_rate_limit', settings.HTTP_RATE_LIMIT)
    rate_limit = get_rate_limit(resource, limit=limit)
    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule='pass', data=cached)
        return
    rate_limit.comply()
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update({
            'status_code': resp.status_code,
            'retrieved_at': datetime.utcnow().isoformat(),
            'content_hash': context.store_data(data=resp.content)
        })
        context.set_tag(url, data)
        context.emit(rule='pass', data=data)
    else:
        rate_limit.comply()
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data['url'] = os.path.join(url, child)
            context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule='child', data=child_data)
import itertools
import re
import os      # needed by mkdirsp() below
import errno   # needed by mkdirsp() below

from os import mkdir
from hashlib import md5
from os.path import join, basename, exists, abspath, dirname, splitext
from urllib.parse import urlparse
from subprocess import check_output
from tempfile import mkstemp
from hashlib import sha1
from shutil import move
from time import time

import requests
import requests_ftp
requests_ftp.monkeypatch_session()

# HTTP timeout in seconds, used in various calls to requests.get() and requests.post()
_http_timeout = 180

from .compat import csvopen, csvDictWriter
from .conform import X_FIELDNAME, Y_FIELDNAME, GEOM_FIELDNAME


def mkdirsp(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
def check_url(url, auth, org_check):
    """Check whether the given URL is dead or alive.

    Returns a dict with four keys:

        "url": The URL that was checked (string)
        "alive": Whether the URL was working, True or False
        "status": The HTTP status code of the response from the URL,
            e.g. 200, 401, 500 (int)
        "reason": The reason for the success or failure of the check,
            e.g. "OK", "Unauthorized", "Internal Server Error" (string)

    The "status" may be None if we did not get a valid HTTP response,
    e.g. in the event of a timeout, DNS failure or invalid HTTP response.

    The "reason" will always be a string, but may be a requests library
    exception string rather than an HTTP reason string if we did not get
    a valid HTTP response.
    """
    data_provider_credentials = auth  # Auth for CMEMS
    result = {"url": url}
    try:
        if "ftp://" in url:
            # Connection for the FTP protocol
            print("Ftp Connection")
            requests_ftp.monkeypatch_session()  # Adds FTP helpers to requests.Session
            s = requests.Session()  # Session now carries the FTPAdapter
            response = s.get(url, auth=(data_provider_credentials[0],
                                        data_provider_credentials[1]))
        else:
            # Http/Https request
            s = requests.Session()
            if data_provider_credentials[0]:
                # Connection that needs auth
                print("Connection with credentials")
                s.auth = (data_provider_credentials[0], data_provider_credentials[1])
                response = s.get(url)
                time.sleep(3)
            else:
                # Connection that doesn't need auth
                print("Connection without credentials")
                response = s.get(url)
        result["status"] = response.status_code
        result["reason"] = response.reason
        response.raise_for_status()  # Raise if status_code is not OK.
        result["alive"] = True
    except AttributeError as err:
        if err.message == "'NoneType' object has no attribute 'encode'":
            # requests seems to throw these for some invalid URLs.
            result["alive"] = False
            result["reason"] = "Invalid URL"
            result["status"] = None
        else:
            raise
    except requests.exceptions.RequestException as err:
        result["alive"] = False
        if "reason" not in result:
            result["reason"] = str(err)
        if "status" not in result:
            # This can happen if the response is invalid HTTP, if we get a DNS
            # failure, or a timeout, etc.
            result["status"] = None

    # We should always have these four fields in the result.
    assert "url" in result
    assert result.get("alive") in (True, False)
    assert "status" in result
    assert "reason" in result

    return result
def __init__(self, host):
    requests_ftp.monkeypatch_session()
    self.s = requests.Session()
    self.host = host
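For illustration only, a hedged sketch of a download helper such a wrapper could grow; the method name and defaults are hypothetical (not from the original class), while retr is one of the FTP verbs that requests-ftp adds to the patched Session.

    # Hypothetical companion method (not part of the original class).
    def fetch(self, remote_path, local_path, user='anonymous', password='anonymous@'):
        # RETR the file over FTP; requests-ftp reports 226 on a completed transfer
        reply = self.s.retr(self.host + remote_path, auth=(user, password))
        with open(local_path, 'wb') as fh:
            fh.write(reply.content)
        return reply.status_code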
import requests
import requests_ftp
from myftp import StcakQueue

requests_ftp.monkeypatch_session()  # patch: adapts requests.Session for FTP use

url = 'ftp://*.*.*.*'  # the FTP root path
url_temp = url  # keep the path name before any re-encoding
url = url.encode('utf-8').decode('latin1')  # encode then decode so paths containing Chinese characters work
s = requests.Session()  # instantiate a session
res = s.list(url, auth=('', ''))  # connect to the FTP server and list the directory
res.encoding = 'utf-8'
url = url_temp  # restore the un-encoded path so later lookups still resolve
print(res.text)  # prints file type, file name, date, etc.; this is a single string

str = res.text.split('\r\n')  # split the listing on line breaks
queue = StcakQueue()  # queue holding the name of each entry
for i in range(0, len(str) - 1):  # walk every entry under the root directory
    a = str[i].split()  # split each line on whitespace
    b = a  # list of fields
    if b[0] == 'drwxr-xr-x':  # directories are pushed onto the queue
        if len(b) > 9:
            # rebuild the full name: file names may contain spaces, so the
            # name is everything from the 9th field onwards
            name = b[8]
            for i in range(9, len(b)):
                name += ' ' + b[i]
            queue.enqueue(name)  # push the file name
        else:
            queue.enqueue(b[8])

urls = StcakQueue()  # stores FTP paths
urls.enqueue(url)  # enqueue the starting path
temp = StcakQueue()  # used to swap queues
num = StcakQueue()  # counter
while queue.is_empty() == True:  # if the queue is empty, this level of names has been fully visited
    url_1 = urls.top()
def pytest_configure(config):
    requests_ftp.monkeypatch_session()
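A hedged illustration of what this conftest hook enables: once pytest_configure has patched the Session class, any test can use the FTP verbs on a plain requests.Session. The fixture name and URL below are hypothetical.

# Hypothetical test; ftp_server_url would be a fixture pointing at a local test FTP server.
def test_can_list_root(ftp_server_url):
    s = requests.Session()
    r = s.list(ftp_server_url)   # LIST verb added by requests-ftp
    assert r.status_code == 226  # FTP "transfer complete"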
def run_data_preparation(source_path, destination_path, dict_urls, kraken_usage):
    try:
        if not path.exists(destination_path):
            makedirs(destination_path)
        if (kraken_usage):
            dest_raw_path = str(
                Path(destination_path).resolve().joinpath('kraken'))
        else:
            dest_raw_path = str(
                Path(destination_path).resolve().joinpath('raw'))
        if not path.exists(dest_raw_path):
            makedirs(dest_raw_path)
        if not path.exists(source_path):
            makedirs(source_path)

        if len(dict_urls):
            # data come from URLs
            for new_file_name, url in dict_urls.items():
                # m1 = re.search('google\.com.+id=(.+)\&*', url)  ## Retired
                m1 = re.search('google\.com\/file\/d\/(.+)\/', url)
                if m1:
                    # Use GoogleDriveDownloader module
                    id = m1.group(1)
                    response = GoogleDriveDownloader.get_response(id)
                else:
                    # Direct download
                    if not re.match(r'^(http|https|ftp)://', url):
                        url = 'http://' + url
                    requests_ftp.monkeypatch_session()
                    session = requests.Session()
                    response = session.get(url, stream=True)
                # Check that the file exists
                if response.ok == False:
                    return -1
                else:
                    # Get the file name and extension
                    if m1:
                        m2 = re.search('filename="(.+)"',
                                       response.headers['Content-Disposition'])
                        file_name = m2.group(1)
                    else:
                        file_name = response.url.split('/')[-1]
                    extension = path.splitext(file_name)[1]
                    # Check file format
                    if new_file_name == 'reference.fa':
                        if not re.search('\.(fasta|fa|fna)+(\.gz)*$', file_name):
                            return -1
                    else:
                        if not re.search('\.f(ast)*q(\.gz)*$', file_name):
                            return -1
                    # Save a downloaded file to disk
                    if extension == '.gz':
                        with open(str(Path(source_path).resolve().joinpath(new_file_name)),
                                  "wb") as destination:
                            with gzip.GzipFile(fileobj=response.raw) as source:
                                shutil.copyfileobj(source, destination)
                    else:
                        with open(str(Path(source_path).resolve().joinpath(new_file_name)),
                                  "wb") as destination:
                            for chunk in response.iter_content(32768):
                                if chunk:  # filter out keep-alive new chunks
                                    destination.write(chunk)
        # save files to UID/raw
        # shutil.copy2(str(Path(source_path).resolve()), dest_raw_path)
        shutil.copy2(str(Path(source_path).resolve().joinpath('R1.fastq')), dest_raw_path)
        shutil.copy2(str(Path(source_path).resolve().joinpath('R2.fastq')), dest_raw_path)
        shutil.rmtree(source_path)
        return 0
    except Exception as e:
        print(e)
        return -1
import warnings

import numpy as np
import requests

import pandas.compat as compat
from pandas import Panel, DataFrame
from pandas import read_csv
from pandas.io.common import urlencode
from pandas.compat import StringIO, bytes_to_str
from pandas_datareader._utils import (RemoteDataError, SymbolWarning,
                                      _sanitize_dates, _init_session)

import requests_ftp
requests_ftp.monkeypatch_session()


class _BaseReader(object):
    """
    Parameters
    ----------
    sym : string with a single stock symbol (ticker).
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
class FTP_REQUESTS(object):
    '''
    Work with FTP files through requests; under the hood it is still FTP over sockets.
    Can serve as a reference for web projects (not that it is really necessary).
    '''
    import requests, requests_ftp
    requests_ftp.monkeypatch_session()
    s = requests.Session()

    def __init__(self, ftp_url: str = 'ftp://ftp.ptree.jaxa.jp',
                 username: str = '15174506817_163.com',
                 password: str = 'SP+wari8',
                 s: requests_ftp.ftp.FTPSession = s):
        self.s = s
        self.ftp_url = ftp_url
        self.username = username
        self.password = password

    # convert a time value into a URL
    def get_timeToUrl(self, ftp_url: str = None, tim=None):
        '''
        :param ftp_url: remote FTP path
        :param tim: time value
        :return: url; default return path: ftp://ftp.ptree.jaxa.jp/jma/hsd/
        '''
        strp = ''
        strf = ''
        if ftp_url == None:
            ftp_url = self.ftp_url + '/jma/hsd/'
        if tim == None:
            if bool(re.search(r'^[\d]{9,10}$', str(tim))):
                strp = "%Y%m%d%H"
                strf = '%Y%m/%d/%H/'
            elif bool(re.search(r'^[\d]{7,8}$', str(tim))):
                strp = "%Y%m%d"
                strf = '%Y%m/%d/'
            elif bool(re.search(r'\d{6}', str(tim))):
                strp = "%Y%m"
                strf = '%Y%m/'
        if strp != '' and strf != '':
            ftp_url = time.strftime(ftp_url + strf, time.strptime(str(tim), strp))
        return ftp_url

    # given a URL and a filter condition, return a list of FTP entries
    def get_ftp_urls(self, remotepath, conditions=None):
        '''
        :param remotepath: remote FTP path
        :param conditions: filter condition
        :return: list result
        '''
        socket.setdefaulttimeout(6)
        try:
            if remotepath == None:
                remotepath = self.ftp_url
            resp = self.s.list(remotepath, auth=(self.username, self.password))
            datas_urls = []
            if resp.status_code == 226:
                print('226 Transfer complete')
                if conditions is not None:
                    fliter_name = '.*' + conditions + '.*'
                    for i in resp.text.split('\n'):
                        s = re.finditer(fliter_name, i)
                        for i in s:
                            datas_urls.append(i.group())
                else:
                    for i in resp.text.split('\n'):
                        datas_urls.append(i)
            elif 400 <= resp.status_code < 500:
                if resp.status_code == 404:
                    print("Directory or file does not exist!")
                raise u'%s Client Error: %s for url: %s' % (resp.status_code, remotepath)
            return datas_urls
        except (socket.error, socket.gaierror):
            print("\033[0;32;40mERROR: connection timed out: [{}:{}]\033[0m".format('get_ftp_urls', remotepath))
            return None

    def download_file(self, ftp_file_path: str or FTPFileApi, dst_file_path):
        """
        Download a file from FTP to the local machine.
        :param ftp_file_path: the FTP file to download
        :param dst_file_path: local destination path
        :return:
        """
        if isinstance(ftp_file_path, FTPFileApi):
            remote_file = ftp_file_path.remotepath
            # total file size
            remote_file_size = ftp_file_path.size
        else:
            remote_file = ftp_file_path
            # total file size
            remote_file_size = self.s.size(remote_file, auth=(self.username, self.password))
            if 400 <= remote_file_size.status_code < 500:
                if remote_file_size.status_code == 404:
                    print("Directory or file does not exist!")
                    # raise (u'%s Client Error: %s for url: %s' % (remote_file_size.status_code, remote_file))
                return 0
            else:
                remote_file_size = int(remote_file_size.headers.get('Content-Length'))
        print('remote filesize [{}]'.format(remote_file_size))
        cmpsize = 0  # initial size already downloaded
        lsize = 0
        # check whether the local file exists and get its size
        # supports resuming interrupted downloads via a byte-range request
        if os.path.exists(dst_file_path):
            lsize = os.stat(dst_file_path).st_size
            if lsize >= remote_file_size:
                print('local file({}b) is bigger or equal remote file({}b)'.format(lsize, remote_file_size))
                return 1
        start = time.time()
        headers = {'Range': 'bytes={}-'.format(lsize)}
        retrs = self.s.retr(remote_file, auth=(self.username, self.password), headers=headers, stream=True)
        if 400 <= retrs.status_code < 500:
            if retrs.status_code == 404:
                print("Directory or file does not exist!")
                raise u'%s Client Error: %s for url: %s' % (retrs.status_code, remote_file)
            return 0
        with open(dst_file_path, "ab") as data:
            data.write(retrs.content)
        end = time.time()
        print(remote_file + ' finished! Elapsed time:', (end - start))