Example #1
def test_ftp(directory):
    path, port = directory
    ftp = FTP(path,
              host="127.0.0.1",
              listen_host="127.0.0.1",
              listen_port=port)

    assert (ftp.uri == "ftp://%s:%s/" % ("127.0.0.1", port))

    # Start the service; a second start should raise
    ftp.start()

    with pytest.raises(OPVDMException):
        ftp.start()

    # Check service is running
    assert (ftp.is_running())

    # Connect to server

    requests_ftp.monkeypatch_session()
    s = requests.Session()

    r = s.retr("%s%s" % (ftp.uri, "toto.txt"))

    assert (r.status_code == 226)
    assert (r.text == CONTENT_TEST)

    # Stop the FTP test service
    ftp.stop()

    assert (ftp.is_running() is False)
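
For context, here is a minimal sketch of the `directory` fixture this test assumes; the real suite's fixture may differ, and the port and payload below are illustrative:

import pytest

CONTENT_TEST = "hello"  # assumed payload served as toto.txt

@pytest.fixture
def directory(tmp_path):
    # Write the file the test retrieves, then hand back (path, port).
    (tmp_path / "toto.txt").write_text(CONTENT_TEST)
    return str(tmp_path), 2121  # 2121 is an assumed free port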
Example #2
def ftp_fetch(context, data):
    url = data.get('url')
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get('username', 'Anonymous')
    password = context.get('password', 'anonymous@ftp')

    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule='pass', data=cached)
        return

    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update({
            'status_code': resp.status_code,
            'retrieved_at': datetime.utcnow().isoformat(),
            'content_hash': context.store_data(data=resp.content)
        })
        context.set_tag(url, data)
        context.emit(rule='pass', data=data)
    else:
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data['url'] = os.path.join(url, child)
            # context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule='child', data=child_data)
Example #3
def make_http_request(url, **kwargs):
    """
    Makes http request using requests library
    :param url: URL to query
    :return: request Object
    """
    if 'requests_session' in kwargs:
        s = kwargs['requests_session']
    else:
        s = requests

    r = None
    try:
        r = s.get(url)
        # determine if a follow-up POST is required to set cookies
        if kwargs.get('set_cookies') == 'yes':
            cookies = dict(r.cookies)
            r = s.post(url, verify=True, cookies=cookies)
    except requests.exceptions.InvalidSchema:  # the url is ftp rather than http
        requests_ftp.monkeypatch_session()
        r = requests.Session().get(url)
    except requests.exceptions.ConnectionError:
        log.error("URL Connection Error for %s", url)
    try:
        if r is not None:
            r.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error('Error in URL request!')
    return r
Example #4
def fetch_url(url: str) -> bytes:
    logger = logging.getLogger(__name__)

    if url.startswith("http"):
        logger.debug("fetching url (http) \"%s\"", url)
        response = requests.get(url)
    elif url.startswith("ftp"):
        logger.debug("fetching url (ftp) \"%s\"", url)
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        response = s.get(url)
        s.close()
    else:
        logger.error("invalid url \"%s\"", url)
        raise requests.HTTPError("invalid url \"%s\"" % url)

    if response.ok:
        # fix for urls that return 200 instead of a 404
        if int(response.headers.get("Content-Length", 0)) < 1000:
            logger.error("Content-Length is very small "
                         "(url is most likely not a valid file)")
            raise requests.HTTPError("Content-Length is very small "
                                     "(url is most likely not a valid file)")

        return response.content
    else:
        logger.error("failed to fetch url \"%s\" (%i)", url,
                     response.status_code)
        raise requests.HTTPError("failed to fetch url \"%s\" (%i)"
                                 % (url, response.status_code))
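
A hedged usage sketch (the URL is a placeholder): fetch_url returns the raw bytes, so callers persist them directly.

data = fetch_url("ftp://ftp.example.com/pub/archive.tar.gz")
with open("archive.tar.gz", "wb") as f:
    f.write(data)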
Example #5
def download_data(file_name):
    requests_ftp.monkeypatch_session()
    resp = requests.Session().list('ftp://ftp.dd-wrt.com/betas/2020/')
    with open(file_name, 'w') as f:
        f.write(resp.content.decode('utf-8'))
    if file_name == new_path:
        compare_files()
Example #6
def scars(r0):
    try:
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        # this can be very slow
        r1 = s.get(r0)
        s.close()
        return r1.content
    except Exception:
        return None
Example #7
def ftp():
    proxies = {'https': random.choice(p_list)}
    print(proxies)
    requests_ftp.monkeypatch_session()
    with requests.Session() as s:
        s.proxies.update(proxies)
        # r = s.get(r'http://jsonip.com', headers=headers)
        # ip = r.json()['ip']
        # print('Your IP is', ip)
        resp = s.list('ftp://90.130.70.73/', auth=('anonymous', 'anonymous'))
        print(resp)
Example #8
    def download_a_file(self, file_name, file_address):
        """Download a specific file"""
        requests_ftp.monkeypatch_session()
        session_requests = requests.Session()
        try:
            res = session_requests.get(file_address, stream=True)
            with open(file_name, 'wb') as output:
                shutil.copyfileobj(res.raw, output)
            session_requests.close()
        except Exception:
            logging.exception('Failed to download {}.'.format(file_name))
            return False
        return True
Example #9
    def __init__(self, url, port=21, remote=None, filename=None, local='.', user='******', password='******'):
        super(self.__class__, self).__init__(url, port, remote, filename, local, user, password)

        if self.port is None:
            self.port = 21
        if self.user is None:
            self.user = '******'
        if self.password is None:
            self.password = '******'

        #self.ftp = ftplib.FTP()
        requests_ftp.monkeypatch_session()
        self.s = requests.Session()
Example #10
    def latest(self):
        """Download latest available copy of clinvar database in vcf format."""
        import requests
        import requests_ftp
        import gzip

        requests_ftp.monkeypatch_session()
        with requests.Session() as sess:
            resp = sess.get('{}/clinvar.vcf.gz'.format(self.base_url))

        self.rawdata = gzip.decompress(resp.content)

        return self.rawdata
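
A usage sketch, assuming the enclosing class is named ClinVar and base_url points at an FTP directory containing clinvar.vcf.gz (both names are assumptions):

db = ClinVar(base_url='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38')  # hypothetical constructor
vcf_bytes = db.latest()  # decompressed VCF as bytes
print(vcf_bytes[:200])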
Example #11
def confirm_urls(request):
    if request.method == 'POST':
        confirm_result = {'R1': False, 'R2':False, 'Rf': False, 'Lr': False, 'R1_err':'', 'R2_err':'', 'Rf_err':'', 'Lr_err':''}
        for key, val in request.POST.items():
            is_status_ok = False
            is_format_ok = False
            
            if key[:3] == 'url': # e.g., key = url_R1
                # m1 = re.search('google\.com.+id=(.+)\&*', val) ## Retired
                m1 = re.search('google\.com\/file\/d\/(.+)\/', val)
                if m1:
                    # use GoogleDriveDownloader module
                    id = m1.group(1)
                    response = GoogleDriveDownloader.get_response(id)
                else:
                    # direct download
                    if not re.match(r'^(http|https|ftp)://', val):
                        val = 'http://'+val
                    requests_ftp.monkeypatch_session()
                    session = requests.Session()
                    response = session.get(val, stream=True)
                    
                # Check that the file exists
                is_status_ok = response.ok
                if not is_status_ok:
                    confirm_result[key[-2:]+'_err'] = 'File Not Found'
                else:
                    # Check file format
                    if m1:
                        m2 = re.search('filename="(.+)"', response.headers['Content-Disposition'])
                        file_name = m2.group(1)
                    else:
                        file_name = response.url.split('/')[-1]
                    
                    if key[-2:] == 'Rf':
                        if re.search('\.(fasta|fa|fna)+(\.gz)*$', file_name):
                            is_format_ok = True
                        else:
                            is_format_ok = False
                            confirm_result[key[-2:]+'_err'] = 'Unknown File Format'
                    else: #R1/R2/Lr
                        if re.search('\.f(ast)*q(\.gz)*$', file_name):
                            is_format_ok = True
                        else:
                            is_format_ok = False
                            confirm_result[key[-2:]+'_err'] = 'Unknown File Format'
                        
                confirm_result[key[-2:]] = is_status_ok and is_format_ok

        return HttpResponse(json.dumps(confirm_result))
Example #12
def make_http_request(url):
    r = None
    try:
        r = requests.get(url)
    except requests.exceptions.InvalidSchema:  # the url is ftp rather than http
        requests_ftp.monkeypatch_session()
        r = requests.Session().get(url)
    except requests.exceptions.ConnectionError:
        log.error("URL Connection Error for " + url)
    try:
        if r is not None:
            r.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error('Error in URL request!')
    return r
Example #13
    def _crawl_urls_ftp(self, url, provider):
        """
        Check if a file is present on an FTP server and return the appropriate
        status code.
        """
        # We need to be able to mock this for testing and requests-mock doesn't
        # work with requests-ftp, so this is our workaround. We'll just bypass
        # this method like so (the real method returns either an int or None):
        test_ftp_status = self.source_config.get('test_ftp_status')
        if test_ftp_status == 'ok':
            return 10000
        elif test_ftp_status == 'error':
            return None

        # And now here's the real method:
        timeout = self.source_config['timeout']
        username = self.source_config['username']
        password = self.source_config['password']

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            requests_ftp.monkeypatch_session()
            s = requests.Session()
            r = s.size(url,
                       auth=HTTPBasicAuth(username, password),
                       timeout=timeout)
            status_code = r.status_code
            elapsed = r.elapsed.total_seconds()
        except (ConnectTimeout, ReadTimeout) as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            status_code = 408
            elapsed = 9999

        if status_code == 213:
            size = int(r.text)
        else:
            size = None

        if status_code not in {213, 408}:
            self._save_gather_error('{} error: {}'.format(status_code, r.text),
                                    self.job)

        self.provider_logger.info(
            log_message.format(provider, timestamp, status_code, elapsed))
        return size
Example #14
    def get_cds_url(self):
        """Return the download URL of the CDS FASTA for this species."""
        TOPLEVEL = re.compile(r'{0}.*?\.cds\.all\.fa\.gz'.format(self.species.capitalize()))
        url_2nd = "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/fasta/"
        url_3rd_page = url_2nd + self.species.lower() + "/cds/"
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        url_page = s.list(
            "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/fasta/{0}/cds".format(self.species.lower()))
        url_page.encoding = 'utf-8'
        download_url = "".join(re.findall(TOPLEVEL, url_page.text))
        url = url_3rd_page + download_url
        print(url)
        return url
Example #15
def download_cat(data_path, ebi_download):
    """ download the data from the ebi main site and ftp"""
    try:
        r = requests.get(ebi_download + 'studies_alternative')
        if r.status_code == 200:
            catstud_name = r.headers['Content-Disposition'].split('=')[1]
            with open(
                    os.path.join(data_path, 'catalog', 'raw', 'Cat_Stud.tsv'),
                    'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catstud_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_Stud file...')
        r = requests.get(ebi_download + 'ancestry')
        if r.status_code == 200:
            catanc_name = r.headers['Content-Disposition'].split('=')[1]
            with open(os.path.join(data_path, 'catalog', 'raw', 'Cat_Anc.tsv'),
                      'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catanc_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_Anc file...')
        r = requests.get(ebi_download + 'full')
        if r.status_code == 200:
            catfull_name = r.headers['Content-Disposition'].split('=')[1]
            with open(
                    os.path.join(data_path, 'catalog', 'raw', 'Cat_Full.tsv'),
                    'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catfull_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_Full file...')
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        ftpsite = 'ftp://ftp.ebi.ac.uk/'
        subdom = '/pub/databases/gwas/releases/latest/'
        file = 'gwas-efo-trait-mappings.tsv'
        r = s.get(ftpsite + subdom + file)
        if r.status_code == 200:
            with open(os.path.join(data_path, 'catalog', 'raw', 'Cat_Map.tsv'),
                      'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded efo-trait-mapping!')
        else:
            diversity_logger.debug('Problem downloading efo-trait-mappings...')
    except Exception as e:
        diversity_logger.debug('Problem downloading Catalog data!' + str(e))
Example #16
    def get_gff3_url(self):
        """Return the download URL of the GFF3 annotation for this species."""
        TOPLEVEL = re.compile(r'{0}.*?\.gff3\.gz'.format(self.species.capitalize()))
        url_2nd = "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/gff3/"
        url_3rd_page = url_2nd + self.species.lower()
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        url_page = s.list(
            "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/gff3/{0}".format(self.species.lower()))
        url_page.encoding = 'utf-8'
        download_url = re.findall(TOPLEVEL, url_page.text)[-1]
        print(download_url)
        url = url_3rd_page + "/" + download_url
        print(url)
        return url
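
Both URL helpers return a complete FTP URL; a sketch of downloading it with the same monkeypatched session (the `genome` object below is an assumption standing in for an instance of the enclosing class):

requests_ftp.monkeypatch_session()
s = requests.Session()
url = genome.get_gff3_url()  # e.g. ".../arabidopsis_thaliana/<name>.gff3.gz"
r = s.retr(url)              # requests_ftp's RETR verb fetches the file
with open(url.split('/')[-1], 'wb') as f:
    f.write(r.content)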
Example #17
def createForecastCSV(folderEntry):
    if folderEntry.get() != '' and os.path.exists(os.path.dirname(folderEntry.get())):
        dateTimeObj = datetime.now()
        dateTimeObj = dateTimeObj - timedelta(hours=int(dateTimeObj.strftime("%H")))
        dayString = dateTimeObj.strftime("%d")
        monthString = dateTimeObj.strftime("%m")
        yearString = dateTimeObj.strftime("%Y")
        url = 'http://ftp1.cptec.inpe.br/modelos/tempo/WRF/ams_05km/recortes/grh/json/' + yearString + '/' + monthString + '/' + dayString + '/00/225.json'
        requests_ftp.monkeypatch_session()
        response = requests.get(url)
        print(response)
        data = response.text
        print(data)
        weather = json.loads(data)

        hora = int(dateTimeObj.strftime("%H"))
        print(str(hora))
        print(str(dateTimeObj))
        timestampStr = dateTimeObj.strftime("%d%b%Y %H")
        
        print('Current Timestamp : ', timestampStr)

        fileOutput = folderEntry.get() + '/forecast.csv'
        with open(fileOutput, 'w', newline='') as outputFile:
            output = csv.writer(outputFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            datasets = weather["datasets"][0]
            data = datasets["data"] #load json content
            outputFile.write("B,,PROSA\n")
            outputFile.write("C,UTC,PRECIP-INC\n")
            outputFile.write("E,,1HOUR\n")
            outputFile.write("F,,OBS\n")
            outputFile.write("Units,,MM\n")
            outputFile.write("Type,,PER-CUM\n")

            for i,row in enumerate(data):
                print(str(hora + i))
                outputFile.write(str(i+1) + "," + timestampStr + "00" +', '+ str(row["prec"]))
                outputFile.write('\n')
                dateTimeObj = dateTimeObj + timedelta(hours=1)
                timestampStr = dateTimeObj.strftime("%d%b%Y %H")

    elif folderEntry.get() == '':
        messagebox.showinfo('Error', 'Please Select the Destination Folder!')
    elif not os.path.exists(os.path.dirname(folderEntry.get())):
        messagebox.showinfo('Error', 'Destination Folder Doesn\'t Exist!')
Example #18
def download_cat(path, ebi_download):
    """ download the data from the ebi main site and ftp"""
    r = requests.get(ebi_download + 'studies_alternative')
    with open(os.path.join(path, 'Cat_Stud.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    r = requests.get(ebi_download + 'ancestry')
    with open(os.path.join(path, 'Cat_Anc.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    r = requests.get(ebi_download + 'full')
    with open(os.path.join(path, 'Cat_Full.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    requests_ftp.monkeypatch_session()
    s = requests.Session()
    ftpsite = 'ftp://ftp.ebi.ac.uk/'
    subdom = '/pub/databases/gwas/releases/latest/'
    file = 'gwas-efo-trait-mappings.tsv'
    r = s.get(ftpsite + subdom + file)
    with open(os.path.join(path, 'Cat_Map.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
Example #19
def getftphtmlcontent(url):
    try:
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        with s.list(url) as r:
            print(datetime.datetime.now(), 'response code:', r.status_code)
            if r.status_code == 226:
                content = r.content.decode(encoding='utf-8')
                print(datetime.datetime.now(), "content ", content)
                with open('requeste_data.txt', 'a', encoding='utf-8') as f:
                    data = json.dumps(dict(url=url, content=content)).strip()
                    f.write(data + ',')
                if content is None:
                    content = ''
                return content
            else:
                return None
    except Exception as e:
        logging.exception(e)
        print('error:', e)
        return None
Example #20
def download():
    #url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000006665.1_ASM666v1/"
    frame = pd.read_csv("type_vi_organisms.csv")
    requests_ftp.monkeypatch_session()
    s = requests.Session()

    for index, row in frame.iterrows():
        if index < 150:
            continue
        response = s.list(row.ncbi)
        for line in response.text.split("\r\n"):
            if line:
                for entry in line.split(" "):
                    if entry.endswith(".gbff.gz"):
                        result = s.get(row.ncbi + "/" + entry)
                        with open("C:\\Users\\ag3r\\Downloads\\genomes\\" + entry, "wb") as output:
                            output.write(result.content)
                        print("%s organism %s downloaded" % (index, row.organism))
                        sleep(5)
Example #21
def url_to_local_path(url, path, rename=None):
    """
    Copies a file from an http url to a local destination provided in path.
    Performs file-to-folder conversion.
    :param url:
    :param path:
    :return:
    """
    new_path = path  # fall back to the given path when it is not a directory
    if isdir(path) and '.zip' not in url and '.tar' not in url:
        new_path = join(path, url.split('/')[-1])

        if rename is not None:
            new_path = join(path, rename)

    if not url[:3] == 'ftp':
        r = requests.get(url, stream=True)
    else:
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        r = s.get(url)

    if r.status_code in (200, 226):
        if not url[:3] == 'ftp':
            with open(new_path, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        else:
            with open(new_path, 'wb') as f:
                f.write(r.content)
    else:
        print(r.status_code)
        raise Exception(
            "Something is wrong with the url provided: %s.\n Please attempt downloading files manually" %
            url)
Example #22
    def read_file(url):
        if not os.path.exists('./data'):
            os.makedirs('./data')

        file_name = url.split('/')[-1]
        file_path = './data/{}'.format(file_name)

        if not read_local or not os.path.isfile(file_path):
            requests_ftp.monkeypatch_session()
            s = requests.Session()

            if url.startswith('ftp://'):
                reply = s.retr(url, stream=True)
            else:
                reply = s.get(url, stream=True)

            with open(file_path, 'wb') as f:
                for chunk in reply.iter_content(chunk_size=2048):
                    if chunk:
                        f.write(chunk)
                        f.flush()

        if file_name.endswith('.gz'):
            f = gzip.open(file_path, 'rt')
        else:
            f = open(file_path, 'rt')

        cnt = 0
        for line in f:
            cnt += 1
            if cnt % 100000 == 0:
                print('count: ', cnt)
            yield line
        f.close()
Example #23
def ftp_fetch(context, data):
    url = data.get("url")
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get("username", "Anonymous")
    password = context.get("password", "anonymous@ftp")

    resource = urlparse(url).netloc or url
    # a bit weird to have an HTTP rate limit while using FTP
    limit = context.get("http_rate_limit", settings.HTTP_RATE_LIMIT)
    limit = limit / 60  # per minute to per second for stricter enforcement
    rate_limit = get_rate_limit(resource, limit=limit, interval=1, unit=1)

    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule="pass", data=cached)
        return

    context.enforce_rate_limit(rate_limit)
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update(
            {
                "status_code": resp.status_code,
                "retrieved_at": datetime.utcnow().isoformat(),
                "content_hash": context.store_data(data=resp.content),
            }
        )
        context.set_tag(url, data)
        context.emit(rule="pass", data=data)
    else:
        context.enforce_rate_limit(rate_limit)
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data["url"] = os.path.join(url, child)
            context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule="child", data=child_data)
Example #25
def url_to_local_p_gz(url, path):
    """
    Copies a file from an http or ftp url to a local destination provided in path.
    :param url:
    :param path:
    :return:
    """
    if url[:3] == 'ftp':
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        r = s.retr(url)
    else:
        r = requests.get(url, stream=True)
    if r.status_code in (200, 226):
        # decompress the gzipped payload into the destination (requires: import io, gzip)
        with open(path, 'wb') as f_out:
            with gzip.GzipFile(fileobj=io.BytesIO(r.content)) as f_in:
                f_out.writelines(f_in)
    else:
        raise Exception(
            "Something is wrong with the url provided: %s.\n Please attempt downloading files manually" %
            url)
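
Reading r.content buffers the whole download in memory before decompressing. A streaming sketch for the HTTP branch, under the same status-code convention (the FTP branch is omitted for brevity):

import gzip
import shutil

import requests

def url_to_local_p_gz_streaming(url, path):
    r = requests.get(url, stream=True)
    if r.status_code in (200, 226):
        r.raw.decode_content = True  # undo any transfer encoding on the fly
        with gzip.GzipFile(fileobj=r.raw) as f_in, open(path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)  # decompress in chunks
    else:
        raise Exception("Bad status %s for url: %s" % (r.status_code, url))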
Example #27
def ftp_fetch(context, data):
    url = data.get('url')
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get('username', 'Anonymous')
    password = context.get('password', 'anonymous@ftp')

    resource = urlparse(url).netloc or url
    # a bit weird to have an HTTP rate limit while using FTP
    limit = context.get('http_rate_limit', settings.HTTP_RATE_LIMIT)
    rate_limit = get_rate_limit(resource, limit=limit)

    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule='pass', data=cached)
        return

    rate_limit.comply()
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update({
            'status_code': resp.status_code,
            'retrieved_at': datetime.utcnow().isoformat(),
            'content_hash': context.store_data(data=resp.content)
        })
        context.set_tag(url, data)
        context.emit(rule='pass', data=data)
    else:
        rate_limit.comply()
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data['url'] = os.path.join(url, child)
            context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule='child', data=child_data)
Example #28
import errno
import itertools
import os
import re

from os import mkdir
from hashlib import md5, sha1
from os.path import join, basename, exists, abspath, dirname, splitext
from urllib.parse import urlparse
from subprocess import check_output
from tempfile import mkstemp
from shutil import move
from time import time

import requests
import requests_ftp
requests_ftp.monkeypatch_session()

# HTTP timeout in seconds, used in various calls to requests.get() and requests.post()
_http_timeout = 180

from .compat import csvopen, csvDictWriter
from .conform import X_FIELDNAME, Y_FIELDNAME, GEOM_FIELDNAME

def mkdirsp(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
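
On Python 3.2+, mkdirsp can be replaced by the standard library's exist_ok flag:

def mkdirsp(path):
    os.makedirs(path, exist_ok=True)  # no error if the directory already exists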
Example #29
def check_url(url, auth, org_check):
    """Check whether the given URL is dead or alive.

    Returns a dict with four keys:

        "url": The URL that was checked (string)
        "alive": Whether the URL was working, True or False
        "status": The HTTP status code of the response from the URL,
            e.g. 200, 401, 500 (int)
        "reason": The reason for the success or failure of the check,
            e.g. "OK", "Unauthorized", "Internal Server Error" (string)

    The "status" may be None if we did not get a valid HTTP response,
    e.g. in the event of a timeout, DNS failure or invalid HTTP response.

    The "reason" will always be a string, but may be a requests library
    exception string rather than an HTTP reason string if we did not get a valid
    HTTP response.

    """
    data_provider_credentials = auth  #Auth for CMEMS

    result = {"url": url}
    try:
        if "ftp://" in url:  #Connection for FTP protocol
            print("Ftp Connection")
            requests_ftp.monkeypatch_session()  #Adds helpers for FTPConnection
            s = requests.Session()  #Raises request session with FTPAdapter
            response = s.get(url,
                             auth=(data_provider_credentials[0],
                                   data_provider_credentials[1]))
        else:  #Http/Https request
            s = requests.Session()
            if data_provider_credentials[0]:  #Connection that needs auth
                print("Connection with credentials")
                s.auth = (data_provider_credentials[0],
                          data_provider_credentials[1])
                response = s.get(url)
                time.sleep(3)
            else:  #Connection that doesn't need auth
                print("Connection without credentials")
                response = s.get(url)
        result["status"] = response.status_code
        result["reason"] = response.reason
        response.raise_for_status()  # Raise if status_code is not OK.
        result["alive"] = True
    except AttributeError as err:
        if str(err) == "'NoneType' object has no attribute 'encode'":
            # requests seems to throw these for some invalid URLs
            # (err.message only exists on Python 2, so compare str(err)).
            result["alive"] = False
            result["reason"] = "Invalid URL"
            result["status"] = None
        else:
            raise
    except requests.exceptions.RequestException as err:
        result["alive"] = False
        if "reason" not in result:
            result["reason"] = str(err)
        if "status" not in result:
            # This can happen if the response is invalid HTTP, if we get a DNS
            # failure, or a timeout, etc.
            result["status"] = None

    # We should always have these four fields in the result.
    assert "url" in result
    assert result.get("alive") in (True, False)
    assert "status" in result
    assert "reason" in result
    return result
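
A usage sketch (the URL and credentials are placeholders; org_check is unused in the code shown above):

result = check_url('ftp://ftp.example.com/data.nc', auth=('user', 'pass'), org_check=None)
print(result['alive'], result['status'], result['reason'])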
Example #30
    def __init__(self, host):
        requests_ftp.monkeypatch_session()
        self.s = requests.Session()
        self.host = host
Example #31
import requests
import requests_ftp
from myftp import StcakQueue
requests_ftp.monkeypatch_session()  # patch requests.Session with the FTP verbs
url = 'ftp://*.*.*.*'  # the FTP root path
url_temp = url  # keep the original path before re-encoding
url = url.encode('utf-8').decode('latin1')  # re-encode so non-ASCII characters in the path survive
s = requests.Session()
res = s.list(url, auth=('', ''))  # connect to the FTP server and list the root
res.encoding = 'utf-8'
url = url_temp  # restore the original path so later lookups still resolve
print(res.text)  # one string holding file type, name, date, etc.
lines = res.text.split('\r\n')  # split the listing into lines
queue = StcakQueue()  # queue holding one entry per file
for i in range(0, len(lines) - 1):  # collect every directory under the root
    fields = lines[i].split()  # split each listing line on whitespace
    if fields[0] == 'drwxr-xr-x':  # directories get enqueued
        if len(fields) > 9:  # names may contain spaces: everything from field 8 onward is the name
            name = fields[8]
            for j in range(9, len(fields)):
                name += ' ' + fields[j]
            queue.enqueue(name)  # enqueue the reassembled name
        else:
            queue.enqueue(fields[8])
urls = StcakQueue()  # queue of FTP paths
urls.enqueue(url)  # start from the root path
temp = StcakQueue()  # scratch queue used when swapping levels
num = StcakQueue()  # counter queue
while not queue.is_empty():  # once the queue empties, this level's names have all been visited
    url_1 = urls.top()
Example #32
def pytest_configure(config):
    requests_ftp.monkeypatch_session()
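
Patching in pytest_configure means every requests.Session created inside the tests already carries the FTP verbs. A hedged sketch of a test that relies on this (the server URL is a placeholder):

import requests

def test_ftp_listing():
    s = requests.Session()  # already patched, so .list()/.retr() exist
    resp = s.list('ftp://ftp.example.com/', auth=('anonymous', 'anonymous'))
    assert resp.status_code == 226  # FTP "transfer complete"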
Example #33
def run_data_preparation(source_path, destination_path, dict_urls,
                         kraken_usage):
    try:
        if not path.exists(destination_path):
            makedirs(destination_path)
        if kraken_usage:
            dest_raw_path = str(
                Path(destination_path).resolve().joinpath('kraken'))
        else:
            dest_raw_path = str(
                Path(destination_path).resolve().joinpath('raw'))

        if not path.exists(dest_raw_path):
            makedirs(dest_raw_path)
        if not path.exists(source_path):
            makedirs(source_path)

        if len(dict_urls):  # data comes from URLs
            for new_file_name, url in dict_urls.items():
                # m1 = re.search('google\.com.+id=(.+)\&*', url) ## Retired
                m1 = re.search('google\.com\/file\/d\/(.+)\/', url)
                if m1:
                    # Use GoogleDriveDownloader module
                    id = m1.group(1)
                    response = GoogleDriveDownloader.get_response(id)
                else:
                    # Direct download
                    if not re.match(r'^(http|https|ftp)://', url):
                        url = 'http://' + url
                    requests_ftp.monkeypatch_session()
                    session = requests.Session()
                    response = session.get(url, stream=True)

                # Check that the file exists
                if not response.ok:
                    return -1
                else:
                    # Get the file name and extension
                    if m1:
                        m2 = re.search('filename="(.+)"',
                                       response.headers['Content-Disposition'])
                        file_name = m2.group(1)
                    else:
                        file_name = response.url.split('/')[-1]
                    extension = path.splitext(file_name)[1]

                    # Check file format
                    if new_file_name == 'reference.fa':
                        if not re.search('\.(fasta|fa|fna)+(\.gz)*$',
                                         file_name):
                            return -1
                    else:
                        if not re.search('\.f(ast)*q(\.gz)*$', file_name):
                            return -1

                    # Save a downloaded file to disk
                    if extension == '.gz':
                        with open(
                                str(
                                    Path(source_path).resolve().joinpath(
                                        new_file_name)), "wb") as destination:
                            with gzip.GzipFile(fileobj=response.raw) as source:
                                shutil.copyfileobj(source, destination)
                    else:
                        with open(
                                str(
                                    Path(source_path).resolve().joinpath(
                                        new_file_name)), "wb") as destination:
                            for chunk in response.iter_content(32768):
                                if chunk:  # filter out keep-alive new chunks
                                    destination.write(chunk)

        #save files to UID/raw
        #shutil.copy2(str(Path(source_path).resolve()), dest_raw_path)
        shutil.copy2(str(Path(source_path).resolve().joinpath('R1.fastq')),
                     dest_raw_path)
        shutil.copy2(str(Path(source_path).resolve().joinpath('R2.fastq')),
                     dest_raw_path)
        shutil.rmtree(source_path)
        return 0

    except Exception as e:
        print(e)
        return -1
Example #34
import warnings
import numpy as np

import requests

import pandas.compat as compat
from pandas import Panel, DataFrame
from pandas import read_csv
from pandas.io.common import urlencode
from pandas.compat import StringIO, bytes_to_str

from pandas_datareader._utils import (RemoteDataError, SymbolWarning,
                                      _sanitize_dates, _init_session)

import requests_ftp
requests_ftp.monkeypatch_session()


class _BaseReader(object):

    """

    Parameters
    ----------
        sym : string with a single stock symbol (ticker).
        start : string, (defaults to '1/1/2010')
                Starting date, timestamp. Parses many different kind of date
                representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
        end : string, (defaults to today)
                Ending date, timestamp. Same format as starting date.
        retry_count : int, default 3
Example #35
class FTP_REQUESTS(object):
    '''
        Operate on FTP files through requests.
        Under the hood it is still FTP over sockets.
        Can serve as a reference for web projects (not that it is really necessary).
    '''
    import requests, requests_ftp
    requests_ftp.monkeypatch_session()
    s = requests.Session()

    def __init__(self, ftp_url: str = 'ftp://ftp.ptree.jaxa.jp', username: str = '15174506817_163.com',
                 password: str = 'SP+wari8', s: requests_ftp.ftp.FTPSession = s):
        self.s = s
        self.ftp_url = ftp_url
        self.username = username
        self.password = password

    # Convert a timestamp into an FTP URL
    def get_timeToUrl(self, ftp_url: str = None, tim=None):
        '''
        :param ftp_url: FTP remote path
        :param tim: timestamp
        :return: url
        Default return path: ftp://ftp.ptree.jaxa.jp/jma/hsd/
        '''
        strp = ''
        strf = ''
        if ftp_url is None:
            ftp_url = self.ftp_url + '/jma/hsd/'
        if tim is not None:  # original checked "tim == None", which skipped the conversion whenever a time was given
            if bool(re.search(r'^[\d]{9,10}$', str(tim))):
                strp = "%Y%m%d%H"
                strf = '%Y%m/%d/%H/'
            elif bool(re.search(r'^[\d]{7,8}$', str(tim))):
                strp = "%Y%m%d"
                strf = '%Y%m/%d/'
            elif bool(re.search(r'\d{6}', str(tim))):
                strp = "%Y%m"
                strf = '%Y%m/'
            if strp != '' and strf != '':
                ftp_url = time.strftime(ftp_url + strf, time.strptime(str(tim), strp))
        return ftp_url

    # List an FTP url, optionally filtered, and return the matching entries
    def get_ftp_urls(self, remotepath, conditions=None):
        '''
        :param remotepath: FTP remote path
        :param conditions: filter pattern
        :return: list of results
        '''
        socket.setdefaulttimeout(6)
        try:
            if remotepath is None:
                remotepath = self.ftp_url
            resp = self.s.list(remotepath, auth=(self.username, self.password))
            datas_urls = []
            if resp.status_code == 226:
                print('226  Transfer complete')
                if conditions is not None:
                    filter_name = '.*' + conditions + '.*'
                    for line in resp.text.split('\n'):
                        for match in re.finditer(filter_name, line):
                            datas_urls.append(match.group())
                else:
                    for line in resp.text.split('\n'):
                        datas_urls.append(line)
            elif 400 <= resp.status_code < 500:
                if resp.status_code == 404:
                    print("Directory or file does not exist!")
                raise requests.HTTPError('%s Client Error for url: %s' % (resp.status_code, remotepath))
            return datas_urls
        except(socket.error, socket.gaierror):
            print("\033[0;32;40mERROR: 链接超时: [{}:{}]\033[0m".format('get_ftp_urls', remotepath))
        return None

    def download_file(self, ftp_file_path: str or FTPFileApi, dst_file_path):
        """
        从ftp下载文件到本地
        :param ftp_file_path: ftp下载文件
        :param dst_file_path: 本地存放
        :return:
        """
        if isinstance(ftp_file_path, FTPFileApi):
            remote_file = ftp_file_path.remotepath
            # total file size
            remote_file_size = ftp_file_path.size
        else:
            remote_file = ftp_file_path
            # total file size
            remote_file_size = self.s.size(remote_file,
                                           auth=(self.username, self.password))
        if 400 <= remote_file_size.status_code < 500:
            if remote_file_size.status_code == 404:
                print("Directory or file does not exist!")
                # raise requests.HTTPError('%s Client Error for url: %s' % (remote_file_size.status_code, remote_file))
            return 0
        else:
            remote_file_size = int(remote_file_size.headers.get('Content-Length'))
            print('remote filesize [{}]'.format(remote_file_size))
        cmpsize = 0  # bytes downloaded so far
        lsize = 0
        # check whether the local file already exists and get its size,
        # so an interrupted download can be resumed from that offset
        if os.path.exists(dst_file_path):
            lsize = os.stat(dst_file_path).st_size
            if lsize >= remote_file_size:
                print('local file({}b) is bigger or equal remote file({}b)'.format(lsize, remote_file_size))
                return 1
        start = time.time()

        headers = {'Range': 'bytes={}-'.format(lsize)}

        retrs = self.s.retr(remote_file,
                            auth=(self.username, self.password), headers=headers, stream=True)

        if 400 <= retrs.status_code < 500:
            if retrs.status_code == 404:
                print("Directory or file does not exist!")
            raise requests.HTTPError('%s Client Error for url: %s' % (retrs.status_code, remote_file))

        with open(dst_file_path, "ab") as data:
            data.write(retrs.content)
        end = time.time()
        print(remote_file + ' done! elapsed:', (end - start))
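
A usage sketch for the class above; the server URL and credentials are placeholders:

ftp = FTP_REQUESTS(ftp_url='ftp://ftp.example.com', username='user', password='pass')
listing = ftp.get_ftp_urls('ftp://ftp.example.com/pub/', conditions='2020')  # entries matching "2020"
ftp.download_file('ftp://ftp.example.com/pub/data.bin', 'data.bin')  # attempts resume via a Range header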