def read_pages(self):
    pages = []
    k = Key(FileInfo.objects.s3_bucket)
    k.key = self.object_key + "/" + self.sha1
    # The first 100 bytes are the SQLite header: the page size is a big-endian
    # 2-byte value at offset 16, the page count a 4-byte value at offset 28.
    h = k.read(100)
    page_size = int(h[16:18].encode("hex"), 16)
    n_pages = int(h[28:32].encode("hex"), 16)
    # Page 1 includes the header, so read the remainder of it before looping.
    page1 = h + k.read(page_size - 100)
    pages.append(page1)
    for i in range(2, n_pages + 1):
        page = k.read(page_size)
        pages.append(page)
    return pages
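# The same header fields can be unpacked with struct instead of hex-string
# round-trips. A minimal sketch against a local copy of the database; the
# file path is a placeholder, not something used by read_pages() above.
import struct

with open("example.db", "rb") as f:
    header = f.read(100)
page_size = struct.unpack(">H", header[16:18])[0]   # 2-byte big-endian page size at offset 16
n_pages = struct.unpack(">I", header[28:32])[0]     # 4-byte big-endian page count at offset 28
print(page_size, n_pages)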
def login():
    error = ''
    file_contents = ''
    global login_failed
    keys = []
    if login_failed < 2:
        if request.method == 'POST':
            username_form = request.form['username']
            bucket = conn.get_bucket(config.buck_name, validate=True)
            k = Key(bucket)
            k.key = 'auth_users.txt'
            k.open()
            file_contents = k.read()
            if username_form in file_contents:
                session['logged_in'] = True
                session['username'] = username_form
                # session.permanent = True
                # app.permanent_session_lifetime = timedelta(seconds=300)
                return render_template('upload_db.html', username=session['username'])
            else:
                login_failed = login_failed + 1
                error += "Invalid Username. Login Again"
                return render_template('welcome.html', error=error)
    else:
        login_failed = 0
        error = 'You have exceeded the maximum number of failed login attempts. Locked out. Try again after 30 mins.'
        return render_template('welcome.html', error=error)
def s3_url(self, is_compressed=False):
    k = Key(FileInfo.objects.s3_bucket)
    if self.is_diff_file:
        k.key = self.object_key + "/diff/" + self.diff_name
        if is_compressed:
            k_compressed = FileInfo.objects.s3_bucket.get_key(self.object_key + "/diff/gz/" + self.diff_name)
            if k_compressed:
                k = k_compressed
            else:
                zipstream = StringIO.StringIO()
                gzipper = gzip.GzipFile(mode="w", fileobj=zipstream)
                gzipper.write(k.read())
                gzipper.close()
                k.key = self.object_key + "/diff/gz/" + self.diff_name
                k.metadata.update({
                    "Content-Type": str("application/sqlite3-diff"),
                    "Content-Disposition": str("attachment;filename=" + self.name() + "-diff"),
                    "Content-Encoding": str("gzip"),
                })
                k.set_contents_from_string(zipstream.getvalue())
                k = FileInfo.objects.s3_bucket.get_key(self.object_key + "/diff/gz/" + self.diff_name)
    else:
        k.key = self.object_key + "/" + self.sha1
        if is_compressed:
            k_compressed = FileInfo.objects.s3_bucket.get_key(self.object_key + "/gz/" + self.sha1)
            if k_compressed:
                k = k_compressed
            else:
                zipstream = StringIO.StringIO()
                gzipper = gzip.GzipFile(mode="w", fileobj=zipstream)
                gzipper.write(k.read())
                gzipper.close()
                k.key = self.object_key + "/gz/" + self.sha1
                k.metadata.update({
                    "Content-Type": str(self.file_format.mime_type.text),
                    "Content-Disposition": str("attachment;filename=" + self.name()),
                    "Content-Encoding": str("gzip"),
                })
                k.set_contents_from_string(zipstream.getvalue())
                k = FileInfo.objects.s3_bucket.get_key(self.object_key + "/gz/" + self.sha1)
    return k.generate_url(3600, "GET")
def Get_Object_Metatags(self, obj, bucket):
    """
    Returns a dictionary of metatag keys and values for the given object

    _obj_: target object
    _bucket_: bucket containing object
    """
    s3bucket = self._conn.get_bucket(bucket)
    try:
        s3obj = Key(s3bucket, obj)
        logger.debug("Read 1 byte of object %s" % obj)
        # Reading a single byte opens the key, which populates its metadata.
        s3obj.read(size=1)
    except S3ResponseError as err:
        if err.status == 404:
            # Fall back to treating the object as a "directory" key.
            dirobj = obj + '/'
            logger.debug("Could not find %s, trying %s" % (obj, dirobj))
            s3obj = Key(s3bucket, dirobj)
            logger.debug("Read 1 byte of object %s" % dirobj)
            s3obj.read(size=1)
    return s3obj.metadata
def getChunkyKeyObj(self, chunk_size=512000):
    key_obj = Key(bucket)
    key_obj.key = self.row.file_key
    while True:
        chunk = key_obj.read(chunk_size)
        self.download_session_row.downloaded_size += len(chunk)
        self.download_session_row.save()
        if len(chunk) == 0:
            break
        yield chunk
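# A generator like getChunkyKeyObj() is typically handed to a streaming HTTP
# response so the object is never fully buffered in memory. Minimal sketch
# assuming Django; the view name and filename are hypothetical.
from django.http import StreamingHttpResponse

def download_view(request, download):
    # `download` is assumed to be an object exposing getChunkyKeyObj()
    response = StreamingHttpResponse(download.getChunkyKeyObj(),
                                     content_type="application/octet-stream")
    response["Content-Disposition"] = "attachment; filename=download.bin"
    return response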
def get_index(prefix):
    """
    :param prefix: str
        Prefix to S3 bucket
    :return: Uncompressed warc index
    :rtype: str
    """
    botokey = Key(DATASET, prefix + 'warc.paths.gz')
    return gzip.GzipFile(fileobj=StringIO(botokey.read())).read()
def get_index(self, prefix):
    """
    :param prefix: str
        Prefix to S3 bucket
    :return: Uncompressed warc index, one path per entry
    :rtype: list of str
    """
    crawl = self.select_crawl(prefix)
    botokey = Key(self.bucket, crawl + 'warc.paths.gz')
    return [i.strip() for i in GzipFile(fileobj=BytesIO(botokey.read()))]
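# Hypothetical usage of get_index(): list the WARC paths for one crawl and
# read the first kilobyte of the first WARC file from the same bucket.
# `indexer` and the prefix value are assumptions, not defined above.
paths = indexer.get_index('2015-27')
first_warc = Key(indexer.bucket, paths[0])
sample = first_warc.read(1024)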
def wait_for(self, bucket, key, timeout, start=None):
    if start is None:
        start = datetime.utcnow()
    log.info("Looking for key with last_modified greater than %s", start)

    for _ in hp.until(timeout=timeout, step=5):
        try:
            bucket_obj = self.get_bucket(bucket)
        except BadS3Bucket as error:
            log.error(error)
            continue

        if key == '/':
            log.info("The bucket exists! and that is all we are looking for")
            return

        k = Key(bucket_obj)
        k.key = key
        try:
            k.read()
        except boto.exception.S3ResponseError as error:
            if error.status == 404:
                log.info("Key doesn't exist yet\tbucket=%s\tkey=%s", bucket_obj.name, key)
                continue
            else:
                log.error(error)
                continue

        last_modified = k.last_modified
        log.info("Found key in the bucket\tbucket=%s\tkey=%s\tlast_modified=%s", bucket_obj.name, key, last_modified)
        date = datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S GMT")
        if date > start:
            log.info("Found key and it's newer than our start time!")
            return
        else:
            log.info("Found key but it's older than our start time, hasn't been updated yet")

    raise BespinError("Couldn't find the s3 key with a newer last modified")
def generate_diff(self, original_file_info, latest_file_info):
    # self.set_file_contents("DIFF CONTENT")
    k1 = Key(FileInfo.objects.s3_bucket)
    k1.key = original_file_info.object_key + "/" + original_file_info.sha1
    k2 = Key(FileInfo.objects.s3_bucket)
    k2.key = latest_file_info.object_key + "/" + latest_file_info.sha1

    # Read the 100-byte SQLite headers; files with different page sizes
    # cannot be diffed page-by-page.
    h1 = k1.read(100)
    h2 = k2.read(100)
    f1_page_size = int(h1[16:18].encode("hex"), 16)
    f2_page_size = int(h2[16:18].encode("hex"), 16)
    if f1_page_size != f2_page_size:
        return False
    f1_n_pages = int(h1[28:32].encode("hex"), 16)
    f2_n_pages = int(h2[28:32].encode("hex"), 16)

    f1_page1 = h1 + k1.read(f1_page_size - 100)
    f2_page1 = h2 + k2.read(f2_page_size - 100)

    # Diff layout: 16-byte magic, the two raw sha1 digests, the page size,
    # then one record per changed page: page number, an op byte
    # ("!" replace, "+" add, "-" drop), and the new page bytes where needed.
    diff_file_contents = "SQLITE DIFF FILE"
    diff_file_contents += original_file_info.sha1.decode("hex")
    diff_file_contents += latest_file_info.sha1.decode("hex")
    diff_file_contents += struct.pack(">L", f1_page_size)

    if f1_page1 != f2_page1:
        diff_file_contents += struct.pack(">L", 1)
        diff_file_contents += "!"
        diff_file_contents += f2_page1

    for i in range(2, max(f1_n_pages, f2_n_pages) + 1):
        f1_page = None
        f2_page = None
        if i <= f1_n_pages:
            f1_page = k1.read(f1_page_size)
        if i <= f2_n_pages:
            f2_page = k2.read(f2_page_size)
        if f1_page == f2_page:
            continue
        if f1_page is None:
            # Page exists only in the newer file.
            diff_file_contents += struct.pack(">L", i)
            diff_file_contents += "+"
            diff_file_contents += f2_page
            continue
        if f2_page is None:
            # Page was removed in the newer file.
            diff_file_contents += struct.pack(">L", i)
            diff_file_contents += "-"
            continue
        # Page exists in both files but differs.
        diff_file_contents += struct.pack(">L", i)
        diff_file_contents += "!"
        diff_file_contents += f2_page

    self.set_file_contents(diff_file_contents)
    return True
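# Sketch of a consumer for the diff layout written above: magic string, two
# raw sha1 digests, the page size, then (page number, op, page bytes) records.
# This is illustrative only and follows the same Python 2 string handling as
# generate_diff(); the function and argument names are assumptions.
import struct

def apply_sqlite_diff(original_bytes, diff_bytes):
    assert diff_bytes[:16] == "SQLITE DIFF FILE"
    pos = 16 + 20 + 20                                    # skip magic and both sha1 digests
    page_size = struct.unpack(">L", diff_bytes[pos:pos + 4])[0]
    pos += 4

    pages = [original_bytes[i:i + page_size]
             for i in range(0, len(original_bytes), page_size)]
    while pos < len(diff_bytes):
        page_no = struct.unpack(">L", diff_bytes[pos:pos + 4])[0]
        op = diff_bytes[pos + 4]
        pos += 5
        if op == "-":                                     # page dropped in the newer file
            pages = pages[:page_no - 1]
        else:                                             # "!" replace or "+" append
            page = diff_bytes[pos:pos + page_size]
            pos += page_size
            if op == "!":
                pages[page_no - 1] = page
            else:
                pages.append(page)
    return "".join(pages)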
def s3_download(output_file_path, s3_bucket, s3_access_key_id, s3_secret_key, s3_file_key=None, prefix=None):
    """
    Downloads the file matching the provided key, in the provided bucket,
    from Amazon S3.

    If s3_file_key is None, it downloads the last file with the .tbz
    extension from the provided bucket, filtering by prefix if one is given.
    """
    bucket = s3_connect(s3_bucket, s3_access_key_id, s3_secret_key)
    if not s3_file_key:
        keys = s3_list(s3_bucket, s3_access_key_id, s3_secret_key, prefix)
        if not keys:
            raise Exception("Target S3 bucket is empty")
        s3_file_key = keys[-1]
    key = Key(bucket, s3_file_key)
    # Write in binary mode: the downloaded archive is not text.
    with open(output_file_path, "wb") as f:
        f.write(key.read())
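# Hypothetical invocation of s3_download(); the bucket name, credentials and
# prefix are placeholders.
s3_download(
    output_file_path="/tmp/latest-backup.tbz",
    s3_bucket="my-backups",
    s3_access_key_id="AKIA...",
    s3_secret_key="...",
    prefix="db/",
)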
def _do_retrieve(bucket_name, key_path, number_retries=DEFAULT_S3_RETRIES):
    """ Run-logic to do a data retrieval for a file in an S3 bucket."""
    key = Key(_get_bucket(bucket_name), key_path)
    try:
        return key.read()
    except IncompleteRead:
        if number_retries > 0:
            print "s3_retrieve failed with incomplete read, retrying on %s" % key_path
            return _do_retrieve(bucket_name, key_path, number_retries=number_retries - 1)
        raise
    except SSLError as e:
        # Retry read timeouts as well, with the same bounded retry count.
        if number_retries > 0 and e.message == 'The read operation timed out':
            print "s3_retrieve failed with timeout, retrying on %s" % key_path
            return _do_retrieve(bucket_name, key_path, number_retries=number_retries - 1)
        raise
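# Hypothetical call; the bucket and key names are placeholders.
payload = _do_retrieve("my-data-bucket", "exports/2016/snapshot.json")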
def register_page():
    error = ''
    file_contents = ''
    try:
        username = request.form['user']
        if re.match(userreg, username):
            bucket = conn.get_bucket(config.buck_name, validate=True)
            k = Key(bucket)
            k.key = 'auth_user.txt'
            k.open()
            file_contents = k.read()
            file_contents += username
            key = bucket.new_key('auth_users.txt')
            key.set_contents_from_string(file_contents)
            key.set_acl('public-read')
            return 'Successfully Registered. Login.'
        else:
            return 'Username: 3-15 characters consisting of letters or digits, with optional - or _.'
    except Exception as e:
        return str(e)
class S3File(File):
    def __init__(self, bucket, name):
        self._bucket = bucket
        self._name = name
        self._key = Key(bucket=bucket, name=name.encode('utf-8'))
        self._pos = 0
        self._open = False
        self._fake_open = False
        self._mode = 'r'

    @property
    def name(self):
        return self._name

    @property
    def mode(self):
        return self._key.mode

    @property
    def closed(self):
        return self._fake_open

    def size():
        doc = "The size property."

        def fget(self):
            raise NotImplementedError("S3File doesn't implement size and __len__")

        def fset(self, value):
            raise NotImplementedError("S3File doesn't implement size and __len__")

        return locals()
    size = property(**size())

    def open(self, mode="r"):
        self.close()
        self._mode = (mode or 'r')[0]
        self._fake_open = True

    def close(self):
        if self._open:
            self._pos = 0
            self._key.close()
        self._fake_open = False

    def seek(self, position):
        if position != 0:
            raise NotImplementedError("S3File doesn't implement seek at positions other than 0")
        if self._pos != 0:
            # TODO: This is a bit flakey I imagine
            self._key.resp = None
            self._pos = 0

    def tell(self):
        return self._pos

    def read(self, num_bytes=None):
        if not self._open:
            # The underlying key is opened lazily on the first read.
            self._key.open(self._mode)
            self._open = True
        data = self._key.read(num_bytes)
        self._pos += len(data)
        return data

    def write(self, content):
        raise NotImplementedError("S3File doesn't implement write")

    def flush(self):
        raise NotImplementedError("S3File doesn't implement flush")
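# Hypothetical usage of the S3File wrapper above; the bucket object and key
# name are assumptions.
f = S3File(bucket, u"reports/2016-01.csv")
f.open("r")
first_kb = f.read(1024)   # opens the underlying key and reads the first kilobyte
rest = f.read()           # read the remainder of the object
f.close()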
def valid_segments(self):
    kfile = Key(self.bucket, '/common-crawl/parse-output/valid_segments.txt')
    return [i.strip() for i in kfile.read().splitlines()]
gzipped = GzipFile(None, 'rb', fileobj=k)
reader = csv.reader(io.TextIOWrapper(gzipped, newline="", encoding="utf-8"), delimiter='^')
data = []
for id, line in enumerate(reader):
    data.append(line)
    pprint(line)
    if id > 10:
        break
ptitle = kname
# headers = ['Col#%d' % i for i in range(len(data[0]))]
# print(get_formatted(ptitle, data, headers, join=True))

if 0:
    buffer = io.BytesIO(k.read())
    print(buffer)
    z = zipfile.ZipFile(buffer)
    foo2 = z.open(z.infolist()[0])
    print(sys.getsizeof(foo2))
    line_counter = 0
    for _ in foo2:
        line_counter += 1
    print(line_counter)
    z.close()

if 0:
    # print k.read(10)
    gz_file = gzip.GzipFile(fileobj=k, mode='rb')
    reader = csv.ListReader(io.TextIOWrapper(gz_file, newline="",
s3 = boto3.client('s3', region_name=cred['AWS_DEFAULT_REGION'])
s3.put_object(Bucket=datasetF, Key=self.name, Body=self.value)

conn = boto.connect_s3(cred['AWS_DEFAULT_REGION'])
bucket = conn.get_bucket(datasetF)

for line in smart_open.smart_open('s3://mybucket/mykey.txt'):
    print line

bucket = conn.get_bucket(baseUrl + folderUrl + fileUrl)
k = Key(bucket)
k.key = 'filename.txt'
k.open()
k.read(10)

peopleDF = spark.read.json("examples/src/main/resources/people.json")

# DataFrames can be saved as Parquet files, maintaining the schema information.
peopleDF.write.parquet("people.parquet")

# Read in the Parquet file created above.
# Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
parquetFile = spark.read.parquet("people.parquet")

# Parquet files can also be used to create a temporary view and then used in SQL statements.
parquetFile.createOrReplaceTempView("parquetFile")
teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
try:
    conn = S3Connection(login[1], login[2])
    mybucket = conn.get_bucket('ec2dev')
    print "Connected to S3"
except:
    print "Unable to connect to S3"
    exit()

try:
    for j in mybucket.list():
        if j.name == 'login.txt':
            print j.name
            k = Key(mybucket)
            k.key = j.name
            k.open()
            file_1 = k.read()
            print "Successfully opened login.txt"
except:
    print "Unable to open File on S3"
    exit()

login = file_1.split()

try:
    conn = pymysql.connect(host=login[0], user=login[1], password=login[2], db=login[4])
    print "Connected successfully to RDS"
except:
    print "Unable to connect to RDS"
    exit()

cur = conn.cursor()
def read(self):
    k = Key(FileInfo.objects.s3_bucket)
    k.key = self.object_key + "/" + self.sha1
    return k.read()
class S3FunctionalityTest():
    '''
    Functionality Test of an S3 Bucket
    Only works with Keystone Auth URL v3
    '''
    options = dict()

    def __init__(self, options):
        # First we try to list the ec2 credentials
        try:
            res = json.loads(
                subprocess.check_output([
                    "openstack",
                    "--os-auth-url", options.auth_url,
                    "--os-username", options.username,
                    "--os-password", options.password,
                    "--os-project-name", options.tenant,
                    "--os-project-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-user-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-identity-api-version", "3",
                    "ec2", "credentials", "list", "-f", "json"
                ]))
            res[0]['Access']
        # If they don't exist we create some
        except:
            try:
                subprocess.check_output([
                    "openstack",
                    "--os-auth-url", options.auth_url,
                    "--os-username", options.username,
                    "--os-password", options.password,
                    "--os-project-name", options.tenant,
                    "--os-project-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-user-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-identity-api-version", "3",
                    "ec2", "credentials", "create"
                ], stderr=subprocess.STDOUT)
            except:
                print "Could not create EC2 credentials"
                sys.exit(NAGIOS_STATE_UNKNOWN)
            res = json.loads(
                subprocess.check_output([
                    "openstack",
                    "--os-auth-url", options.auth_url,
                    "--os-username", options.username,
                    "--os-password", options.password,
                    "--os-project-name", options.tenant,
                    "--os-project-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-user-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-identity-api-version", "3",
                    "ec2", "credentials", "list", "-f", "json"
                ]))
        if LOCAL_DEBUG:
            print res

        _access_key = res[0]['Access']
        _secret_key = res[0]['Secret']
        _s3_host = options.s3_host

        self.conn = S3Connection(aws_access_key_id=_access_key,
                                 aws_secret_access_key=_secret_key,
                                 host=_s3_host)
        try:
            self.b = self.conn.get_bucket(DEFAULT_BUCKET_NAME)
        except:
            self.b = self.conn.create_bucket(DEFAULT_BUCKET_NAME)
        self.k = Key(self.b)
        self.k.key = 'nagiostest3'

    def s3_create_bucket(self):
        """ create a bucket, does not fail if it exists """
        self.conn.create_bucket(DEFAULT_BUCKET_NAME)

    def s3_store_data(self):
        """ store a 3MB object in the bucket """
        USERHOMEDIR = os.path.expanduser('~')
        TESTFILEPATH = "%s/3MBFILE" % USERHOMEDIR
        if not os.path.exists(TESTFILEPATH):
            with open(TESTFILEPATH, "wb") as out:
                out.truncate(1024 * 1024 * 3)
        self.k.set_contents_from_filename(TESTFILEPATH)

    def s3_read_data(self):
        """ read object from bucket """
        self.k.open()
        self.k.read()

    def s3_delete_data(self):
        """ delete object from bucket """
        self.k.delete()

    def execute(self):
        results = dict()
        try:
            self.s3_create_bucket()
            self.s3_store_data()
            self.s3_read_data()
            self.s3_delete_data()
        except:
            raise
        return results