Пример #1
0
    def test_read_gunzip_file(self):
        """Stream a gzipped S3 object and check its first decompressed line.

        The body returned by ``Object.get()`` for ``TestBotoStream.url`` is
        wrapped in ``BotoStreamBody``, buffered, gunzipped and decoded as
        text; only the first line is read and compared.
        """
        bucket, myfile = s3grep._parse_url(TestBotoStream.url)
        resource = boto3.resource('s3')
        obj3 = resource.Object(bucket, myfile)
        datadict = obj3.get()
        raw = boto_stream.BotoStreamBody(datadict['Body'])

        # Close every layer deterministically instead of relying on garbage
        # collection. GzipFile does not close a fileobj it was handed, so the
        # buffered reader gets its own context manager.
        with io.BufferedReader(raw) as buffr, \
                gzip.GzipFile(fileobj=buffr, mode='rb') as gunzipped, \
                io.TextIOWrapper(gunzipped) as reader:
            # check the first line
            self.assertEqual(next(reader),
                             "common-crawl/crawl-data/CC-MAIN-2015-40/segments/"
                             "1443736672328.14/\n")
Пример #2
0
    def test_grep_a_file(self):
        """Run ``_grep_a_file`` on ``TestS3Grep.url`` and check its output.

        The text written to the in-memory output must equal the expected
        string exactly.
        """
        bucket_name, object_key = s3grep._parse_url(TestS3Grep.url)
        sink = io.StringIO()

        # Expected text: what looks like the object key, a colon, then the
        # line matched by the regex.
        expected = ("common-crawl/crawl-data/CC-MAIN-2015-40/"
                    "segment.paths.gz:"
                    "common-crawl/crawl-data/CC-MAIN-2015-40/segments/"
                    "1443737929054.69/\n")

        s3grep._grep_a_file(
            bucketstr=bucket_name,
            key=object_key,
            regex=r'.*1443737929054.*',
            output=sink,
        )

        self.assertEqual(sink.getvalue(), expected)
Пример #3
0
    def test_read_full_binary_file(self):
        """Check that ``BotoStreamBody`` yields the same bytes as a download.

        The object at ``TestBotoStream.url`` is read twice: once through a
        ``BufferedReader`` over ``BotoStreamBody``, and once via boto3's
        ``download_file`` into a temporary location. Both byte strings must
        be equal.
        """
        import os  # local import: only needed to build the download path

        bucket, myfile = s3grep._parse_url(TestBotoStream.url)
        resource = boto3.resource('s3')
        obj = resource.Object(bucket, myfile)

        datadict = obj.get()
        botostream = boto_stream.BotoStreamBody(body=datadict['Body'])

        # Download into a temporary directory rather than an open
        # NamedTemporaryFile: a named temporary file that is still open
        # cannot be reopened by name on Windows. The reader is closed on
        # exit so the stream is not left to garbage collection.
        with io.BufferedReader(botostream) as reader, \
                tempfile.TemporaryDirectory() as tmpdir:
            # use the regular boto3 api (same Object handle; a second one
            # for the same bucket/key is not needed)
            download_path = os.path.join(tmpdir, 'download')
            obj.download_file(download_path)

            with open(download_path, 'rb') as rtfile:
                self.assertEqual(rtfile.read(), reader.read())
Пример #4
0
    def test_parse_url(self):
        """``_parse_url`` returns the bucket and file parts of an s3:// URL."""
        parsed = s3grep._parse_url("s3://mybucket/myfile")
        bucket_name, file_name = parsed

        self.assertEqual(bucket_name, "mybucket")
        self.assertEqual(file_name, "myfile")