Example #1
 def post(self, *args):
     # Add entry into bucket and flag as multipart upload
     if self.bucket_name and self.object_name:
         bucket_name = self.bucket_name
         object_name = self.object_name
     else:
         bucket_name,object_name = args
     
     if bucket_name not in self._get_bucket_names():
         self._error(code=404,s3code='NSB')
         return
     
     original_name = urllib.unquote(object_name)
     bucket_object = Binary(self.request.body)
     object_size = len(bucket_object)
     object_md5 = self._object_md5(bucket_object)
     
     if self.uploadId:
         # We have a multipart upload, so iterate over the parts to compute the md5 hash and the total size.
         # This is the final call made after the multipart upload, identified by the uploadId.
         mupmd5 = hashlib.md5()
         mupsize = 0
         for mup in self.application.S3[bucket_name].find({'object_name':object_name}):
             mupmd5.update(mup['object'])
             mupsize += mup['size']
             
         self.application.S3[bucket_name].insert_one({
             'object_name': object_name,
             'object': bucket_object,
             'multipart': True,
             'md5': mupmd5.hexdigest(),
             'size': mupsize,
             'added': datetime.datetime.utcnow(),
             'updated': datetime.datetime.utcnow(),
         })
     
     self.render_xml({"InitiateMultipartUploadResult": {
         "Bucket": bucket_name,
         "Prefix": self.prefix,
         "Key":object_name,
         "UploadId":object_name
     }})
Example #2
 def test_binary(self):
     a_string = "hello world"
     a_binary = Binary(b("hello world"))
     self.assertTrue(a_binary.startswith(b("hello")))
     self.assertTrue(a_binary.endswith(b("world")))
     self.assertTrue(isinstance(a_binary, Binary))
     self.assertFalse(isinstance(a_string, Binary))
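
A minimal companion sketch (not from the original test) showing how such a Binary value round-trips through BSON encoding, assuming PyMongo's bson package (bson.encode/bson.decode are available in PyMongo 3.9+):

import bson
from bson.binary import Binary

payload = Binary(b"hello world", subtype=0)   # subtype 0 is generic binary data
raw = bson.encode({"payload": payload})       # serialize a document to BSON bytes
decoded = bson.decode(raw)                    # deserialize back into a dict
assert decoded["payload"] == b"hello world"   # Binary is a bytes subclass, so it compares equal
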
Example #3
    def put(self, *args):
        
        if self.bucket_name and self.object_name:
            bucket_name = self.bucket_name
            object_name = self.object_name
        else:
            bucket_name,object_name = args
            
        original_name = urllib.unquote(object_name)

        if bucket_name not in self._get_bucket_names():
            self._error(code=404,s3code='NSB')
            return
        
        # Insert the object, compute the md5 and size of the stored object,
        # then update the document and return.

        # If the object already exists, replace its contents, refresh the
        # updated timestamp and return.
        existance = self.application.S3[bucket_name].find({"object_name":original_name})

        if existance.count() > 0 and self.partNumber is None:
            existance_id = existance.next()['_id']
            update_object = Binary(self.request.body)
            object_size = len(update_object)
            object_md5 = self._object_md5(update_object)
            self.application.S3[bucket_name].update(
                {"_id": existance_id},
                {'$set': {'object': update_object,
                          'md5': object_md5,
                          'updated': datetime.datetime.utcnow(),
                          'size': object_size}})
            self.set_header('etag', '"%s"' % object_md5)
            self.finish()
            return
        
        if self.partNumber:
            tobeinserted = {'object_name':original_name,'object':Binary(self.request.body),'partNumber':self.partNumber}
        else:
            tobeinserted = {'object_name':original_name,'object':Binary(self.request.body)}
            
        inserted_object_id = self.application.S3[bucket_name].insert_one(tobeinserted).inserted_id
        inserted_object = self._get_bucket_object(bucket_name=bucket_name,_id=inserted_object_id)
        
        object_size = len(inserted_object['object'])
        object_md5 = self._object_md5(inserted_object['object'])
        self.application.S3[bucket_name].update(
            {'_id': inserted_object_id},
            {'$set': {'md5': object_md5,
                      'updated': datetime.datetime.utcnow(),
                      'added': datetime.datetime.utcnow(),
                      'size': object_size}})
        self.set_header('etag', '"%s"' % object_md5)
        
        
        self.finish()
Example #4
    def test_cursor(self):
        db = self.db

        db.drop_collection("test")
        docs = [{
            'foo': [1, 2]
        }, {
            'bar': {
                'hello': 'world'
            }
        }, {
            'code': Code("function x() { return 1; }")
        }, {
            'bin': Binary(b("\x00\x01\x02\x03\x04"))
        }, {
            'dbref': {
                '_ref': DBRef('simple', ObjectId('509b8db456c02c5ab7e63c34'))
            }
        }]

        db.test.insert(docs)
        reloaded_docs = json_util.loads(json_util.dumps(db.test.find()))
        for doc in docs:
            self.assertTrue(doc in reloaded_docs)
Example #5
 def forwards(self):
     phash = {}
     db = get_db()
     metrics = db.noc.ts.metrics
     bulk = metrics.initialize_unordered_bulk_op()
     n = 0
     for m in metrics.find({}).sort("name", 1):
         phash[m["name"]] = m["hash"]
         if "." in m["name"]:
             pn = ".".join(m["name"].split(".")[:-1])
             parent = phash[pn]
         else:
             parent = Binary("\x00" * 8)
         bulk.find({
             "_id": m["_id"]
         }).update({
             "$set": {
                 "local": m["name"].split(".")[-1],
                 "parent": parent
             }
         })
         n += 1
     if n:
         bulk.execute()
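
The initialize_unordered_bulk_op API used above is deprecated in newer PyMongo releases (and removed in PyMongo 4.x); a rough equivalent of the same migration using bulk_write with UpdateOne might look like the sketch below (get_db and the collection path come from the example, everything else is illustrative):

from pymongo import UpdateOne
from bson.binary import Binary

def forwards():
    phash = {}
    metrics = get_db().noc.ts.metrics
    ops = []
    for m in metrics.find({}).sort("name", 1):
        phash[m["name"]] = m["hash"]
        if "." in m["name"]:
            parent = phash[".".join(m["name"].split(".")[:-1])]
        else:
            parent = Binary(b"\x00" * 8)   # root marker: 8 zero bytes
        ops.append(UpdateOne({"_id": m["_id"]},
                             {"$set": {"local": m["name"].split(".")[-1],
                                       "parent": parent}}))
    if ops:
        metrics.bulk_write(ops, ordered=False)
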
Example #6
    def __flush_data(self, data):
        """Flush `data` to a chunk.
        """
        # Ensure the index, even if there's nothing to write, so
        # the filemd5 command always succeeds.
        self.__ensure_indexes()
        self._file['md5'].update(data)

        if not data:
            return
        assert (len(data) <= self.chunk_size)

        chunk = {
            "files_id": self._file["_id"],
            "n": self._chunk_number,
            "data": Binary(data)
        }

        try:
            self._chunks.insert_one(chunk)
        except DuplicateKeyError:
            self._raise_file_exists(self._file['_id'])
        self._chunk_number += 1
        self._position += len(data)
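
For context, a hypothetical helper (not part of GridFS itself) that reassembles a file from chunk documents shaped like the one written above:

def read_chunks(chunks_collection, files_id):
    """Concatenate the Binary payloads of a file's chunks in order."""
    data = bytearray()
    for chunk in chunks_collection.find({"files_id": files_id}).sort("n", 1):
        data.extend(chunk["data"])   # Binary is a bytes subclass
    return bytes(data)
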
Example #7
    def write(self, arctic_lib, version, symbol, item, previous_version):
        try:
            # If it's encodable, then ship it
            b = bson.BSON.encode({'data': item})
            if len(b) < _MAX_BSON_ENCODE:
                version['data'] = item
                return
        except InvalidDocument:
            pass

        # Pickle, chunk and store the data
        collection = arctic_lib.get_top_level_collection()
        # Try to pickle it. This is best effort
        version['blob'] = _MAGIC_CHUNKEDV2
        pickled = cPickle.dumps(item, protocol=cPickle.HIGHEST_PROTOCOL)

        data = compress_array([
            pickled[i * _CHUNK_SIZE:(i + 1) * _CHUNK_SIZE]
            for i in xrange(int(len(pickled) / _CHUNK_SIZE + 1))
        ])

        for seg, d in enumerate(data):
            segment = {'data': Binary(d)}
            segment['segment'] = seg
            seg += 1
            sha = checksum(symbol, segment)
            collection.update_one({
                'symbol': symbol,
                'sha': sha
            }, {
                '$set': segment,
                '$addToSet': {
                    'parent': version['_id']
                }
            },
                                  upsert=True)
Example #8
def binarize_image(image):
    from PIL import Image

    binary = None

    try:
        im = Image.open(image)
        thumb = im.copy()
        thumb.thumbnail((260, 260))
        image_buffer = StringIO()
        thumb.save(image_buffer, "JPEG")
        binary = Binary(image_buffer.getvalue(), BINARY_SUBTYPE)
    except IOError as e:
        logging.error("failed to binarize image: " + str(e))
        return None

    # if image is a file object, rewind it
    finally:
        try:
            image.seek(0)
        except AttributeError:
            pass

    return binary
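
A short usage sketch with a hypothetical collection and filename (BINARY_SUBTYPE is whatever the surrounding module defines):

from pymongo import MongoClient

images = MongoClient().app.thumbnails        # hypothetical database/collection

thumb = binarize_image("photo.jpg")
if thumb is not None:
    _id = images.insert_one({"name": "photo.jpg", "thumb": thumb}).inserted_id
    jpeg_bytes = bytes(images.find_one({"_id": _id})["thumb"])   # Binary -> raw JPEG bytes
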
Example #9
def test_read_with_base_version_id():
    self = create_autospec(PickleStore)
    version = {
        '_id': sentinel._id,
        'base_version_id': sentinel.base_version_id,
        'blob': '__chunked__'
    }
    coll = Mock()
    arctic_lib = Mock()
    coll.find.return_value = [{
        'data': Binary(compressHC(cPickle.dumps(object))),
        'symbol': 'sentinel.symbol',
        'segment': 1
    }]
    arctic_lib.get_top_level_collection.return_value = coll

    assert PickleStore.read(self, arctic_lib, version,
                            sentinel.symbol) == object
    assert coll.find.call_args_list == [
        call({
            'symbol': sentinel.symbol,
            'parent': sentinel.base_version_id
        })
    ]
Example #10
def upload_file():

    uploaded_file = request.files['file']
    app_path = "./static/uploads/"
    app_name = uploaded_file.filename

    if uploaded_file.filename != '':
        file_path = os.path.join(app_path, app_name)
        uploaded_file.save(file_path)

    format_status = check_format(uploaded_file, app_path, app_name, file_path)

    if format_status == True:
        fs, coll = create_connection_mongo_cloud()
        with open(file_path, "rb") as fp:
            encoded = Binary(fp.read())
        flink = fs.put(encoded, filename=app_name)
        coll.insert_one({"filename": app_name, "file": flink})

        os.remove(file_path)
        return json.dumps({'status': 'Zip uploaded successfully'}), 200
    else:
        remove_file_uploads(app_path, app_name, file_path)
        return format_status
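
fs.put returns the new file's ObjectId, which the example stores in the metadata collection; reading the archive back might look like this hedged sketch (create_connection_mongo_cloud is assumed to return the same fs/coll pair, and the filename is illustrative):

fs, coll = create_connection_mongo_cloud()
doc = coll.find_one({"filename": "example.zip"})
if doc is not None:
    zip_bytes = fs.get(doc["file"]).read()   # doc["file"] holds the GridFS file id

Note that gridfs.GridFS.put also accepts plain bytes or a file-like object, so wrapping the data in Binary before the put is not strictly required.
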
Example #11
    for entry in dbx.files_list_folder(sys.argv[2]).entries:
        try:
            meta = dbx.files_alpha_get_metadata(sys.argv[2] + entry.name)
        except db.exceptions.ApiError:
            print("not found !")
            meta = None
        names.append(entry.name)
        if meta and (meta.size <= 8 * 1024**6) and (entry.name.split('.')[-1] in {
                "pdf", "docx", "doc", "ppt", "pptx", "jpeg", "jpg", "png"
        }):
            try:
                m, r = dbx.files_download(sys.argv[2] + entry.name)
            except db.exceptions.ApiError:
                print("download failure !")
                continue
            try:
                filestore.update({"name": entry.name}, {
                    "$set": {
                        "name": entry.name,
                        "file": Binary(r.content),
                        "meta": str(m)
                    }
                },
                                 upsert=True)
            except pymongo.errors.OperationFailure:
                print("Update failed !")
    print(filestore.count())

    removefromdb()

    #time.sleep( 1 )
Example #12
    def append(self, symbol, item):
        """
        Appends data from item to symbol's data in the database

        Parameters
        ----------
        symbol: str
            the symbol for the given item in the DB
        item:
            the data to append
        """

        sym = self._get_symbol_info(symbol)
        if not sym:
            raise NoDataFoundException("Symbol does not exist. Cannot append")

        if isinstance(item, Series) and sym['type'] == 'df':
            raise Exception("cannot append a series to a dataframe")
        if isinstance(item, DataFrame) and sym['type'] == 'series':
            raise Exception("cannot append a dataframe to a series")

        records = []
        ranges = []
        dtype = None

        for start, end, record in self.chunker.to_chunks(
                item, sym['chunk_size']):
            '''
            if we have a multiindex there is a chance that part of the append
            will overlap an already written chunk, so we need to update
            where the date part of the index overlaps
            '''
            if item.index.nlevels > 1:
                df = self.read(symbol,
                               chunk_range=self.chunker.to_range(start, end))
                if not df.empty:
                    if df.equals(record):
                        continue
                    record = record.combine_first(df)
                    self.update(symbol, record)
                    sym = self._get_symbol_info(symbol)
                    continue
            r, dtype = serialize(record, string_max_len=self.STRING_MAX)
            records.append(r)
            ranges.append((start, end))

        if len(records) > 0:
            item = np.array([r for record in records
                             for r in record]).flatten()

            if sym.get('shape', [-1]) != [
                    -1,
            ] + list(item.shape)[1:]:
                raise UnhandledDtypeException()

            item = item.astype(dtype)

            if str(dtype) != sym['dtype']:
                raise Exception("Dtype mismatch - cannot append")

            data = item.tostring()
            sym['len'] += len(item)
            if len(item) > 0:
                sym['chunk_count'] += len(records)
                sym['append_count'] += len(records)
                sym['append_size'] += len(data)

            chunks = [r.tostring() for r in records]
            chunks = compress_array(chunks)

            for chunk, rng in zip(chunks, ranges):
                start = rng[0]
                end = rng[-1]

                segment = {'data': Binary(chunk)}
                segment['start'] = start
                segment['end'] = end
                self._collection.update_one(
                    {
                        'symbol': symbol,
                        'sha': checksum(symbol, segment)
                    }, {'$set': segment},
                    upsert=True)

            self._symbols.replace_one({'symbol': symbol}, sym)
Example #13
def _encode_uuid(name, value, dummy, opts):
    """Encode uuid.UUID."""
    uuid_representation = opts.uuid_representation
    binval = Binary.from_uuid(value, uuid_representation=uuid_representation)
    return _encode_binary(name, binval, dummy, opts)
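
A small sketch of the same conversion at the application level, assuming PyMongo 3.11+ where UuidRepresentation is available:

import uuid
from bson.binary import Binary, UuidRepresentation

u = uuid.uuid4()
binval = Binary.from_uuid(u, uuid_representation=UuidRepresentation.STANDARD)
assert binval.subtype == 4                                   # standard UUIDs use BSON subtype 4
assert binval.as_uuid(UuidRepresentation.STANDARD) == u      # and round-trip losslessly
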
Example #14
    @client_context.require_test_commands
    def run_scenario(self):
        self.run_scenario(scenario_def, test)

    return run_scenario


test_creator = TestCreator(create_test, TestSpec, SPEC_PATH)
test_creator.create_tests()

# Prose Tests
LOCAL_MASTER_KEY = base64.b64decode(
    b'Mng0NCt4ZHVUYUJCa1kxNkVyNUR1QURhZ2h2UzR2d2RrZzh0cFBwM3R6NmdWMDFBMUN3YkQ'
    b'5aXRRMkhGRGdQV09wOGVNYUMxT2k3NjZKelhaQmRCZGJkTXVyZG9uSjFk')

LOCAL_KEY_ID = Binary(base64.b64decode(b'LOCALAAAAAAAAAAAAAAAAA=='),
                      UUID_SUBTYPE)
AWS_KEY_ID = Binary(base64.b64decode(b'AWSAAAAAAAAAAAAAAAAAAA=='),
                    UUID_SUBTYPE)


def create_with_schema(coll, json_schema):
    """Create and return a Collection with a jsonSchema."""
    coll.with_options(write_concern=WriteConcern(w='majority')).drop()
    return coll.database.create_collection(
        coll.name, validator={'$jsonSchema': json_schema}, codec_options=OPTS)


def create_key_vault(vault, *data_keys):
    """Create the key vault collection with optional data keys."""
    vault = vault.with_options(write_concern=WriteConcern(w='majority'),
                               codec_options=OPTS)
Example #15
 def _encode_task(self, task):
     for prefix in self.ENCODE_FIELDS_PREFIX:
         for k in list(task.keys()):
             if k.startswith(prefix):
                 task[k] = Binary(pickle.dumps(task[k]))
     return task
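
A hypothetical decoding counterpart (not shown in the original class) would simply unpickle the Binary-wrapped fields again:

import pickle
from bson.binary import Binary

def _decode_task(task, prefixes):
    """Reverse of _encode_task: unpickle any Binary-wrapped fields."""
    for prefix in prefixes:
        for k in list(task.keys()):
            if k.startswith(prefix) and isinstance(task[k], Binary):
                task[k] = pickle.loads(task[k])
    return task
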
Example #16
def _authenticate_scram(credentials, sock_info, mechanism):
    """Authenticate using SCRAM."""

    username = credentials.username
    if mechanism == 'SCRAM-SHA-256':
        digest = "sha256"
        digestmod = hashlib.sha256
        data = saslprep(credentials.password).encode("utf-8")
    else:
        digest = "sha1"
        digestmod = hashlib.sha1
        data = _password_digest(username, credentials.password).encode("utf-8")
    source = credentials.source
    cache = credentials.cache

    # Make local
    _hmac = hmac.HMAC

    user = username.encode("utf-8").replace(b"=", b"=3D").replace(b",", b"=2C")
    nonce = standard_b64encode(os.urandom(32))
    first_bare = b"n=" + user + b",r=" + nonce

    cmd = SON([('saslStart', 1),
               ('mechanism', mechanism),
               ('payload', Binary(b"n,," + first_bare)),
               ('autoAuthorize', 1)])
    res = sock_info.command(source, cmd)

    server_first = res['payload']
    parsed = _parse_scram_response(server_first)
    iterations = int(parsed[b'i'])
    if iterations < 4096:
        raise OperationFailure("Server returned an invalid iteration count.")
    salt = parsed[b's']
    rnonce = parsed[b'r']
    if not rnonce.startswith(nonce):
        raise OperationFailure("Server returned an invalid nonce.")

    without_proof = b"c=biws,r=" + rnonce
    if cache.data:
        client_key, server_key, csalt, citerations = cache.data
    else:
        client_key, server_key, csalt, citerations = None, None, None, None

    # Salt and / or iterations could change for a number of different
    # reasons. Either changing invalidates the cache.
    if not client_key or salt != csalt or iterations != citerations:
        salted_pass = _hi(
            digest, data, standard_b64decode(salt), iterations)
        client_key = _hmac(salted_pass, b"Client Key", digestmod).digest()
        server_key = _hmac(salted_pass, b"Server Key", digestmod).digest()
        cache.data = (client_key, server_key, salt, iterations)
    stored_key = digestmod(client_key).digest()
    auth_msg = b",".join((first_bare, server_first, without_proof))
    client_sig = _hmac(stored_key, auth_msg, digestmod).digest()
    client_proof = b"p=" + standard_b64encode(_xor(client_key, client_sig))
    client_final = b",".join((without_proof, client_proof))

    server_sig = standard_b64encode(
        _hmac(server_key, auth_msg, digestmod).digest())

    cmd = SON([('saslContinue', 1),
               ('conversationId', res['conversationId']),
               ('payload', Binary(client_final))])
    res = sock_info.command(source, cmd)

    parsed = _parse_scram_response(res['payload'])
    if not compare_digest(parsed[b'v'], server_sig):
        raise OperationFailure("Server returned an invalid signature.")

    # Depending on how it's configured, Cyrus SASL (which the server uses)
    # requires a third empty challenge.
    if not res['done']:
        cmd = SON([('saslContinue', 1),
                   ('conversationId', res['conversationId']),
                   ('payload', Binary(b''))])
        res = sock_info.command(source, cmd)
        if not res['done']:
            raise OperationFailure('SASL conversation failed to complete.')
Example #17
def default(obj, json_options=DEFAULT_JSON_OPTIONS):
    # We preserve key order when rendering SON, DBRef, etc. as JSON by
    # returning a SON for those types instead of a dict.
    if isinstance(obj, ObjectId):
        return {"$oid": str(obj)}
    if isinstance(obj, DBRef):
        return _json_convert(obj.as_doc(), json_options=json_options)
    if isinstance(obj, datetime.datetime):
        if (json_options.datetime_representation ==
                DatetimeRepresentation.ISO8601):
            if not obj.tzinfo:
                obj = obj.replace(tzinfo=utc)
            if obj >= EPOCH_AWARE:
                off = obj.tzinfo.utcoffset(obj)
                if (off.days, off.seconds, off.microseconds) == (0, 0, 0):
                    tz_string = 'Z'
                else:
                    tz_string = obj.strftime('%z')
                millis = int(obj.microsecond / 1000)
                fracsecs = ".%03d" % (millis, ) if millis else ""
                return {
                    "$date":
                    "%s%s%s" %
                    (obj.strftime("%Y-%m-%dT%H:%M:%S"), fracsecs, tz_string)
                }

        millis = bson._datetime_to_millis(obj)
        if (json_options.datetime_representation ==
                DatetimeRepresentation.LEGACY):
            return {"$date": millis}
        return {"$date": {"$numberLong": str(millis)}}
    if json_options.strict_number_long and isinstance(obj, Int64):
        return {"$numberLong": str(obj)}
    if isinstance(obj, (RE_TYPE, Regex)):
        flags = ""
        if obj.flags & re.IGNORECASE:
            flags += "i"
        if obj.flags & re.LOCALE:
            flags += "l"
        if obj.flags & re.MULTILINE:
            flags += "m"
        if obj.flags & re.DOTALL:
            flags += "s"
        if obj.flags & re.UNICODE:
            flags += "u"
        if obj.flags & re.VERBOSE:
            flags += "x"
        if isinstance(obj.pattern, text_type):
            pattern = obj.pattern
        else:
            pattern = obj.pattern.decode('utf-8')
        if json_options.json_mode == JSONMode.LEGACY:
            return SON([("$regex", pattern), ("$options", flags)])
        return {
            '$regularExpression': SON([("pattern", pattern),
                                       ("options", flags)])
        }
    if isinstance(obj, MinKey):
        return {"$minKey": 1}
    if isinstance(obj, MaxKey):
        return {"$maxKey": 1}
    if isinstance(obj, Timestamp):
        return {"$timestamp": SON([("t", obj.time), ("i", obj.inc)])}
    if isinstance(obj, Code):
        if obj.scope is None:
            return {'$code': str(obj)}
        return SON([('$code', str(obj)),
                    ('$scope', _json_convert(obj.scope, json_options))])
    if isinstance(obj, Binary):
        return _encode_binary(obj, obj.subtype, json_options)
    if PY3 and isinstance(obj, bytes):
        return _encode_binary(obj, 0, json_options)
    if isinstance(obj, uuid.UUID):
        if json_options.strict_uuid:
            binval = Binary.from_uuid(
                obj, uuid_representation=json_options.uuid_representation)
            return _encode_binary(binval, binval.subtype, json_options)
        else:
            return {"$uuid": obj.hex}
    if isinstance(obj, Decimal128):
        return {"$numberDecimal": str(obj)}
    if isinstance(obj, bool):
        return obj
    if (json_options.json_mode == JSONMode.CANONICAL
            and isinstance(obj, integer_types)):
        if -2**31 <= obj < 2**31:
            return {'$numberInt': text_type(obj)}
        return {'$numberLong': text_type(obj)}
    if json_options.json_mode != JSONMode.LEGACY and isinstance(obj, float):
        if math.isnan(obj):
            return {'$numberDouble': 'NaN'}
        elif math.isinf(obj):
            representation = 'Infinity' if obj > 0 else '-Infinity'
            return {'$numberDouble': representation}
        elif json_options.json_mode == JSONMode.CANONICAL:
            # repr() will return the shortest string guaranteed to produce the
            # original value, when float() is called on it. str produces a
            # shorter string in Python 2.
            return {'$numberDouble': text_type(repr(obj))}
    raise TypeError("%r is not JSON serializable" % obj)
Example #18
    def run_operation(self, sessions, collection, operation):
        original_collection = collection
        name = camel_to_snake(operation['name'])
        if name == 'run_command':
            name = 'command'
        elif name == 'download_by_name':
            name = 'open_download_stream_by_name'
        elif name == 'download':
            name = 'open_download_stream'

        def parse_options(opts):
            if 'readPreference' in opts:
                opts['read_preference'] = parse_read_preference(
                    opts.pop('readPreference'))

            if 'writeConcern' in opts:
                opts['write_concern'] = WriteConcern(
                    **dict(opts.pop('writeConcern')))

            if 'readConcern' in opts:
                opts['read_concern'] = ReadConcern(
                    **dict(opts.pop('readConcern')))
            return opts

        database = collection.database
        collection = database.get_collection(collection.name)
        if 'collectionOptions' in operation:
            collection = collection.with_options(
                **dict(parse_options(operation['collectionOptions'])))

        object_name = self.get_object_name(operation)
        if object_name == 'gridfsbucket':
            # Only create the GridFSBucket when we need it (for the gridfs
            # retryable reads tests).
            obj = GridFSBucket(
                database, bucket_name=collection.name,
                disable_md5=True)
        else:
            objects = {
                'client': database.client,
                'database': database,
                'collection': collection,
                'testRunner': self
            }
            objects.update(sessions)
            obj = objects[object_name]

        # Combine arguments with options and handle special cases.
        arguments = operation.get('arguments', {})
        arguments.update(arguments.pop("options", {}))
        parse_options(arguments)

        cmd = getattr(obj, name)

        for arg_name in list(arguments):
            c2s = camel_to_snake(arg_name)
            # PyMongo accepts sort as list of tuples.
            if arg_name == "sort":
                sort_dict = arguments[arg_name]
                arguments[arg_name] = list(iteritems(sort_dict))
            # Named "key" instead not fieldName.
            if arg_name == "fieldName":
                arguments["key"] = arguments.pop(arg_name)
            # Aggregate uses "batchSize", while find uses batch_size.
            elif arg_name == "batchSize" and name == "aggregate":
                continue
            # Requires boolean returnDocument.
            elif arg_name == "returnDocument":
                arguments[c2s] = arguments.pop(arg_name) == "After"
            elif c2s == "requests":
                # Parse each request into a bulk write model.
                requests = []
                for request in arguments["requests"]:
                    bulk_model = camel_to_upper_camel(request["name"])
                    bulk_class = getattr(operations, bulk_model)
                    bulk_arguments = camel_to_snake_args(request["arguments"])
                    requests.append(bulk_class(**dict(bulk_arguments)))
                arguments["requests"] = requests
            elif arg_name == "session":
                arguments['session'] = sessions[arguments['session']]
            elif name == 'command' and arg_name == 'command':
                # Ensure the first key is the command name.
                ordered_command = SON([(operation['command_name'], 1)])
                ordered_command.update(arguments['command'])
                arguments['command'] = ordered_command
            elif name == 'open_download_stream' and arg_name == 'id':
                arguments['file_id'] = arguments.pop(arg_name)
            elif name == 'with_transaction' and arg_name == 'callback':
                callback_ops = arguments[arg_name]['operations']
                arguments['callback'] = lambda _: self.run_operations(
                    sessions, original_collection, copy.deepcopy(callback_ops),
                    in_with_transaction=True)
            else:
                arguments[c2s] = arguments.pop(arg_name)

        result = cmd(**dict(arguments))

        if name == "aggregate":
            if arguments["pipeline"] and "$out" in arguments["pipeline"][-1]:
                # Read from the primary to ensure causal consistency.
                out = collection.database.get_collection(
                    arguments["pipeline"][-1]["$out"],
                    read_preference=ReadPreference.PRIMARY)
                return out.find()
        if name == "map_reduce":
            if isinstance(result, dict) and 'results' in result:
                return result['results']
        if 'download' in name:
            result = Binary(result.read())

        if isinstance(result, Cursor) or isinstance(result, CommandCursor):
            return list(result)

        return result
Example #19
def array_to_binary(x):
    '''Numpy array to bson binary'''
    return Binary(pickle.dumps(x))
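
The inverse is a single pickle.loads call, since Binary behaves like bytes; a quick round-trip check (numpy assumed, as in the original helper — and remember that pickled payloads should only ever be loaded from trusted data):

import pickle
import numpy as np

def binary_to_array(b):
    '''Bson binary back to a numpy array.'''
    return pickle.loads(b)

arr = np.arange(5)
assert np.array_equal(binary_to_array(array_to_binary(arr)), arr)
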
Example #20
    def post(self):
        from app.base.document import ImageDocument

        form = NodeAvatarSetForm(self.request.arguments)
        if not form.validate() or 'avatar' not in self.request.files:
            raise HTTPError(404)

        node_id = form.node_id.data
        x = form.x.data
        y = form.y.data
        w = form.w.data
        h = form.h.data
        target_width = form.target_width.data

        node = yield NodeDocument.find_one({'_id': ObjectId(node_id)})
        if not node:
            raise HTTPError(404)

        upload_file = self.request.files['avatar'][0]

        now = datetime.now()
        document = {
            'name':
            upload_file['filename'],
            'body':
            Binary(upload_file['body']),
            'content_type':
            upload_file['content_type'].split('/')[1].upper(),
            'uploader':
            DBRef(UserDocument.meta['collection'],
                  ObjectId(self.current_user['_id'])),
            'upload_time':
            now
        }

        image = Image.open(StringIO(upload_file['body']))

        if image.size[0] < target_width:
            target_width = image.size[0]

        scale = image.size[0] * 1.0 / target_width

        x = int(x * scale)
        y = int(y * scale)
        w = int(w * scale)
        h = int(h * scale)

        box = (x, y, x + w, y + h)
        image = image.crop(box)

        output = StringIO()
        image = image.resize((64, 64),
                             Image.ANTIALIAS).save(output,
                                                   document['content_type'],
                                                   quality=100)
        document.update({'thumbnail': Binary(output.getvalue())})
        output.close()

        yield NodeAvatarDocument.remove_one({
            'node':
            DBRef(NodeDocument.meta['collection'], ObjectId(node_id))
        })

        image_id = yield ImageDocument.insert(document)

        document = {
            'node':
            DBRef(NodeDocument.meta['collection'], ObjectId(node_id)),
            'image':
            DBRef(ImageDocument.meta['collection'], ObjectId(image_id)),
            'uploader':
            DBRef(UserDocument.meta['collection'],
                  ObjectId(self.current_user['_id'])),
            'upload_time':
            now
        }
        yield NodeAvatarDocument.insert(document)

        self.finish()
Example #21
def _element_to_bson(key, value, check_keys, uuid_subtype):
    if not isinstance(key, basestring):
        raise InvalidDocument("documents must have only string keys, "
                              "key was %r" % key)

    if check_keys:
        if key.startswith("$"):
            raise InvalidDocument("key %r must not start with '$'" % key)
        if "." in key:
            raise InvalidDocument("key %r must not contain '.'" % key)

    name = _make_c_string(key, True)
    if isinstance(value, float):
        return BSONNUM + name + struct.pack("<d", value)

    if _use_uuid:
        if isinstance(value, uuid.UUID):
            # Java Legacy
            if uuid_subtype == JAVA_LEGACY:
                # Python 3.0(.1) returns a bytearray instance for bytes (3.1
                # and newer just return a bytes instance). Convert that to
                # binary_type (here and below) for compatibility.
                from_uuid = binary_type(value.bytes)
                as_legacy_java = from_uuid[0:8][::-1] + from_uuid[8:16][::-1]
                value = Binary(as_legacy_java, subtype=OLD_UUID_SUBTYPE)
            # C# legacy
            elif uuid_subtype == CSHARP_LEGACY:
                # Microsoft GUID representation.
                value = Binary(binary_type(value.bytes_le),
                               subtype=OLD_UUID_SUBTYPE)
            # Python
            else:
                value = Binary(binary_type(value.bytes), subtype=uuid_subtype)

    if isinstance(value, Binary):
        subtype = value.subtype
        if subtype == 2:
            value = struct.pack("<i", len(value)) + value
        return (BSONBIN + name +
                struct.pack("<i", len(value)) + b(chr(subtype)) + value)
    if isinstance(value, Code):
        cstring = _make_c_string(value)
        if not value.scope:
            length = struct.pack("<i", len(cstring))
            return BSONCOD + name + length + cstring
        scope = _dict_to_bson(value.scope, False, uuid_subtype, False)
        full_length = struct.pack("<i", 8 + len(cstring) + len(scope))
        length = struct.pack("<i", len(cstring))
        return BSONCWS + name + full_length + length + cstring + scope
    if isinstance(value, binary_type):
        if PY3:
            # Python3 special case. Store 'bytes' as BSON binary subtype 0.
            return (BSONBIN + name +
                    struct.pack("<i", len(value)) + ZERO + value)
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return BSONSTR + name + length + cstring
    if isinstance(value, unicode):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return BSONSTR + name + length + cstring
    if isinstance(value, dict):
        return BSONOBJ + name + _dict_to_bson(value, check_keys, uuid_subtype, False)
    if isinstance(value, (list, tuple)):
        as_dict = SON(zip([str(i) for i in range(len(value))], value))
        return BSONARR + name + _dict_to_bson(as_dict, check_keys, uuid_subtype, False)
    if isinstance(value, ObjectId):
        return BSONOID + name + value.binary
    if value is True:
        return BSONBOO + name + ONE
    if value is False:
        return BSONBOO + name + ZERO
    if isinstance(value, int):
        # TODO this is an ugly way to check for this...
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        if value > MAX_INT32 or value < MIN_INT32:
            return BSONLON + name + struct.pack("<q", value)
        return BSONINT + name + struct.pack("<i", value)
    # 2to3 will convert long to int here since there is no long in python3.
    # That's OK. The previous if block will match instead.
    if isinstance(value, long):
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        return BSONLON + name + struct.pack("<q", value)
    if isinstance(value, datetime.datetime):
        if value.utcoffset() is not None:
            value = value - value.utcoffset()
        millis = int(calendar.timegm(value.timetuple()) * 1000 +
                     value.microsecond / 1000)
        return BSONDAT + name + struct.pack("<q", millis)
    if isinstance(value, Timestamp):
        time = struct.pack("<I", value.time)
        inc = struct.pack("<I", value.inc)
        return BSONTIM + name + inc + time
    if value is None:
        return BSONNUL + name
    if isinstance(value, (RE_TYPE, Regex)):
        pattern = value.pattern
        flags = ""
        if value.flags & re.IGNORECASE:
            flags += "i"
        if value.flags & re.LOCALE:
            flags += "l"
        if value.flags & re.MULTILINE:
            flags += "m"
        if value.flags & re.DOTALL:
            flags += "s"
        if value.flags & re.UNICODE:
            flags += "u"
        if value.flags & re.VERBOSE:
            flags += "x"
        return BSONRGX + name + _make_c_string(pattern, True) + \
            _make_c_string(flags)
    if isinstance(value, DBRef):
        return _element_to_bson(key, value.as_doc(), False, uuid_subtype)
    if isinstance(value, MinKey):
        return BSONMIN + name
    if isinstance(value, MaxKey):
        return BSONMAX + name

    raise InvalidDocument("cannot convert value of type %s to bson" %
                          type(value))
Example #22
    def create_model(self, applicationName, model='default', query=None, project=None, labels={}, transduction={}, colors={}, drop=False):
        '''
        Creates a trained model by querying the corresponding collection and fitting
        the corresponding pipeline for the application. Clustering is also run, and the
        resulting dendrogram and fitted pipeline are stored in the _models collection.
        '''

        query = from_str(query)
        project = from_str(project)


        # convert labels to tokens
        tokens = defaultdict(lambda : len(tokens))
        token_labels = {k: tokens[v] for k, v in transduction.items()}

        if self.verbose:
            print(f'Finding application <{applicationName}>', end='...', flush=True)

        application = self.db.applications_\
            .find_one({'_id': applicationName})

        if application:

            # pipelineName = application['pipeline']

            # if self.verbose:
            #     print(f'OK\nFinding pipeline <{pipelineName}>', end='...', flush=True)

            # pipeline = self.db.pipelines_\
            #     .find_one({'_id': pipelineName})

            collectionName = application['collection']
            collection = self.db[collectionName]

            if self.verbose:
                print(f'OK\nQuerying collection <{collectionName}> <{query}>', end='...', flush=True)

            X = list(collection.find(query or {}))


            if len(X):
                print(f'found {len(X)}...OK')

                y = [token_labels.get(xi['_id'], -1) for xi in X]
                index = [x['_id'] for x in X]

                if self.verbose:
                    print('Transforming data', end='...', flush=True)

                umap = UMAP()
                base_pipeline = pydoc.locate(application['pipeline'])
                pipeline = Pipeline(base_pipeline.steps + [('umap', umap)])\
                    .set_params(**application.get('params', {}))
                
                X_transform = pipeline.fit_transform(X, y)

                if self.verbose:
                    print('OK\nClustering data', end='...', flush=True)

                parents, costs = cluster(X_transform,
                                         connectivity=umap.graph_,
                                         linkage='ward')

                if self.verbose:
                    print('OK')


                obj_computed = {
                   'pipeline': Binary(pickle.dumps(pipeline)),
                   'parents': parents.tolist(),
                   'costs': costs.tolist(),
                   'instances': index,
                   'X': X_transform.tolist(),
                   'tokens': sorted(tokens, key=tokens.get),
                }

                if project and project != {}:
                    if self.verbose:
                        print(f'Projecting data for histograms {project}', end='...', flush=True)
                    data = collection.aggregate([
                        {'$match': {'_id': {'$in': index}}},
                        {'$project': project}
                    ])

                    df = pd.DataFrame(list(data))\
                        .set_index('_id')\
                        .loc[index]

                    obj_computed['hist'] = df.fillna(df.median(axis=0))\
                        .to_dict(orient='list')

                    if self.verbose:
                        print('OK')

                obj = {
                    '_id': {'application': applicationName,
                            'model': model},
                    'labels': labels,
                    'colors': colors,
                    'date': datetime.datetime.utcnow(),
                    'query': json.dumps(query, indent=2),
                    'project': json.dumps(project, indent=2),
                    'size': len(index),
                    '_id_computed': self.transduction_.put(bson.BSON.encode(obj_computed))
                }

                if drop:
                    for doc in self.db.transduction_.find({'_id': obj['_id']}):
                        self.transduction_.delete(doc['_id_computed'])
                        self.db.transduction_.delete_one({'_id': doc['_id']})

                self.db.transduction_\
                    .insert_one(obj)

                if self.verbose:
                    print('done.')

                obj.update(obj_computed)

                return obj
Example #23
        def _capture_screenshot(self):
            # damn selenium, how I hate it
            # pain
            #   suffering
            #       zero documentation

            img_byte_arr = BytesIO()

            url = 'http://%s:%s/' % (self._host["ip"], self._host["port"])
            self._host["data"]["screenshot"] = None

            self._logger.debug("Obtaining driver")

            driver = None
            img = None
            try:
                caps = dict(DesiredCapabilities.CHROME)

                caps['args'] = [
                    "--proxy-server",
                    "socks5://%s:9050" % cnf.stalker.proxy
                ]

                driver = webdriver.Remote(
                    command_executor="http://%s:4444/wd/hub" %
                    cnf.stalker.HTTP.screenshots.selenium,
                    desired_capabilities=caps)

                driver.set_window_size(cnf.stalker.HTTP.screenshots.width,
                                       cnf.stalker.HTTP.screenshots.height)
                driver.set_page_load_timeout(
                    cnf.stalker.HTTP.screenshots.load_timeout)
                driver.set_script_timeout(
                    cnf.stalker.HTTP.screenshots.script_timeout)

                self._logger.debug("Loading %s:%s", self._host['ip'],
                                   self._host['port'])

                driver.get(url)
                time.sleep(cnf.stalker.HTTP.screenshots.pause)
                driver.execute_script("window.scrollTo(0, 0);")
                img = driver.get_screenshot_as_png()
            except Exception as e:
                raise e
            finally:
                if driver:
                    driver.quit()

            self._logger.debug("Finished for %s:%s", self._host['ip'],
                               self._host['port'])

            with Image.open(BytesIO(img)) as img:
                img = img.crop((0, 0, cnf.stalker.HTTP.screenshots.width,
                                cnf.stalker.HTTP.screenshots.height))
                extrema = img.convert("L").getextrema()
                if not extrema == (0, 0):
                    img.save(img_byte_arr, format='PNG')

                    self._host["data"]["screenshot"] = Binary(
                        zlib.compress(img_byte_arr.getvalue()))
                    img_byte_arr.close()
                    self._logger.debug(
                        "Saved screen of %s:%s (e: %s)" %
                        (self._host["ip"], self._host["port"], extrema))
                else:
                    self._logger.debug(
                        "Not saving screen of %s:%s, as it is empty",
                        self._host["ip"], self._host["port"])
Example #24
def _element_to_bson(key, value, check_keys, uuid_subtype):
    if not isinstance(key, basestring):
        raise InvalidDocument("documents must have only string keys, "
                              "key was %r" % key)

    if check_keys:
        if key.startswith("$"):
            raise InvalidDocument("key %r must not start with '$'" % key)
        if "." in key:
            raise InvalidDocument("key %r must not contain '.'" % key)

    name = _make_c_string(key, True)
    if isinstance(value, float):
        return "\x01" + name + struct.pack("<d", value)

    # Use Binary w/ subtype 3 for UUID instances
    if _use_uuid:
        if isinstance(value, uuid.UUID):
            value = Binary(value.bytes, subtype=uuid_subtype)

    if isinstance(value, Binary):
        subtype = value.subtype
        if subtype == 2:
            value = struct.pack("<i", len(value)) + value
        return "\x05%s%s%s%s" % (name, struct.pack("<i", len(value)),
                                 chr(subtype), value)
    if isinstance(value, Code):
        cstring = _make_c_string(value)
        if not value.scope:
            length = struct.pack("<i", len(cstring))
            return "\x0D" + name + length + cstring
        scope = _dict_to_bson(value.scope, False, uuid_subtype, False)
        full_length = struct.pack("<i", 8 + len(cstring) + len(scope))
        length = struct.pack("<i", len(cstring))
        return "\x0F" + name + full_length + length + cstring + scope
    if isinstance(value, str):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return "\x02" + name + length + cstring
    if isinstance(value, unicode):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return "\x02" + name + length + cstring
    if isinstance(value, dict):
        return "\x03" + name + _dict_to_bson(value, check_keys, uuid_subtype, False)
    if isinstance(value, (list, tuple)):
        as_dict = SON(zip([str(i) for i in range(len(value))], value))
        return "\x04" + name + _dict_to_bson(as_dict, check_keys, uuid_subtype, False)
    if isinstance(value, ObjectId):
        return "\x07" + name + value.binary
    if value is True:
        return "\x08" + name + "\x01"
    if value is False:
        return "\x08" + name + "\x00"
    if isinstance(value, int):
        # TODO this is an ugly way to check for this...
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        if value > MAX_INT32 or value < MIN_INT32:
            return "\x12" + name + struct.pack("<q", value)
        return "\x10" + name + struct.pack("<i", value)
    if isinstance(value, long):
        # XXX No long type in Python 3
        if value > MAX_INT64 or value < MIN_INT64:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        return "\x12" + name + struct.pack("<q", value)
    if isinstance(value, datetime.datetime):
        if value.utcoffset() is not None:
            value = value - value.utcoffset()
        millis = int(calendar.timegm(value.timetuple()) * 1000 +
                     value.microsecond / 1000)
        return "\x09" + name + struct.pack("<q", millis)
    if isinstance(value, Timestamp):
        time = struct.pack("<I", value.time)
        inc = struct.pack("<I", value.inc)
        return "\x11" + name + inc + time
    if value is None:
        return "\x0A" + name
    if isinstance(value, RE_TYPE):
        pattern = value.pattern
        flags = ""
        if value.flags & re.IGNORECASE:
            flags += "i"
        if value.flags & re.LOCALE:
            flags += "l"
        if value.flags & re.MULTILINE:
            flags += "m"
        if value.flags & re.DOTALL:
            flags += "s"
        if value.flags & re.UNICODE:
            flags += "u"
        if value.flags & re.VERBOSE:
            flags += "x"
        return "\x0B" + name + _make_c_string(pattern, True) + \
            _make_c_string(flags)
    if isinstance(value, DBRef):
        return _element_to_bson(key, value.as_doc(), False, uuid_subtype)
    if isinstance(value, MinKey):
        return "\xFF" + name
    if isinstance(value, MaxKey):
        return "\x7F" + name

    raise InvalidDocument("cannot convert value of type %s to bson" %
                          type(value))
Example #25
def test_read_object_backwards_compat():
    self = create_autospec(PickleStore)
    version = {'blob': Binary(compressHC(cPickle.dumps(object)))}
    assert PickleStore.read(self, sentinel.arctic_lib, version,
                            sentinel.symbol) == object
Example #26
    def write(self, symbol, item, chunker=DateChunker(), **kwargs):
        """
        Writes data from item to symbol in the database

        Parameters
        ----------
        symbol: str
            the symbol that will be used to reference the written data
        item: DataFrame or Series
            the data to write to the database
        chunker: Object of type Chunker
            A chunker that chunks the data in item
        kwargs:
            optional keyword args that are passed to the chunker. Includes:
            chunk_size:
                used by chunker to break data into discrete chunks.
                see specific chunkers for more information about this param.
        """
        if not isinstance(item, (DataFrame, Series)):
            raise Exception("Can only chunk DataFrames and Series")

        self._arctic_lib.check_quota()

        previous_shas = []
        doc = {}

        doc[SYMBOL] = symbol
        doc[LEN] = len(item)
        doc[SERIALIZER] = self.serializer.TYPE
        doc[CHUNKER] = chunker.TYPE

        sym = self._get_symbol_info(symbol)
        if sym:
            previous_shas = set([
                Binary(x[SHA]) for x in self._collection.find(
                    {SYMBOL: symbol},
                    projection={
                        SHA: True,
                        '_id': False
                    },
                )
            ])

        op = False
        bulk = self._collection.initialize_unordered_bulk_op()
        chunk_count = 0

        for start, end, chunk_size, record in chunker.to_chunks(
                item, **kwargs):
            chunk_count += 1
            data = self.serializer.serialize(record)
            doc[METADATA] = {
                'columns':
                data[METADATA][COLUMNS] if COLUMNS in data[METADATA] else ''
            }
            doc[CHUNK_SIZE] = chunk_size

            size_chunked = len(data[DATA]) > MAX_CHUNK_SIZE
            for i in xrange(int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)):
                chunk = {
                    DATA:
                    Binary(data[DATA][i * MAX_CHUNK_SIZE:(i + 1) *
                                      MAX_CHUNK_SIZE])
                }
                chunk[METADATA] = data[METADATA]
                if size_chunked:
                    chunk[SEGMENT] = i
                else:
                    chunk[SEGMENT] = -1
                chunk[START] = start
                chunk[END] = end
                chunk[SYMBOL] = symbol
                dates = [
                    chunker.chunk_to_str(start),
                    chunker.chunk_to_str(end),
                    str(chunk[SEGMENT]).encode('ascii')
                ]
                chunk[SHA] = self._checksum(dates, chunk[DATA])
                if chunk[SHA] not in previous_shas:
                    op = True
                    find = {
                        SYMBOL: symbol,
                        START: start,
                        END: end,
                        SEGMENT: chunk[SEGMENT]
                    }
                    bulk.find(find).upsert().update_one({'$set': chunk})
                else:
                    # already exists, dont need to update in mongo
                    previous_shas.remove(chunk[SHA])
        if op:
            bulk.execute()

        doc[CHUNK_COUNT] = chunk_count
        doc[APPEND_COUNT] = 0

        if previous_shas:
            mongo_retry(self._collection.delete_many)({
                SYMBOL: symbol,
                SHA: {
                    '$in': list(previous_shas)
                }
            })

        mongo_retry(self._symbols.update_one)({
            SYMBOL: symbol
        }, {
            '$set': doc
        },
                                              upsert=True)
Example #27
    def test_uuid_representation(self):
        coll = self.db.uuid
        coll.drop()

        # Test property
        self.assertEqual(UuidRepresentation.UNSPECIFIED,
                         coll.codec_options.uuid_representation)

        # Test basic query
        uu = uuid.uuid4()
        # Insert as binary subtype 3
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
        legacy_opts = coll.codec_options
        coll.insert_one({'uu': uu})
        self.assertEqual(uu, coll.find_one({'uu': uu})['uu'])
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=STANDARD))
        self.assertEqual(STANDARD, coll.codec_options.uuid_representation)
        self.assertEqual(None, coll.find_one({'uu': uu}))
        uul = Binary.from_uuid(uu, PYTHON_LEGACY)
        self.assertEqual(uul, coll.find_one({'uu': uul})['uu'])

        # Test count_documents
        self.assertEqual(0, coll.count_documents({'uu': uu}))
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
        self.assertEqual(1, coll.count_documents({'uu': uu}))

        # Test delete
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=STANDARD))
        coll.delete_one({'uu': uu})
        self.assertEqual(1, coll.count_documents({}))
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
        coll.delete_one({'uu': uu})
        self.assertEqual(0, coll.count_documents({}))

        # Test update_one
        coll.insert_one({'_id': uu, 'i': 1})
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=STANDARD))
        coll.update_one({'_id': uu}, {'$set': {'i': 2}})
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
        self.assertEqual(1, coll.find_one({'_id': uu})['i'])
        coll.update_one({'_id': uu}, {'$set': {'i': 2}})
        self.assertEqual(2, coll.find_one({'_id': uu})['i'])

        # Test Cursor.distinct
        self.assertEqual([2], coll.find({'_id': uu}).distinct('i'))
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=STANDARD))
        self.assertEqual([], coll.find({'_id': uu}).distinct('i'))

        # Test findAndModify
        self.assertEqual(
            None, coll.find_one_and_update({'_id': uu}, {'$set': {
                'i': 5
            }}))
        coll = self.db.get_collection(
            "uuid", CodecOptions(uuid_representation=PYTHON_LEGACY))
        self.assertEqual(
            2,
            coll.find_one_and_update({'_id': uu}, {'$set': {
                'i': 5
            }})['i'])
        self.assertEqual(5, coll.find_one({'_id': uu})['i'])

        # Test command
        self.assertEqual(
            5,
            self.db.command('findAndModify',
                            'uuid',
                            update={'$set': {
                                'i': 6
                            }},
                            query={'_id': uu},
                            codec_options=legacy_opts)['value']['i'])
        self.assertEqual(
            6,
            self.db.command('findAndModify',
                            'uuid',
                            update={'$set': {
                                'i': 7
                            }},
                            query={'_id': Binary.from_uuid(uu, PYTHON_LEGACY)
                                   })['value']['i'])
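
In PyMongo 3.11+ the equivalent configuration can be spelled with UuidRepresentation, so plain uuid.UUID values are encoded as Binary automatically (a sketch, assuming db is a pymongo Database as in the test above):

import uuid
from bson.binary import UuidRepresentation
from bson.codec_options import CodecOptions

opts = CodecOptions(uuid_representation=UuidRepresentation.STANDARD)
coll = db.get_collection("uuid", codec_options=opts)
coll.insert_one({"uu": uuid.uuid4()})     # stored as Binary subtype 4
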
Example #28
    def __update(self, sym, item, combine_method=None, chunk_range=None):
        '''
        Helper method used by update and append, since they very closely
        resemble each other and really differ only in the combine method:
        append will combine existing data with new data (within a chunk),
        whereas update will replace existing data with new data (within a
        chunk).
        '''
        if not isinstance(item, (DataFrame, Series)):
            raise Exception("Can only chunk DataFrames and Series")

        self._arctic_lib.check_quota()

        symbol = sym[SYMBOL]

        if chunk_range is not None:
            self.delete(symbol, chunk_range)
            sym = self._get_symbol_info(symbol)

        bulk = self._collection.initialize_unordered_bulk_op()
        op = False
        chunker = CHUNKER_MAP[sym[CHUNKER]]

        for start, end, _, record in chunker.to_chunks(
                item, chunk_size=sym[CHUNK_SIZE]):
            # read out matching chunks
            df = self.read(symbol,
                           chunk_range=chunker.to_range(start, end),
                           filter_data=False)
            # assuming they exist, update them and store the original chunk
            # range for later use
            if len(df) > 0:
                record = combine_method(df, record)
                if record is None or record.equals(df):
                    continue

                sym[APPEND_COUNT] += len(record)
                sym[LEN] += len(record) - len(df)
            else:
                sym[CHUNK_COUNT] += 1
                sym[LEN] += len(record)

            data = SER_MAP[sym[SERIALIZER]].serialize(record)
            op = True

            # remove old segments for this chunk in case we now have less
            # segments than we did before
            chunk_count = int(len(data[DATA]) / MAX_CHUNK_SIZE + 1)
            seg_count = self._collection.count({
                SYMBOL: symbol,
                START: start,
                END: end
            })
            if seg_count > chunk_count:
                # if chunk count is 1, the segment id will be -1, not 1
                self._collection.delete_many({
                    SYMBOL: symbol,
                    START: start,
                    END: end,
                    SEGMENT: {
                        '$gt': seg_count if chunk_count > 1 else -1
                    }
                })

            size_chunked = chunk_count > 1
            for i in xrange(chunk_count):
                chunk = {
                    DATA:
                    Binary(data[DATA][i * MAX_CHUNK_SIZE:(i + 1) *
                                      MAX_CHUNK_SIZE])
                }
                chunk[METADATA] = data[METADATA]
                if size_chunked:
                    chunk[SEGMENT] = i
                else:
                    chunk[SEGMENT] = -1
                chunk[START] = start
                chunk[END] = end
                chunk[SYMBOL] = symbol
                dates = [
                    chunker.chunk_to_str(start),
                    chunker.chunk_to_str(end),
                    str(chunk[SEGMENT]).encode('ascii')
                ]
                sha = self._checksum(dates, data[DATA])
                chunk[SHA] = sha
                bulk.find({
                    SYMBOL: symbol,
                    START: start,
                    END: end,
                    SEGMENT: chunk[SEGMENT]
                }).upsert().update_one({'$set': chunk})
        if op:
            bulk.execute()

        self._symbols.replace_one({SYMBOL: symbol}, sym)
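A minimal pandas sketch (not from the library above, and assuming combine_first-style semantics) of the distinction the docstring describes: an append-style combine keeps existing values where the index overlaps, while an update-style combine lets the new data win.

import pandas as pd

existing = pd.DataFrame({'v': [1, 2]},
                        index=pd.date_range('2020-01-01', periods=2))
incoming = pd.DataFrame({'v': [20, 30]},
                        index=pd.date_range('2020-01-02', periods=2))

# append-style: existing values win on the overlapping 2020-01-02 row
appended = existing.combine_first(incoming)
# update-style: incoming values win on the overlapping 2020-01-02 row
updated = incoming.combine_first(existing)

print(appended)  # 2020-01-02 stays 2
print(updated)   # 2020-01-02 becomes 20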
Example #29
0
def train_model(recom_vars_obj):
    '''
    This function takes the variable-holder class as input and loads both
    latent matrices from the database. It then checks every movie and user
    and, for any new ones, adds rows to the corresponding M and U matrices,
    initialised from a normal distribution centred at 0 with standard
    deviation 1.
    After loading the data, it calls the gradient-descent function
    matrix_factorisation, which returns the updated matrices U and M, and
    then writes the updated values back to the database.
    A minimal round-trip sketch of the Binary/pickle storage pattern used
    here follows this example.
    '''
    movie_counter = 0
    if recom_vars_obj.np_arrays.find({'name': 'U'}).count() > 0:
        U = pickle.loads(
            recom_vars_obj.np_arrays.find({
                'name': 'U'
            }).next()['matrix'])
        M = pickle.loads(
            recom_vars_obj.np_arrays.find({
                'name': 'M'
            }).next()['matrix'])
        user_counter = len(U)
        movie_counter = len(M)
        for movie in recom_vars_obj.movies.find():
            try:
                recom_vars_obj.movie_index_dict[
                    movie['_id']] = movie['matrix_index']
            except KeyError:
                recom_vars_obj.movie_index_dict[movie['_id']] = movie_counter
                recom_vars_obj.movies.update(
                    {'_id': movie['_id']},
                    {'$set': {
                        'matrix_index': movie_counter
                    }})
                M = np.vstack([M, np.random.normal(0, 1, [1, k])])
                movie_counter += 1
        for user in recom_vars_obj.users.find():
            try:
                recom_vars_obj.user_index_dict[
                    user['_id']] = user['matrix_index']
            except KeyError:
                recom_vars_obj.user_index_dict[user['_id']] = user_counter
                recom_vars_obj.users.update(
                    {'_id': user['_id']},
                    {'$set': {
                        'matrix_index': user_counter
                    }})
                U = np.vstack([U, np.random.normal(0, 1, [1, k])])
                user_counter += 1
        recom_vars_obj.ratings_array = np.empty((user_counter, movie_counter))
        recom_vars_obj.ratings_array.fill(np.nan)
        rating_indices = []
        for rating in recom_vars_obj.ratings.find():
            rating_indices.append([
                recom_vars_obj.user_index_dict[rating['user_id']],
                recom_vars_obj.movie_index_dict[rating['movie_id']]
            ])
            recom_vars_obj.ratings_array[
                recom_vars_obj.user_index_dict[rating['user_id']],
                recom_vars_obj.
                movie_index_dict[rating['movie_id']]] = rating['rating']
        new_U, new_M = matrix_factorisation(recom_vars_obj.ratings_array,
                                            user_counter, movie_counter,
                                            rating_indices, U, M)
    else:
        for movie in recom_vars_obj.movies.find():
            recom_vars_obj.movie_index_dict[movie['_id']] = movie_counter
            recom_vars_obj.movies.update(
                {'_id': movie['_id']},
                {'$set': {
                    'matrix_index': movie_counter
                }})
            movie_counter += 1
        user_counter = 0
        for user in recom_vars_obj.users.find():
            recom_vars_obj.user_index_dict[user['_id']] = user_counter
            recom_vars_obj.users.update(
                {'_id': user['_id']}, {'$set': {
                    'matrix_index': user_counter
                }})
            user_counter += 1
        recom_vars_obj.ratings_array = np.empty((user_counter, movie_counter))
        recom_vars_obj.ratings_array.fill(np.nan)
        rating_indices = []
        print("4")
        for rating in recom_vars_obj.ratings.find():
            rating_indices.append([
                recom_vars_obj.user_index_dict[rating['user_id']],
                recom_vars_obj.movie_index_dict[rating['movie_id']]
            ])
            recom_vars_obj.ratings_array[
                recom_vars_obj.user_index_dict[rating['user_id']],
                recom_vars_obj.
                movie_index_dict[rating['movie_id']]] = rating['rating']
        new_U, new_M = matrix_factorisation(recom_vars_obj.ratings_array,
                                            user_counter, movie_counter,
                                            rating_indices)
    recom_vars_obj.np_arrays.update(
        {'name': 'U'}, {'$set': {
            'matrix': Binary(pickle.dumps(new_U))
        }},
        upsert=True)
    recom_vars_obj.np_arrays.update({'name': 'U_len'},
                                    {'$set': {
                                        'value': len(U)
                                    }},
                                    upsert=True)
    recom_vars_obj.np_arrays.update(
        {'name': 'M'}, {'$set': {
            'matrix': Binary(pickle.dumps(new_M))
        }},
        upsert=True)
    recom_vars_obj.np_arrays.update({'name': 'M_len'},
                                    {'$set': {
                                        'value': len(M)
                                    }},
                                    upsert=True)
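A minimal round-trip sketch of the storage pattern train_model relies on: a NumPy matrix pickled into a BSON Binary on write and unpickled on read. The database, collection and field names here are illustrative assumptions, not taken from the source.

import pickle

import numpy as np
from bson.binary import Binary
from pymongo import MongoClient

np_arrays = MongoClient()['recommender']['np_arrays']  # hypothetical names
U = np.random.normal(0, 1, (3, 2))
# store the pickled matrix as a Binary field, creating the document if needed
np_arrays.update_one({'name': 'U'},
                     {'$set': {'matrix': Binary(pickle.dumps(U))}},
                     upsert=True)
# load it back and verify the round trip
U_loaded = pickle.loads(np_arrays.find_one({'name': 'U'})['matrix'])
assert np.array_equal(U, U_loaded)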
Example #30
0
# NOTE: this snippet begins mid-loop; the enclosing loop that walks the
# inverted-index documents (binding z and j) and the definitions of
# A (m x n term-document matrix), n, m and mydb are not shown in the source.
        if z in ["_id", "title"]:
            continue
        for k in j:
            # k[0] -> term_id, k[1] -> doc_id, k[2] -> term_freq
            A[k[0]][k[1]] = k[2]

# print(linalg.det(A))
print(A)

# n=6; m=4
# log-entropy weighting: p_ij = tf_ij / gf_i,
# g_i = 1 + sum_j p_ij * log10(p_ij) / log10(n)
term_freq = [sum(A[i]) for i in range(m)]
entropy = []
for i in range(m):
    s = 0
    for j in range(n):
        pij = A[i][j] / term_freq[i]
        if pij == 0:
            continue
        s += pij * np.log10(pij)
    s = s / np.log10(n)
    entropy.append(s + 1)
print("\nentropy\n", entropy, "\n")

# weighted matrix: a_ij = g_i * log10(tf_ij + 1)
A = np.array([[entropy[i] * np.log10(A[i][j] + 1) for j in range(n)]
              for i in range(m)])
print(A)
val = Binary(pickle.dumps(A, protocol=2), subtype=128)
mycol = mydb["log_en"]
mycol.insert_one({"key": val})


Example #31
0
    def write(self, symbol, item, chunk_size):
        """
        Writes data from item to symbol in the database

        Parameters
        ----------
        symbol: str
            the symbol that will be used to reference the written data
        item: dataframe or series
            the data to write to the database
        chunk_size: ?
            A chunk size that is understood by the specified chunker
        """

        doc = {}
        doc['symbol'] = symbol
        doc['chunk_size'] = chunk_size

        if isinstance(item, Series):
            doc['type'] = SeriesSerializer.TYPE
        elif isinstance(item, DataFrame):
            doc['type'] = DataFrameSerializer.TYPE
        else:
            raise Exception("Can only chunk Series and DataFrames")

        previous_shas = []
        if self._get_symbol_info(symbol):
            previous_shas = set([
                Binary(x['sha']) for x in self._collection.find(
                    {'symbol': symbol},
                    projection={
                        'sha': True,
                        '_id': False
                    },
                )
            ])
        records = []
        ranges = []
        dtype = None

        for start, end, record in self.chunker.to_chunks(item, chunk_size):
            r, dtype = serialize(record, string_max_len=self.STRING_MAX)
            records.append(r)
            ranges.append((start, end))

        item = np.array([r for record in records for r in record]).flatten()
        for record in records:
            if record.dtype.hasobject:
                raise UnhandledDtypeException()

        doc['dtype'] = str(dtype)
        doc['shape'] = (-1, ) + item.shape[1:]
        doc['dtype_metadata'] = dict(dtype.metadata or {})
        doc['len'] = len(item)

        chunks = [r.tostring() for r in records]
        chunks = compress_array(chunks)

        op = False
        bulk = self._collection.initialize_unordered_bulk_op()
        for chunk, rng in zip(chunks, ranges):
            start = rng[0]
            end = rng[1]
            chunk = {'data': Binary(chunk)}
            chunk['start'] = start
            chunk['end'] = end
            chunk['symbol'] = symbol
            chunk['sha'] = checksum(symbol, chunk)

            if chunk['sha'] not in previous_shas:
                op = True
                bulk.find({
                    'symbol': symbol,
                    'sha': chunk['sha']
                }, ).upsert().update_one({'$set': chunk})
            else:
                # already exists, don't need to update in mongo
                # (see the dedup sketch after this example)
                previous_shas.remove(chunk['sha'])
        if op:
            bulk.execute()

        doc['chunk_count'] = len(chunks)
        doc['append_size'] = 0
        doc['append_count'] = 0

        if previous_shas:
            mongo_retry(self._collection.delete_many)({
                'sha': {
                    '$in': list(previous_shas)
                }
            })

        mongo_retry(self._symbols.update_one)({
            'symbol': symbol
        }, {
            '$set': doc
        },
                                              upsert=True)
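A standalone sketch (plain dicts instead of MongoDB, and a hypothetical checksum helper) of the content-hash upsert pattern write uses above: hash each chunk, upsert only the chunks whose hash changed, and delete the hashes that are no longer produced.

import hashlib


def checksum(symbol, payload):
    # stand-in for the library's own checksum helper (assumption)
    return hashlib.sha1(symbol.encode() + payload).hexdigest()


def write_chunks(store, symbol, chunks):
    existing = store.setdefault(symbol, {})  # sha -> chunk payload
    previous_shas = set(existing)
    for payload in chunks:
        sha = checksum(symbol, payload)
        if sha in previous_shas:
            previous_shas.discard(sha)  # unchanged chunk: skip the write
        else:
            existing[sha] = payload     # new or changed chunk: "upsert"
    for sha in previous_shas:           # shas no longer produced by this write
        del existing[sha]               # mirrors the delete_many clean-up


store = {}
write_chunks(store, 'SYM', [b'chunk-1', b'chunk-2'])
write_chunks(store, 'SYM', [b'chunk-1', b'chunk-3'])
assert len(store['SYM']) == 2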
Example #32
0
    def __setitem__(self, url, result):
        record = {
            'result': Binary(zlib.compress(pickle.dumps(result))),
            'timestamp': datetime.utcnow()
        }
        self.db.webpage.update({'_id': url}, {'$set': record}, upsert=True)
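A plausible companion read path for the cache above (not shown in the source): look the URL up by _id, then decompress and unpickle the stored Binary.

import pickle
import zlib


def cache_get(db, url):
    # return the cached result for url, or raise if it was never stored
    record = db.webpage.find_one({'_id': url})
    if record is None:
        raise KeyError(url + ' is not cached')
    return pickle.loads(zlib.decompress(record['result']))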
Example #33
0
    def update(self, symbol, item):
        """
        Merges data from item onto the existing data in the database for
        symbol. Data that exists in both symbol and item for the same
        index/multiindex will be overwritten by the data in item.

        Parameters
        ----------
        symbol: str
            the symbol for the given item in the DB
        item:
            the data to update
        """

        sym = self._get_symbol_info(symbol)
        if not sym:
            raise NoDataFoundException("Symbol does not exist. Cannot update")

        records = []
        ranges = []
        orig_ranges = []
        for start, end, record in self.chunker.to_chunks(
                item, sym['chunk_size']):
            # read out matching chunks
            df = self.read(symbol,
                           chunk_range=self.chunker.to_range(start, end))
            # assuming they exist, update them and store the original chunk
            # range for later use
            if not df.empty:
                if df.equals(record):
                    continue
                record = record.combine_first(df)
                orig_ranges.append((self.chunker.to_start_end(record)))
            else:
                orig_ranges.append((None, None))

            r, _ = serialize(record, string_max_len=self.STRING_MAX)
            records.append(r)
            ranges.append((start, end))

        if len(records) > 0:
            chunks = [r.tostring() for r in records]
            lens = [len(i) for i in chunks]
            chunks = compress_array(chunks)

            seg_count = 0
            seg_len = 0

            bulk = self._collection.initialize_unordered_bulk_op()
            for chunk, rng, orig_rng, rec_len in zip(chunks, ranges,
                                                     orig_ranges, lens):
                start = rng[0]
                end = rng[1]
                orig_start = orig_rng[0]
                if orig_start is None:
                    sym['len'] += rec_len
                    seg_count += 1
                    seg_len += rec_len
                segment = {'data': Binary(chunk)}
                segment['start'] = start
                segment['end'] = end
                sha = checksum(symbol, segment)
                segment['sha'] = sha
                if orig_start is None:
                    # new chunk
                    bulk.find({
                        'symbol': symbol,
                        'sha': sha,
                        'start': segment['start']
                    }).upsert().update_one({'$set': segment})
                else:
                    bulk.find({
                        'symbol': symbol,
                        'start': orig_start
                    }).update_one({'$set': segment})
            if len(chunks) > 0:
                bulk.execute()

            if seg_count != 0:
                sym['chunk_count'] += seg_count
                sym['append_size'] += seg_len
                sym['append_count'] += seg_count
            self._symbols.replace_one({'symbol': symbol}, sym)
Example #34
0
    def run_operation(self, sessions, collection, operation):
        original_collection = collection
        name = camel_to_snake(operation['name'])
        if name == 'run_command':
            name = 'command'
        elif name == 'download_by_name':
            name = 'open_download_stream_by_name'
        elif name == 'download':
            name = 'open_download_stream'

        database = collection.database
        collection = database.get_collection(collection.name)
        if 'collectionOptions' in operation:
            collection = collection.with_options(
                **self.parse_options(operation['collectionOptions']))

        object_name = self.get_object_name(operation)
        if object_name == 'gridfsbucket':
            # Only create the GridFSBucket when we need it (for the gridfs
            # retryable reads tests).
            obj = GridFSBucket(database,
                               bucket_name=collection.name,
                               disable_md5=True)
        else:
            objects = {
                'client': database.client,
                'database': database,
                'collection': collection,
                'testRunner': self
            }
            objects.update(sessions)
            obj = objects[object_name]

        # Combine arguments with options and handle special cases.
        arguments = operation.get('arguments', {})
        arguments.update(arguments.pop("options", {}))
        self.parse_options(arguments)

        cmd = getattr(obj, name)

        with_txn_callback = functools.partial(self.run_operations,
                                              sessions,
                                              original_collection,
                                              in_with_transaction=True)
        prepare_spec_arguments(operation, arguments, name, sessions,
                               with_txn_callback)

        if name == 'run_on_thread':
            args = {'sessions': sessions, 'collection': collection}
            args.update(arguments)
            arguments = args
        result = cmd(**dict(arguments))

        if name == "aggregate":
            if arguments["pipeline"] and "$out" in arguments["pipeline"][-1]:
                # Read from the primary to ensure causal consistency.
                out = collection.database.get_collection(
                    arguments["pipeline"][-1]["$out"],
                    read_preference=ReadPreference.PRIMARY)
                return out.find()
        if name == "map_reduce":
            if isinstance(result, dict) and 'results' in result:
                return result['results']
        if 'download' in name:
            result = Binary(result.read())

        if isinstance(result, Cursor) or isinstance(result, CommandCursor):
            return list(result)

        return result
Example #35
0
def object_hook(dct, compile_re=True):
    if "$oid" in dct:
        return ObjectId(str(dct["$oid"]))
    if "$ref" in dct:
        return DBRef(dct["$ref"], dct["$id"], dct.get("$db", None))
    if "$date" in dct:
        dtm = dct["$date"]
        # mongoexport 2.6 and newer
        if isinstance(dtm, str):
            # datetime.datetime.strptime is new in python 2.5
            naive = datetime.datetime(
                *(time.strptime(dtm[:19], "%Y-%m-%dT%H:%M:%S")[0:6]))
            # The %f format is new in python 2.6
            micros = int(dtm[20:23]) * 1000
            aware = naive.replace(microsecond=micros, tzinfo=utc)
            offset = dtm[23:]
            if not offset or offset == 'Z':
                # UTC
                return aware
            else:
                if len(offset) == 5:
                    # Offset from mongoexport is in format (+|-)HHMM
                    secs = (int(offset[1:3]) * 3600 + int(offset[3:]) * 60)
                elif ':' in offset and len(offset) == 6:
                    # RFC-3339 format (+|-)HH:MM
                    hours, minutes = offset[1:].split(':')
                    secs = (int(hours) * 3600 + int(minutes) * 60)
                else:
                    # Not RFC-3339 compliant or mongoexport output.
                    raise ValueError("invalid format for offset")
                if offset[0] == "-":
                    secs *= -1
                return aware - datetime.timedelta(seconds=secs)
        # mongoexport 2.6 and newer, time before the epoch (SERVER-15275)
        elif isinstance(dtm, dict):
            secs = float(dtm["$numberLong"]) / 1000.0
        # mongoexport before 2.6
        else:
            secs = float(dtm) / 1000.0
        return EPOCH_AWARE + datetime.timedelta(seconds=secs)
    if "$regex" in dct:
        flags = 0
        # PyMongo always adds $options but some other tools may not.
        for opt in dct.get("$options", ""):
            flags |= _RE_OPT_TABLE.get(opt, 0)

        if compile_re:
            return re.compile(dct["$regex"], flags)
        else:
            return Regex(dct["$regex"], flags)
    if "$minKey" in dct:
        return MinKey()
    if "$maxKey" in dct:
        return MaxKey()
    if "$binary" in dct:
        if isinstance(dct["$type"], int):
            dct["$type"] = "%02x" % dct["$type"]
        subtype = int(dct["$type"], 16)
        if subtype >= 0xffffff80:  # Handle mongoexport values
            subtype = int(dct["$type"][6:], 16)
        return Binary(base64.b64decode(dct["$binary"].encode()), subtype)
    if "$code" in dct:
        return Code(dct["$code"], dct.get("$scope"))
    if bson.has_uuid() and "$uuid" in dct:
        return bson.uuid.UUID(dct["$uuid"])
    if "$undefined" in dct:
        return None
    if "$numberLong" in dct:
        # 2to3 will change this to int. PyMongo 3.0 supports
        # a new type, Int64, to avoid round trip issues.
        return int(dct["$numberLong"])
    if "$timestamp" in dct:
        tsp = dct["$timestamp"]
        return Timestamp(tsp["t"], tsp["i"])
    return dct
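A small usage sketch for the hook above (assuming the module-level imports it relies on, such as ObjectId and utc, are in place): plug it into json.loads to turn MongoDB extended-JSON values back into BSON/Python types.

import json

doc = json.loads(
    '{"_id": {"$oid": "5f2b6c2e9d1c4b3a2f1e0d9c"},'
    ' "when": {"$date": "2020-08-05T12:34:56.789Z"},'
    ' "n": {"$numberLong": "42"}}',
    object_hook=object_hook)
# doc["_id"] is an ObjectId, doc["when"] a timezone-aware datetime,
# and doc["n"] a plain int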
Example #36
0
    def run_operation(self, sessions, collection, operation):
        original_collection = collection
        name = camel_to_snake(operation['name'])
        if name == 'run_command':
            name = 'command'
        elif name == 'download_by_name':
            name = 'open_download_stream_by_name'
        elif name == 'download':
            name = 'open_download_stream'

        database = collection.database
        collection = database.get_collection(collection.name)
        if 'collectionOptions' in operation:
            collection = collection.with_options(
                **self.parse_options(operation['collectionOptions']))

        object_name = self.get_object_name(operation)
        if object_name == 'gridfsbucket':
            # Only create the GridFSBucket when we need it (for the gridfs
            # retryable reads tests).
            obj = GridFSBucket(database,
                               bucket_name=collection.name,
                               disable_md5=True)
        else:
            objects = {
                'client': database.client,
                'database': database,
                'collection': collection,
                'testRunner': self
            }
            objects.update(sessions)
            obj = objects[object_name]

        # Combine arguments with options and handle special cases.
        arguments = operation.get('arguments', {})
        arguments.update(arguments.pop("options", {}))
        self.parse_options(arguments)

        cmd = getattr(obj, name)

        for arg_name in list(arguments):
            c2s = camel_to_snake(arg_name)
            # PyMongo accepts sort as list of tuples.
            if arg_name == "sort":
                sort_dict = arguments[arg_name]
                arguments[arg_name] = list(iteritems(sort_dict))
            # Named "key" instead of fieldName.
            if arg_name == "fieldName":
                arguments["key"] = arguments.pop(arg_name)
            # Aggregate uses "batchSize", while find uses batch_size.
            elif ((arg_name == "batchSize" or arg_name == "allowDiskUse")
                  and name == "aggregate"):
                continue
            # Requires boolean returnDocument.
            elif arg_name == "returnDocument":
                arguments[c2s] = arguments.pop(arg_name) == "After"
            elif c2s == "requests":
                # Parse each request into a bulk write model.
                requests = []
                for request in arguments["requests"]:
                    bulk_model = camel_to_upper_camel(request["name"])
                    bulk_class = getattr(operations, bulk_model)
                    bulk_arguments = camel_to_snake_args(request["arguments"])
                    requests.append(bulk_class(**dict(bulk_arguments)))
                arguments["requests"] = requests
            elif arg_name == "session":
                arguments['session'] = sessions[arguments['session']]
            elif (name in ('command', 'run_admin_command')
                  and arg_name == 'command'):
                # Ensure the first key is the command name.
                ordered_command = SON([(operation['command_name'], 1)])
                ordered_command.update(arguments['command'])
                arguments['command'] = ordered_command
            elif name == 'open_download_stream' and arg_name == 'id':
                arguments['file_id'] = arguments.pop(arg_name)
            elif name != 'find' and c2s == 'max_time_ms':
                # find is the only method that accepts snake_case max_time_ms.
                # All other methods take kwargs which must use the server's
                # camelCase maxTimeMS. See PYTHON-1855.
                arguments['maxTimeMS'] = arguments.pop('max_time_ms')
            elif name == 'with_transaction' and arg_name == 'callback':
                callback_ops = arguments[arg_name]['operations']
                arguments['callback'] = lambda _: self.run_operations(
                    sessions,
                    original_collection,
                    copy.deepcopy(callback_ops),
                    in_with_transaction=True)
            elif name == 'drop_collection' and arg_name == 'collection':
                arguments['name_or_collection'] = arguments.pop(arg_name)
            elif name == 'create_collection' and arg_name == 'collection':
                arguments['name'] = arguments.pop(arg_name)
            elif name == 'create_index' and arg_name == 'keys':
                arguments['keys'] = list(arguments.pop(arg_name).items())
            elif name == 'drop_index' and arg_name == 'name':
                arguments['index_or_name'] = arguments.pop(arg_name)
            else:
                arguments[c2s] = arguments.pop(arg_name)

        if name == 'run_on_thread':
            args = {'sessions': sessions, 'collection': collection}
            args.update(arguments)
            arguments = args
        result = cmd(**dict(arguments))

        if name == "aggregate":
            if arguments["pipeline"] and "$out" in arguments["pipeline"][-1]:
                # Read from the primary to ensure causal consistency.
                out = collection.database.get_collection(
                    arguments["pipeline"][-1]["$out"],
                    read_preference=ReadPreference.PRIMARY)
                return out.find()
        if name == "map_reduce":
            if isinstance(result, dict) and 'results' in result:
                return result['results']
        if 'download' in name:
            result = Binary(result.read())

        if isinstance(result, Cursor) or isinstance(result, CommandCursor):
            return list(result)

        return result
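A minimal stand-in (an assumption, not the test suite's own helper) for the camel_to_snake conversion the runner depends on above.

import re


def camel_to_snake(name):
    # insert an underscore before each capital letter, then lowercase
    return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()


assert camel_to_snake('bulkWrite') == 'bulk_write'
assert camel_to_snake('findOneAndUpdate') == 'find_one_and_update'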