Exemplo n.º 1
0
    async def create_mongodb_doc(self, doc, mime_type, extension,
                                 mongodb_doc_id, mongodb):
        """Build a MongoDB document from an HTML page and insert it.

        The URL is taken from the ``href`` attribute of the first
        ``singularity`` tag found in ``doc``. When it cannot be read the
        reason is printed and nothing is inserted.

        Args:
            doc: HTML markup as ``str``; also stored UTF-8 encoded.
            mime_type: Value stored under ``mime_type``.
            extension: Value stored under ``file_extension``.
            mongodb_doc_id: Value used as the document ``_id``.
            mongodb: Passed through to ``self.insert_mongodb_doc``.

        Returns:
            None.
        """
        soup = BeautifulSoup(doc, 'html.parser', parse_only=self.only_html)
        try:
            url = soup.singularity['href']
        except (TypeError, KeyError) as e:
            # TypeError: no such tag, so the lookup yielded None.
            # KeyError: the tag exists but carries no href attribute.
            print(e)
            return

        data = {
            '_id': mongodb_doc_id,
            # TODO file_type to select dynamically
            'mime_type': mime_type,
            'url': url,
            'file_extension': extension,
            'file_data': Binary(bytes(doc, "utf-8")),
            # NOTE(review): stored as the string "False", not a boolean;
            # kept as-is because readers of this field are not visible here.
            'parsed': "False",
        }

        await self.insert_mongodb_doc("https3", data, mongodb)
Exemplo n.º 2
0
 def run(self):
     """Consume URLs from the ``m_sohu_task`` Redis list forever.

     Each URL not yet present in the ``visited_urls`` set is fetched with
     ``self.spider.fetch``; non-empty pages are stored (pickled and
     zlib-compressed) in ``sohu_data_coll`` keyed by a hash of the URL,
     then handed to ``self.spider.parse``. The spider status is toggled
     between WORKING and IDLE around each URL. Never returns.
     """
     while True:
         # NOTE(review): polls without any delay while the list is empty.
         current_url = redis_client.lpop('m_sohu_task')
         while not current_url:
             current_url = redis_client.lpop('m_sohu_task')
         self.spider.status = SpiderStatus.WORKING
         current_url = current_url.decode('utf-8')
         # SADD returns the number of members actually added, so one call
         # both tests and marks the URL. The previous SISMEMBER-then-SADD
         # pair let two workers claim the same URL.
         if redis_client.sadd('visited_urls', current_url):
             html_page = self.spider.fetch(current_url)
             if html_page not in [None, '']:
                 hasher = hasher_proto.copy()
                 hasher.update(current_url.encode('utf-8'))
                 doc_id = hasher.hexdigest()
                 if not sohu_data_coll.find_one({'_id': doc_id}):
                     sohu_data_coll.insert_one({
                         '_id': doc_id,
                         'url': current_url,
                         'page':
                         Binary(zlib.compress(pickle.dumps(html_page))),
                     })
                 self.spider.parse(html_page)
         self.spider.status = SpiderStatus.IDLE
Exemplo n.º 3
0
def store_files(userid):
    """Insert the mock-recall fixture files into ``mongo.db.documents``.

    Each file is stored as ``{"userid", "docid", "data"}`` where ``docid``
    is the file's position in the list and ``data`` is its raw bytes.

    Args:
        userid: Value stored under ``userid`` on every inserted document.

    Raises:
        OSError: If the fixture directory or one of the files is missing.
    """
    file_paths = [
        "../server/files/Mock/1/APPENDIX A- Recall Team_MR.xlsx",
        "../server/files/Mock/1/APPENDIX B-Agency-Press-Supplier-Customer Contact List_MR.xlsx",
        "../server/files/Mock/1/APPENDIX D-General Communication Log_MR.xlsx",
        "../server/files/Mock/1/APPENDIX H-Ingredients Receipts Record_MR.xlsx",
        "../server/files/Mock/1/APPENDIX I-Production Batch Sheet_MR.xlsx",
        "../server/files/Mock/1/APPENDIX K-Product Distribution record_MR.xlsx",
        "../server/files/Mock/1/APPENDIX N-Product Reconciliation_MR.xlsx",
        "../server/files/Mock/1/Mock Recall/APPENDIX G1-Mock Recall Record_MR.xlsx",
        "../server/files/Mock/1/Mock Recall/APPENDIX G2-Mock Recall Log_MR.xlsx",
        "../server/files/Mock/1/Mock Recall/APPENDIX O3-Recall Notification via Phone_MR.docx"
    ]

    oldcwd = os.getcwd()
    # NOTE(review): machine-specific absolute path; the relative paths
    # above are resolved against it.
    os.chdir("/Users/WillMichael/Documents/git/FlaskFarmer/project/tests")
    try:
        for idx, fp in enumerate(file_paths):
            # xlsx/docx are binary formats: read bytes, and close the handle.
            with open(fp, 'rb') as fo:
                bin_file = fo.read()

            data = {"userid": userid, "docid": idx, "data": Binary(bin_file)}

            mongo.db.documents.insert_one(data)
    finally:
        # Restore the working directory even when a read or insert fails.
        os.chdir(oldcwd)
Exemplo n.º 4
0
 def test_del_item_should_delete_pair_in_the_collection(self):
     """Deleting a key removes only its document; unknown keys raise."""
     seed = (('testing', '123'), ('bla bla bla', '3.14'))
     for doc_id, raw_value in seed:
         self.collection.insert({
             '_id': doc_id,
             'value': Binary(encode(raw_value))
         })

     mapping = MongoDict(**self.config)
     del mapping['testing']

     # Only the second seeded document should be left.
     survivor = list(self.collection.find())[0]
     self.assertEqual(survivor['_id'], 'bla bla bla')
     self.assertEqual(decode(survivor['value']), '3.14')

     with self.assertRaises(KeyError):
         del mapping['non ecxiste']
Exemplo n.º 5
0
def _convert_data_for_mongo(data):
    """Recursively convert ``data`` into values suitable for MongoDB.

    Written for Python 2 (``dict.iteritems``, byte ``str.decode``).

    * dict: keys are stringified with ``.`` replaced by ``_``; values are
      converted recursively.
    * ``np.ndarray``: ``tolist()``.
    * ``np.float32`` / ``Decimal``: ``float``.
    * list: each element converted recursively.
    * ``datetime`` / ``date``: passed to ``localize_datetime`` /
      ``localize_date``.
    * ``str``: decoded as UTF-8; undecodable data is wrapped in ``Binary``.
    * anything else: returned unchanged.
    """
    if isinstance(data, dict):
        return {
            str(key).replace('.', '_'): _convert_data_for_mongo(value)
            for key, value in data.iteritems()
        }
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, (np.float32, Decimal)):
        return float(data)
    if isinstance(data, list):
        return [_convert_data_for_mongo(x) for x in data]
    # datetime must be tested before date: datetime is a date subclass.
    if isinstance(data, dt.datetime):
        return localize_datetime(data)
    if isinstance(data, dt.date):
        return localize_date(data)
    if isinstance(data, str):
        try:
            return data.decode('utf8')
        except UnicodeDecodeError as e:
            # Log the exception itself: ``e.message`` is an empty string
            # for multi-argument exceptions such as UnicodeDecodeError.
            logging.warning(e)
            return Binary(data, 0)
    return data
Exemplo n.º 6
0
    def test_bson_classes(self):
        """Matcher treats PyMongo BSON objects and MockupDB ones as equal."""
        oid = '5a918f9fa08bff9c7688d3e1'

        # (PyMongo object, equivalent MockupDB object)
        pairs = [
            (Binary(b'foo'), mockup_bson.Binary(b'foo')),
            (Code('foo'), mockup_bson.Code('foo')),
            (Code('foo', {'x': 1}), mockup_bson.Code('foo', {'x': 1})),
            (DBRef('coll', 1), mockup_bson.DBRef('coll', 1)),
            (DBRef('coll', 1, 'db'), mockup_bson.DBRef('coll', 1, 'db')),
            (Decimal128('1'), mockup_bson.Decimal128('1')),
            (MaxKey(), mockup_bson.MaxKey()),
            (MinKey(), mockup_bson.MinKey()),
            (ObjectId(oid), mockup_bson.ObjectId(oid)),
            (Regex('foo', 'i'), mockup_bson.Regex('foo', 'i')),
            (Timestamp(1, 2), mockup_bson.Timestamp(1, 2)),
        ]

        for pymongo_obj, mockup_obj in pairs:
            self_msg = "MockupDB %r doesn't equal itself" % (mockup_obj, )
            cross_msg = "PyMongo %r != MockupDB %r" % (pymongo_obj,
                                                       mockup_obj)

            # Basic case.
            matches = Matcher(Command(y=mockup_obj)).matches(
                Command(y=mockup_obj))
            self.assertTrue(matches, self_msg)

            # First Command argument is special, try comparing the second also.
            matches = Matcher(Command('x', y=mockup_obj)).matches(
                Command('x', y=mockup_obj))
            self.assertTrue(matches, self_msg)

            # In practice, users pass PyMongo classes in message specs.
            matches = Matcher(Command(y=mockup_obj)).matches(
                Command(y=pymongo_obj))
            self.assertTrue(matches, cross_msg)

            matches = Matcher(Command('x', y=mockup_obj)).matches(
                Command('x', y=pymongo_obj))
            self.assertTrue(matches, cross_msg)
Exemplo n.º 7
0
 def run(self):
     """Worker loop: consume URLs from the ``m_sohu_task`` Redis list.

     Creates its own Redis and MongoDB clients and publishes them on
     ``thread_local``. Each URL not yet present in the ``visited_urls``
     set is fetched with ``self.spider.fetch``; non-empty pages are stored
     (pickled and zlib-compressed) in ``msohu.webpages`` keyed by a hash
     of the URL, then handed to ``self.spider.parse``. Never returns.
     """
     # NOTE(review): host and password are hard-coded; consider moving
     # them to configuration.
     redis_client = redis.Redis(host='1.2.3.4', port=6379, password='******')
     mongo_client = pymongo.MongoClient(host='1.2.3.4', port=27017)
     thread_local.redis_client = redis_client
     thread_local.mongo_db = mongo_client.msohu
     # Loop-invariant: resolve the collection once.
     sohu_data_coll = mongo_client.msohu.webpages
     while True:
         # NOTE(review): polls without any delay while the list is empty.
         current_url = redis_client.lpop('m_sohu_task')
         while not current_url:
             current_url = redis_client.lpop('m_sohu_task')
         self.spider.status = SpiderStatus.WORKING
         current_url = current_url.decode('utf-8')
         # SADD returns the number of members actually added, so one call
         # both tests and marks the URL. The previous SISMEMBER-then-SADD
         # pair let two workers claim the same URL.
         if redis_client.sadd('visited_urls', current_url):
             html_page = self.spider.fetch(current_url)
             if html_page not in [None, '']:
                 hasher = hasher_proto.copy()
                 hasher.update(current_url.encode('utf-8'))
                 doc_id = hasher.hexdigest()
                 if not sohu_data_coll.find_one({'_id': doc_id}):
                     sohu_data_coll.insert_one({
                         '_id': doc_id,
                         'url': current_url,
                         'page': Binary(zlib.compress(pickle.dumps(html_page)))
                     })
                 self.spider.parse(html_page)
         self.spider.status = SpiderStatus.IDLE
Exemplo n.º 8
0
    def docify(self, df):
        """
        Convert a Pandas DataFrame to SON.

        Every column is converted with ``self._convert_types``, serialized
        to raw bytes and compressed; the compressed columns are
        concatenated into one blob stored under ``DATA``. ``METADATA``
        records the column names, dtype strings, compressed masks (only
        for columns that have one) and each column's inclusive
        ``(start, end)`` byte range within the blob.

        Parameters
        ----------
        df:  DataFrame
            The Pandas DataFrame to encode

        Returns
        -------
        SON
            Document with ``DATA`` and ``METADATA`` entries.

        Raises
        ------
        Exception
            Whatever the column conversion raised, re-raised after logging
            the offending column and its inferred dtype.
        """
        dtypes = {}
        masks = {}
        lengths = {}
        columns = []
        data = Binary(b'')
        start = 0

        arrays = []
        for c in df:
            try:
                columns.append(str(c))
                arr, mask = self._convert_types(df[c].values)
                dtypes[str(c)] = arr.dtype.str
                # tobytes() replaces the deprecated ndarray.tostring().
                if mask is not None:
                    masks[str(c)] = Binary(compress(mask.tobytes()))
                arrays.append(arr.tobytes())
            except Exception:
                typ = infer_dtype(df[c], skipna=False)
                msg = "Column '{}' type is {}".format(str(c), typ)
                logging.warning(msg)
                # Bare raise keeps the original traceback intact.
                raise

        arrays = compress_array(arrays)
        for index, c in enumerate(df):
            d = Binary(arrays[index])
            # Inclusive byte range of this column inside the blob.
            lengths[str(c)] = (start, start + len(d) - 1)
            start += len(d)
            data += d

        doc = SON({DATA: data, METADATA: {}})
        doc[METADATA] = {
            COLUMNS: columns,
            MASK: masks,
            LENGTHS: lengths,
            DTYPE: dtypes
        }

        return doc
Exemplo n.º 9
0
    def run_scenario(self):
        """Run every test in ``scenario_def`` against GridFS.

        For each test: seed the database, run the named operation with
        its arguments converted to snake_case, build the expected
        collections, then check the expected error, the expected result
        and the expected contents of ``fs.chunks`` / ``fs.files``.
        """
        # Run tests.
        self.assertTrue(scenario_def['tests'], "tests cannot be empty")
        for test in scenario_def['tests']:
            self.init_db(scenario_def['data'], test)

            # Run GridFs Operation.
            operation = self.str_to_cmd[test['act']['operation']]
            args = test['act']['arguments']
            extra_opts = args.pop("options", {})
            if "contentType" in extra_opts:
                extra_opts["metadata"] = {
                    "contentType": extra_opts.pop("contentType")}

            args.update(extra_opts)

            converted_args = {camel_to_snake(c): v for c, v in args.items()}

            error = None
            # Reset per test: when the operation raises, ``result`` would
            # otherwise be unbound (first test) or stale (later tests).
            result = None
            try:
                result = operation(**converted_args)

                if 'download' in test['act']['operation']:
                    result = Binary(result.read())
            except Exception as exc:
                error = exc

            self.init_expected_db(test, result)

            # Asserts.
            errors = {"FileNotFound": NoFile,
                      "ChunkIsMissing": CorruptGridFile,
                      "ExtraChunk": CorruptGridFile,
                      "ChunkIsWrongSize": CorruptGridFile,
                      "RevisionNotFound": NoFile}

            if test['assert'].get("error", False):
                self.assertIsNotNone(error)
                self.assertIsInstance(error,
                                      errors[test['assert']['error']])
            else:
                self.assertIsNone(error)

            if 'result' in test['assert']:
                if test['assert']['result'] == 'void':
                    test['assert']['result'] = None
                self.assertEqual(result, test['assert'].get('result'))

            if 'data' in test['assert']:
                # Create alphabetized list
                self.assertEqual(
                    set(self.sorted_list(self.db.fs.chunks, True)),
                    set(self.sorted_list(self.db.expected.chunks, True)))

                self.assertEqual(
                    set(self.sorted_list(self.db.fs.files, False)),
                    set(self.sorted_list(self.db.expected.files, False)))
Exemplo n.º 10
0
 def put(self, task, priority, schedule_time=None):
     """Store a pickled task with its priority in the collection.

     Raises SpiderMisuseError when ``schedule_time`` is given, since this
     queue does not support delayed tasks.
     """
     if schedule_time is not None:
         raise SpiderMisuseError('Mongo task queue does not support delayed task')
     self.collection.save({
         'task': Binary(pickle.dumps(task)),
         'priority': priority,
     })
Exemplo n.º 11
0
def str2binary(data):
    """Convert a string to BSON binary.

    Unicode input is UTF-8 encoded first.

    :param data: the string to convert
    :return: a ``Binary`` wrapping the string's bytes
    """
    raw = data.encode('utf-8') if isinstance(data, unicode) else data
    buffered = StringIO(raw)
    return Binary(buffered.getvalue())
Exemplo n.º 12
0
 def create_plant_from_form(self, form):
     """Build a Plant from the fields of a plant form.

     ``names`` and ``cultivars`` are split on ``', '`` (empty tuple when
     blank); the image, when present, is read and wrapped in ``Binary``.
     """
     binomial = form.binomial.data

     names = ()
     if form.names.data:
         names = form.names.data.split(', ')

     cultivars = ()
     if form.cultivars.data:
         cultivars = form.cultivars.data.split(', ')

     image = None
     if form.image.data:
         image = Binary(form.image.data.read())

     return Plant(binomial=binomial, names=names, cultivars=cultivars,
                  image=image)
Exemplo n.º 13
0
 def represent(self, obj, fieldtype):
     """Convert ``obj`` into the value stored in MongoDB for ``fieldtype``.

     ObjectIds are taken as-is; everything else first goes through
     ``NoSQLAdapter.represent``. Dates and times become datetimes, blobs
     become ``bson.Binary``, and reference-like types become ObjectIds.
     """
     # The base adapter does not support MongoDB ObjectId.
     value = (obj if isinstance(obj, self.ObjectId)
              else NoSQLAdapter.represent(self, obj, fieldtype))

     if fieldtype == 'date':
         if value is None:
             return value
         # MongoDB has no date-only type: pair the date with midnight.
         midnight = datetime.time(0, 0, 0)
         return datetime.datetime.combine(value, midnight)

     if fieldtype == 'time':
         if value is None:
             return value
         # MongoDB has no time-only type: pair the time with a fixed date.
         anchor_day = datetime.date(2000, 1, 1)
         return datetime.datetime.combine(anchor_day, value)

     if fieldtype == "blob":
         if value is None:
             return value
         from bson import Binary
         if isinstance(value, Binary):
             return value
         if isinstance(value, basestring):
             return Binary(value)
         return Binary(str(value))

     named_type = isinstance(fieldtype, basestring)

     if named_type and fieldtype.startswith('list:'):
         if fieldtype.startswith('list:reference'):
             # Each referenced id must be converted to an ObjectId.
             return [self.object_id(item) for item in value]
         return value

     if ((named_type and fieldtype.startswith("reference"))
             or isinstance(fieldtype, Table) or fieldtype == "id"):
         # Reference types must be converted to ObjectId.
         return self.object_id(value)

     return value
Exemplo n.º 14
0
def checksum(symbol, doc):
    """
    Return the SHA-1 of ``symbol`` followed by the stringified values of
    ``doc``, taken in reverse-sorted key order, wrapped in ``Binary``.
    """
    digest = hashlib.sha1(symbol)
    ordered_keys = sorted(doc.iterkeys(), reverse=True)
    for key in ordered_keys:
        digest.update(str(doc[key]))
    return Binary(digest.digest())
Exemplo n.º 15
0
def save_image(image_name_resized, image_bytes):
    """Insert a resized image into ``db.images``.

    Returns the result of ``insert_one``.
    """
    logger.info('Saving image %s to DB', image_name_resized)

    record = {
        'image_resized_name': image_name_resized,
        'image_resized_file': Binary(image_bytes),
    }
    return db.images.insert_one(record)
Exemplo n.º 16
0
    def test_jsonify_Binary(self):
        """jsonify() output for a Binary equals flask's for the $binary dict."""
        bson_payload = {'a': 1, 'bin': Binary(b"hello")}
        plain_payload = {
            'a': 1,
            'bin': {'$binary': "aGVsbG8=", "$type": "00"},
        }

        from_bson = jsonify(bson_payload).response
        from_plain = flask_jsonify(plain_payload).response

        assert from_bson == from_plain
def fileUpload(db, ssl_file, ssl_type, host):
    """Store an SSL key or certificate in the ``ssl`` files/chunks collections.

    The file is named ``<host>.key`` when ``ssl_type`` is ``"key"`` and
    ``<host>.crt`` otherwise. An existing file with that name has its
    metadata and chunk data overwritten; otherwise a new file document and
    a single chunk are inserted.

    Args:
        db: Database handle exposing ``ssl.files`` and ``ssl.chunks``.
        ssl_file: Raw file contents (hashed with MD5 and stored as binary).
        ssl_type: ``"key"`` for a private key; anything else is treated as
            a certificate.
        host: Host name used as the file's base name.

    Returns:
        The ``_id`` of the updated or inserted file document.
    """
    if ssl_type == "key":
        filename = "{}.{}".format(host, "key")
        path = "/opt/waf/conf/ssl/keys"
    else:
        filename = "{}.{}".format(host, "crt")
        path = "/opt/waf/conf/ssl/certificates"

    # Computed once; both branches store the same values.
    md5_hex = hashlib.md5(ssl_file).hexdigest()
    length = len(ssl_file)

    found = db.ssl.files.find_one({"filename": filename})
    if found and "_id" in found:
        # update_one matches the insert_one API used below; the legacy
        # Collection.update is deprecated.
        db.ssl.files.update_one({"filename": filename}, {
            "$set": {
                "md5": md5_hex,
                "length": length
            }
        })
        db.ssl.chunks.update_one({"files_id": ObjectId(found["_id"])},
                                 {"$set": {
                                     "data": Binary(ssl_file, 0)
                                 }})
        file_id = found["_id"]
        print(colorize("green", "[+] File {} updated".format(filename)))
    else:
        file_id = db.ssl.files.insert_one({
            "filename": filename,
            "chunkSize": 261120,
            "length": length,
            "md5": md5_hex,
            "metadata": {
                "path": path,
                "is_dir": False
            }
        }).inserted_id
        # NOTE(review): always written as a single chunk (n=0), whatever
        # the file size relative to chunkSize.
        db.ssl.chunks.insert_one({
            "n": 0,
            "data": Binary(ssl_file, 0),
            "files_id": file_id
        })
        print(colorize("green", "[+] File {} added to DB".format(filename)))
    return file_id
Exemplo n.º 18
0
    def to_mongo(self, value):
        """Serialize a private key to ``Binary`` as encrypted PKCS8 DER."""
        # NOTE(review): the passphrase is a literal in source; confirm
        # whether it should come from configuration instead.
        encryption = serialization.BestAvailableEncryption(b"passphrase")
        der_bytes = value.private_bytes(
            encoding=serialization.Encoding.DER,
            format=serialization.PrivateFormat.PKCS8,
            encryption_algorithm=encryption,
        )
        return Binary(der_bytes)
Exemplo n.º 19
0
    def put(self, task, priority, schedule_time=None):
        """Store a pickled task with its priority and schedule time.

        ``schedule_time`` defaults to the current UTC time.
        """
        when = datetime.utcnow() if schedule_time is None else schedule_time
        self.collection.save({
            'task': Binary(pickle.dumps(task)),
            'priority': priority,
            'schedule_time': when,
        })
Exemplo n.º 20
0
    def create_user(self, db_connection: IConnection, username: str,
                    data: BytesIO, client_id: str):
        """Insert a user record holding the image read from ``data``.

        Returns whatever ``db_connection.insert`` returns.
        """
        record = {
            'username': username,
            'image': Binary(data.read()),
            'client_id': client_id
        }
        return db_connection.insert(record)
Exemplo n.º 21
0
 def checksum(self, from_idx, to_idx):
     """Return the checksum of the compressed chunks, computing it once.

     The value is cached on ``self._checksum``; once set, later calls
     return it without looking at ``from_idx`` / ``to_idx``.
     """
     if self._checksum is not None:
         return self._checksum

     self._lazy_init()
     running_sha = None
     chunks = self.generator_bytes(from_idx=from_idx, to_idx=to_idx)
     for raw_chunk, _dtype in chunks:
         # TODO: what about compress_array here in batches?
         packed = compress(raw_chunk)
         running_sha = incremental_checksum(packed, curr_sha=running_sha, is_bytes=True)
     self._checksum = Binary(running_sha.digest())
     return self._checksum
Exemplo n.º 22
0
 def str2hex(jsn):
     """Replace ``{"$hex": ...}`` placeholders with ``Binary``, in place.

     A ``data``, ``source`` or ``result`` entry whose value is a dict
     containing ``"$hex"`` is replaced by the decoded bytes wrapped in
     ``Binary``. Nested dicts, and dicts inside lists, are processed
     recursively. Any other value is left untouched.

     Args:
         jsn: The dict to convert; it is mutated.

     Returns:
         None.
     """
     for key, val in jsn.items():
         if (key in ("data", "source", "result")
                 and isinstance(val, dict) and "$hex" in val):
             # Assigning to an existing key is safe while iterating.
             jsn[key] = Binary(bytes_from_hex(val['$hex']))
             continue
         if isinstance(val, dict):
             str2hex(val)
         elif isinstance(val, list):
             for item in val:
                 # Scalars inside lists have nothing to convert.
                 if isinstance(item, dict):
                     str2hex(item)
    def test_reading_an_existing_key_should_read_saved_information(self):
        """A value pickled into the collection is readable via the document."""
        pickled = pickle.dumps('value', protocol=pickle.HIGHEST_PROTOCOL)
        stored_id = 'id:{}:key'.format(self.fake_id)

        self.db.main.insert({'_id': stored_id, 'v': Binary(pickled)})

        self.assertEqual(self.document['key'], 'value')
Exemplo n.º 24
0
    def test_match_querying_with_binary(self):
        """Querying a BinaryField with a ``Binary`` finds the saved document."""
        class MyDocument(Document):
            bin_field = BinaryField()

        MyDocument.drop_collection()

        saved = MyDocument(bin_field=BIN_VALUE).save()

        query = MyDocument.objects(bin_field=Binary(BIN_VALUE))
        found = query.first()
        self.assertEqual(found.id, saved.id)
Exemplo n.º 25
0
    def save_model_to_mongo(self, model: Any, trained_from: date = None, trained_upto: date = None):
        """Serialize ``model``, store it in GridFS and record a PythonModel.

        Keras models are written to a temporary ``.hdf5`` file by
        ``save_model`` and read back; other models are serialized with
        ``dumps``. The GridFS file id is saved on a new ``PythonModel``
        together with the model name, symbol and training date range.
        """
        fs = connect_grid()

        if self.keras:
            with NamedTemporaryFile(suffix='.hdf5', delete=True) as tmp:
                save_model(model, tmp.name, overwrite=True)
                payload = Binary(tmp.read())
        else:
            payload = Binary(dumps(model))

        with BytesIO(payload) as stream:
            file_id = fs.put(stream, filename=self.model_name, chunk_size=2097152)

        record = PythonModel(
            grid_fileid=file_id,
            model_name=self.model_name,
            symbol=self.symbol,
            trained_from=trained_from,
            trained_upto=trained_upto,
        )
        record.save()
Exemplo n.º 26
0
def convert_to_document(question):
    """Return a dict for ``question`` with its feature vector pickled.

    The feature vector is pickled with protocol 2 and wrapped in
    ``Binary``; the other fields are copied from the question as-is.
    """
    pickled_vector = pickle.dumps(question.feature_vector, protocol=2)
    return {
        'text': question.text,
        'answer': question.answer,
        'feature_vector': Binary(pickled_vector),
        'category': question.category,
        'keywords': question.keywords,
        'morphs': question.morphs
    }
Exemplo n.º 27
0
    def update_user_image(self, db_connection: IConnection, username: str,
                          data: BytesIO, client_id: str):
        """Replace the stored image of the user matching name and client id.

        Returns whatever ``db_connection.update`` returns.
        """
        image = Binary(data.read())
        selector = {'username': username, 'client_id': client_id}
        change = {'$set': {'image': image}}
        return db_connection.update(selector, change)
Exemplo n.º 28
0
    def create_version_dict(self, path=None, version=None):
        """Create and return a version dict.

        If a project path is given, the project folder is zipped
        and compared to the latest existing zip archive. If it
        differs, the version number is incremented. The new archive is
        stored in the dict under ``zipfile``.

        Args:
            path: Project folder to archive; when None no archive is made.
            version: Version number to start from; defaults to
                ``self.get_latest_version_number()``.

        Returns:
            The version dict (an empty dict when none exists and no path
            is given).
        """
        # Get the latest version number or 1 if it doesn't exist
        if version is None:
            version = self.get_latest_version_number()
        version_dict = self.get_version(version)
        # Create an empty dict if the manifest doesn't have one
        if version_dict == 0 or version_dict is None:
            version_dict = {}
        # If a path is given, zip it and compare to the existing hash
        if path is not None:
            now = datetime.today().strftime('%Y%m%d%H%M%S')
            version_dict['version_name'] = now + '_v' + str(
                version) + self.reduced_manifest['name']
            version_dict['version_number'] = version

            # Zip the project path into the temp directory
            if not os.path.exists(self.temp_dir):
                os.makedirs(self.temp_dir)
            new_zipfile_path = self.temp_dir + '/' + version_dict[
                'version_name'] + '.zip'
            new_zipfile = zipfile.ZipFile(new_zipfile_path, 'w',
                                          zipfile.ZIP_DEFLATED)
            try:
                rootlen = len(path) + 1
                for base, _, files in os.walk(path):
                    # Store each file under its path relative to ``path``
                    for filename in files:
                        fn = os.path.join(base, filename)
                        new_zipfile.write(fn, fn[rootlen:])
                # Compare with the previously stored archive, if any
                if 'zipfile' in version_dict and version_dict[
                        'zipfile'] is not None:
                    result = self.compare_files(version_dict['zipfile'],
                                                new_zipfile)
                    # If the zipfiles are not the same iterate the version
                    if result == False:  # noqa: E712 - original semantics
                        version = version + 1
                        version_dict['version_number'] = version
                # The archive is complete only after close() writes the
                # central directory, so close before reading it back.
                new_zipfile.close()
                with open(new_zipfile_path, 'rb') as f:
                    version_dict['zipfile'] = Binary(f.read())
            finally:
                # close() on an already closed archive is a no-op; the
                # temporary file is removed even when something failed.
                new_zipfile.close()
                os.remove(new_zipfile_path)
        return version_dict
Exemplo n.º 29
0
def checksum(symbol, doc):
    """
    Checksum the passed in dictionary

    Returns the SHA-1 of ``symbol`` followed by every value of ``doc``
    taken in reverse-sorted key order, wrapped in ``Binary``. Bytes
    values are hashed as-is; anything else is hashed as the UTF-8
    encoding of its ``str()``. For ASCII-only input this is
    byte-identical to the former ASCII encoding, which raised
    UnicodeEncodeError on anything else.
    """
    sha = hashlib.sha1()
    sha.update(symbol.encode('utf-8'))
    for k in sorted(doc, reverse=True):
        v = doc[k]
        if isinstance(v, six.binary_type):
            sha.update(v)
        else:
            sha.update(str(v).encode('utf-8'))
    return Binary(sha.digest())
Exemplo n.º 30
0
 def GET(self, name):
     """Return the tile image for ``name``, fetching it on a cache miss.

     A tile already present in ``tiles`` is returned from there.
     Otherwise it is requested from ``server + name``; a successful
     response is stored in ``tiles`` before its body is returned. A
     failed response is returned to the caller but not cached.

     Args:
         name: Tile path with at least three ``/``-separated parts, read
             as zoom, x and y.
     """
     data = tiles.find_one({'name': name})
     if data is not None:
         return data['image']

     print('fetch ' + name)
     tmp = name.split('/')
     # NOTE(review): [:-3] presumably strips a file suffix from the last
     # path part; verify against the actual tile names.
     doc = {'name': name, 'zoom': tmp[0], 'x': tmp[1], 'y': tmp[2][:-3]}
     req = requests.get(server + name)
     image_data = req.content
     # Only cache successful responses, so an upstream error page is
     # never stored and served later as a tile.
     if req.ok:
         doc['image'] = Binary(image_data)
         tiles.insert(doc)
     return image_data