def get_insert_product_with_offer(offer) -> InsertOne:
    # Build an InsertOne operation for a new product document seeded from the offer.
    return InsertOne({
        **get_new_product_from_offer(offer),
        "offers": [offer["_id"]]
    })
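# A minimal usage sketch (the `db` handle, the "products" collection name, and the
# `offers` iterable are assumptions for illustration, not part of the original
# snippet): collect one InsertOne per offer and submit them in a single bulk_write.
def insert_products_for_offers(db, offers):
    operations = [get_insert_product_with_offer(offer) for offer in offers]
    if operations:
        # ordered=False lets the remaining inserts proceed if one fails.
        return db.products.bulk_write(operations, ordered=False)
    return None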
def test_cluster_time(self):
    listener = SessionTestListener()
    # Prevent heartbeats from updating $clusterTime between operations.
    client = rs_or_single_client(event_listeners=[listener],
                                 heartbeatFrequencyMS=999999)
    collection = client.pymongo_test.collection
    # Prepare for tests of find() and aggregate().
    collection.insert_many([{} for _ in range(10)])
    self.addCleanup(collection.drop)
    self.addCleanup(client.pymongo_test.collection2.drop)

    def bulk_insert(ordered):
        if ordered:
            bulk = collection.initialize_ordered_bulk_op()
        else:
            bulk = collection.initialize_unordered_bulk_op()
        bulk.insert({})
        bulk.execute()

    def rename_and_drop():
        # Ensure collection exists.
        collection.insert_one({})
        collection.rename('collection2')
        client.pymongo_test.collection2.drop()

    def insert_and_find():
        cursor = collection.find().batch_size(1)
        for _ in range(10):
            # Advance the cluster time.
            collection.insert_one({})
            next(cursor)
        cursor.close()

    def insert_and_aggregate():
        cursor = collection.aggregate([], batchSize=1).batch_size(1)
        for _ in range(5):
            # Advance the cluster time.
            collection.insert_one({})
            next(cursor)
        cursor.close()

    ops = [
        # Tests from Driver Sessions Spec.
        ('ping', lambda: client.admin.command('ping')),
        ('aggregate', lambda: list(collection.aggregate([]))),
        ('find', lambda: list(collection.find())),
        ('insert_one', lambda: collection.insert_one({})),

        # Additional PyMongo tests.
        ('insert_and_find', insert_and_find),
        ('insert_and_aggregate', insert_and_aggregate),
        ('update_one',
         lambda: collection.update_one({}, {'$set': {'x': 1}})),
        ('update_many',
         lambda: collection.update_many({}, {'$set': {'x': 1}})),
        ('delete_one', lambda: collection.delete_one({})),
        ('delete_many', lambda: collection.delete_many({})),
        ('bulk_write', lambda: collection.bulk_write([InsertOne({})])),
        ('ordered bulk', lambda: bulk_insert(True)),
        ('unordered bulk', lambda: bulk_insert(False)),
        ('rename_and_drop', rename_and_drop),
    ]

    for name, f in ops:
        listener.results.clear()
        # Call f() twice, insert to advance clusterTime, call f() again.
        f()
        f()
        collection.insert_one({})
        f()

        self.assertGreaterEqual(len(listener.results['started']), 1)

        for i, event in enumerate(listener.results['started']):
            self.assertTrue(
                '$clusterTime' in event.command,
                "%s sent no $clusterTime with %s" % (
                    f.__name__, event.command_name))

            if i > 0:
                succeeded = listener.results['succeeded'][i - 1]
                self.assertTrue(
                    '$clusterTime' in succeeded.reply,
                    "%s received no $clusterTime with %s" % (
                        f.__name__, succeeded.command_name))

                self.assertTrue(
                    event.command['$clusterTime']['clusterTime'] >=
                    succeeded.reply['$clusterTime']['clusterTime'],
                    "%s sent wrong $clusterTime with %s" % (
                        f.__name__, event.command_name))
def article(headers2):
    a = 0
    list = guanjianci.key_list  # note: shadows the builtin ``list``
    a = a + 1
    for lis in list:
        try:
            from random import choice
            par = dict(params)
            par['q'] = lis
            arr = [50]
            arrs = choice(arr)
            time.sleep(int(arrs))
            response = ss.get('https://www.facebook.com/search/posts',
                              params=par,
                              headers=headers2)
            print(response.status_code)
            # if response.status_code != 200:
            #     headers2 = headers1
            #     pass
            content = response.content.decode('utf-8')
            id = re.compile('"id":"vm-(.*?):').findall(str(content))
            url = re.compile('"permalink":"(.*?)"').findall(str(content))
            if url == []:
                print('No results, going to sleep')  # was: 进入休眠
                time.sleep(7200)
                break
            for ur in url:
                try:
                    urls = str(ur).replace('\\', '')
                    # urls = 'https://www.facebook.com/groups/2337886349768125/posts/2883216531901768'
                    arrs = choice(arr)
                    time.sleep(int(arrs))
                    res = ss.get(urls, headers=headers2)
                    article = res.content.decode('utf-8')
                    articles = re.compile('"wwwURL":"(.*?)"').findall(str(article))
                    times = re.compile('"creation_time":(.*?),').findall(str(article))
                    likeCount = re.compile(
                        '"reaction_count":{"count":(.*?),"').findall(str(article))
                    title = re.compile('"message":{"text":"(.*?)"},"').findall(str(article))
                    for urs, ti, like, til in zip(articles, times, likeCount, title):
                        try:
                            ac = ''
                            # Strip escaped surrogate pairs before unicode-escape decoding.
                            ab = re.compile('(\\\\ud...)').findall(str(til))
                            for te in ab:
                                til = til.replace(te, '')
                            if til[-1] == '\\':
                                til = til[:-1]
                            tils = til.encode('utf-8', 'replace').decode('unicode-escape')
                            urss = str(urs).replace('\\', '')
                            timeArray = time.localtime(int(ti))
                            pubTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
                            arcontent = tils
                            site = "Facebook"
                            siteId = 1049117
                            pushState = 0
                            downloadTime = datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S')
                            data = []
                            data.append(
                                InsertOne({
                                    "url": urss,
                                    "title": tils,
                                    "pub_time": pubTime,
                                    "content": arcontent,
                                    "download_time": downloadTime,
                                    "site": site,
                                    "site_id": siteId,
                                    "aid": urss,
                                    'push_state': pushState,
                                    'like_num': int(like),
                                }))
                            try:
                                collection.bulk_write(data)
                                print('Insert finished')                # was: 添加完成
                                print('Download time ' + downloadTime)  # was: 下载时间
                                print('Publish time ' + pubTime)        # was: 发布时间
                            except Exception as err:
                                print("Duplicate document")             # was: 添加重复
                                print('Download time ' + downloadTime)
                                print('Publish time ' + pubTime)
                        except Exception as err:
                            import traceback
                            traceback.print_exc()
                            pass
                except Exception as err:
                    import traceback
                    traceback.print_exc()
                    pass
        except Exception as err:
            import traceback
            traceback.print_exc()
            pass
def register(self, instance): # Required environment variables anatomy_data = instance.data["anatomyData"] io.install() context = instance.context project_entity = instance.data["projectEntity"] context_asset_name = context.data["assetEntity"]["name"] asset_name = instance.data["asset"] asset_entity = instance.data.get("assetEntity") if not asset_entity or asset_entity["name"] != context_asset_name: asset_entity = io.find_one({ "type": "asset", "name": asset_name, "parent": project_entity["_id"] }) assert asset_entity, ( "No asset found by the name \"{0}\" in project \"{1}\"" ).format(asset_name, project_entity["name"]) instance.data["assetEntity"] = asset_entity # update anatomy data with asset specific keys # - name should already been set hierarchy = "" parents = asset_entity["data"]["parents"] if parents: hierarchy = "/".join(parents) anatomy_data["hierarchy"] = hierarchy task_name = instance.data.get("task") if task_name: anatomy_data["task"] = task_name stagingdir = instance.data.get("stagingDir") if not stagingdir: self.log.info( ("{0} is missing reference to staging directory." " Will try to get it from representation.").format(instance)) else: self.log.debug( "Establishing staging directory @ {0}".format(stagingdir)) # Ensure at least one file is set up for transfer in staging dir. repres = instance.data.get("representations") assert repres, "Instance has no files to transfer" assert isinstance( repres, (list, tuple)), ("Instance 'files' must be a list, got: {0} {1}".format( str(type(repres)), str(repres))) subset = self.get_subset(asset_entity, instance) instance.data["subsetEntity"] = subset version_number = instance.data["version"] self.log.debug("Next version: v{}".format(version_number)) version_data = self.create_version_data(context, instance) version_data_instance = instance.data.get('versionData') if version_data_instance: version_data.update(version_data_instance) # TODO rename method from `create_version` to # `prepare_version` or similar... 
version = self.create_version(subset=subset, version_number=version_number, data=version_data) self.log.debug("Creating version ...") new_repre_names_low = [_repre["name"].lower() for _repre in repres] existing_version = io.find_one({ 'type': 'version', 'parent': subset["_id"], 'name': version_number }) if existing_version is None: version_id = io.insert_one(version).inserted_id else: # Check if instance have set `append` mode which cause that # only replicated representations are set to archive append_repres = instance.data.get("append", False) # Update version data # TODO query by _id and io.update_many( { 'type': 'version', 'parent': subset["_id"], 'name': version_number }, {'$set': version}) version_id = existing_version['_id'] # Find representations of existing version and archive them current_repres = list( io.find({ "type": "representation", "parent": version_id })) bulk_writes = [] for repre in current_repres: if append_repres: # archive only duplicated representations if repre["name"].lower() not in new_repre_names_low: continue # Representation must change type, # `_id` must be stored to other key and replaced with new # - that is because new representations should have same ID repre_id = repre["_id"] bulk_writes.append(DeleteOne({"_id": repre_id})) repre["orig_id"] = repre_id repre["_id"] = io.ObjectId() repre["type"] = "archived_representation" bulk_writes.append(InsertOne(repre)) # bulk updates if bulk_writes: io._database[io.Session["AVALON_PROJECT"]].bulk_write( bulk_writes) version = io.find_one({"_id": version_id}) instance.data["versionEntity"] = version existing_repres = list( io.find({ "parent": version_id, "type": "archived_representation" })) instance.data['version'] = version['name'] intent_value = instance.context.data.get("intent") if intent_value and isinstance(intent_value, dict): intent_value = intent_value.get("value") if intent_value: anatomy_data["intent"] = intent_value anatomy = instance.context.data['anatomy'] # Find the representations to transfer amongst the files # Each should be a single representation (as such, a single extension) representations = [] destination_list = [] if 'transfers' not in instance.data: instance.data['transfers'] = [] template_name = self.template_name_from_instance(instance) published_representations = {} for idx, repre in enumerate(instance.data["representations"]): published_files = [] # create template data for Anatomy template_data = copy.deepcopy(anatomy_data) if intent_value is not None: template_data["intent"] = intent_value resolution_width = repre.get("resolutionWidth") resolution_height = repre.get("resolutionHeight") fps = instance.data.get("fps") if resolution_width: template_data["resolution_width"] = resolution_width if resolution_width: template_data["resolution_height"] = resolution_height if resolution_width: template_data["fps"] = fps files = repre['files'] if repre.get('stagingDir'): stagingdir = repre['stagingDir'] if repre.get("outputName"): template_data["output"] = repre['outputName'] template = os.path.normpath( anatomy.templates[template_name]["path"]) sequence_repre = isinstance(files, list) repre_context = None if sequence_repre: self.log.debug("files: {}".format(files)) src_collections, remainder = clique.assemble(files) self.log.debug("src_tail_collections: {}".format( str(src_collections))) src_collection = src_collections[0] # Assert that each member has identical suffix src_head = src_collection.format("{head}") src_tail = src_collection.format("{tail}") # fix dst_padding valid_files = [x for x 
in files if src_collection.match(x)] padd_len = len(valid_files[0].replace(src_head, "").replace( src_tail, "")) src_padding_exp = "%0{}d".format(padd_len) test_dest_files = list() for i in [1, 2]: template_data["representation"] = repre['ext'] template_data["frame"] = src_padding_exp % i anatomy_filled = anatomy.format(template_data) template_filled = anatomy_filled[template_name]["path"] if repre_context is None: repre_context = template_filled.used_values test_dest_files.append(os.path.normpath(template_filled)) template_data["frame"] = repre_context["frame"] self.log.debug("test_dest_files: {}".format( str(test_dest_files))) dst_collections, remainder = clique.assemble(test_dest_files) dst_collection = dst_collections[0] dst_head = dst_collection.format("{head}") dst_tail = dst_collection.format("{tail}") index_frame_start = None if repre.get("frameStart"): frame_start_padding = int(anatomy.templates["render"].get( "frame_padding", anatomy.templates["render"].get("padding"))) index_frame_start = int(repre.get("frameStart")) # exception for slate workflow if index_frame_start and "slate" in instance.data["families"]: index_frame_start -= 1 dst_padding_exp = src_padding_exp dst_start_frame = None for i in src_collection.indexes: # TODO 1.) do not count padding in each index iteration # 2.) do not count dst_padding from src_padding before # index_frame_start check src_padding = src_padding_exp % i src_file_name = "{0}{1}{2}".format(src_head, src_padding, src_tail) dst_padding = src_padding_exp % i if index_frame_start: dst_padding_exp = "%0{}d".format(frame_start_padding) dst_padding = dst_padding_exp % index_frame_start index_frame_start += 1 dst = "{0}{1}{2}".format(dst_head, dst_padding, dst_tail).replace("..", ".") self.log.debug("destination: `{}`".format(dst)) src = os.path.join(stagingdir, src_file_name) self.log.debug("source: {}".format(src)) instance.data["transfers"].append([src, dst]) published_files.append(dst) # for adding first frame into db if not dst_start_frame: dst_start_frame = dst_padding # Store used frame value to template data template_data["frame"] = dst_start_frame dst = "{0}{1}{2}".format(dst_head, dst_start_frame, dst_tail).replace("..", ".") repre['published_path'] = dst else: # Single file # _______ # | |\ # | | # | | # | | # |_______| # template_data.pop("frame", None) fname = files assert not os.path.isabs(fname), ( "Given file name is a full path") template_data["representation"] = repre['ext'] src = os.path.join(stagingdir, fname) anatomy_filled = anatomy.format(template_data) template_filled = anatomy_filled[template_name]["path"] repre_context = template_filled.used_values dst = os.path.normpath(template_filled).replace("..", ".") instance.data["transfers"].append([src, dst]) published_files.append(dst) repre['published_path'] = dst self.log.debug("__ dst: {}".format(dst)) repre["publishedFiles"] = published_files for key in self.db_representation_context_keys: value = template_data.get(key) if not value: continue repre_context[key] = template_data[key] # Use previous representation's id if there are any repre_id = None repre_name_low = repre["name"].lower() for _repre in existing_repres: # NOTE should we check lowered names? 
if repre_name_low == _repre["name"]: repre_id = _repre["orig_id"] break # Create new id if existing representations does not match if repre_id is None: repre_id = io.ObjectId() representation = { "_id": repre_id, "schema": "pype:representation-2.0", "type": "representation", "parent": version_id, "name": repre['name'], "data": { 'path': dst, 'template': template }, "dependencies": instance.data.get("dependencies", "").split(), # Imprint shortcut to context # for performance reasons. "context": repre_context } if repre.get("outputName"): representation["context"]["output"] = repre['outputName'] if sequence_repre and repre.get("frameStart"): representation['context']['frame'] = ( dst_padding_exp % int(repre.get("frameStart"))) self.log.debug("__ representation: {}".format(representation)) destination_list.append(dst) self.log.debug("__ destination_list: {}".format(destination_list)) instance.data['destination_list'] = destination_list representations.append(representation) published_representations[repre_id] = { "representation": representation, "anatomy_data": template_data, "published_files": published_files } self.log.debug("__ representations: {}".format(representations)) # Remove old representations if there are any (before insertion of new) if existing_repres: repre_ids_to_remove = [] for repre in existing_repres: repre_ids_to_remove.append(repre["_id"]) io.delete_many({"_id": {"$in": repre_ids_to_remove}}) self.log.debug("__ representations: {}".format(representations)) for rep in instance.data["representations"]: self.log.debug("__ represNAME: {}".format(rep['name'])) self.log.debug("__ represPATH: {}".format(rep['published_path'])) io.insert_many(representations) instance.data["published_representations"] = ( published_representations) # self.log.debug("Representation: {}".format(representations)) self.log.info("Registered {} items".format(len(representations)))
def my_job(): # menuId = ['http://www.ccdi.gov.cn/ldhd/gcsy/', 'http://www.ccdi.gov.cn/ldhd/wbld/', # 'http://www.ccdi.gov.cn/xxgk/hyzl/', 'http://www.ccdi.gov.cn/yaowen/', 'http://www.ccdi.gov.cn/pl/', # 'http://www.ccdi.gov.cn/gzdt/jdjc/', 'http://www.ccdi.gov.cn/gzdt/dfzf/', 'http://www.ccdi.gov.cn/xsxc/', # 'http://www.ccdi.gov.cn/gzdt/zzjs/', 'http://www.ccdi.gov.cn/gzdt/gjhz/', # 'http://www.ccdi.gov.cn/gzdt/jcfc/', 'http://www.ccdi.gov.cn/lswh/wenhua/', # 'http://www.ccdi.gov.cn/lswh/lilun/', 'http://www.ccdi.gov.cn/scdc/zggb/zjsc/', # 'http://www.ccdi.gov.cn/scdc/zggb/djcf/', 'http://www.ccdi.gov.cn/scdc/zyyj/zjsc/', # 'http://www.ccdi.gov.cn/scdc/zyyj/djcf/', 'http://www.ccdi.gov.cn/scdc/sggb/zjsc/', # 'http://www.ccdi.gov.cn/scdc/sggb/djcf/'] menuId = [1, 2, 3, 4] for chl in menuId: paramssss = dict(paramsss) paramssss['page'] = chl canshushijian = time.strftime("%Y.%m.%d", time.localtime()) paramssss[ 'was_custom_expr'] = "'((的)) AND (DocRelTime = (\' " + canshushijian + "'\'))'" # location = os.getcwd() + '/fake_useragent.json' # ua = fake_useragent.UserAgent(path=location) # print(ua.random) # headers['User-Agent'] = ua.random print('111') try: # pro = ips() # ss.proxies = pro response = ss.post( 'https://www.ccdi.gov.cn/was5/web/search', timeout=10, headers=headersss, params=paramssss, ) print(response.status_code) content = response.content.decode('utf-8') a = re.compile("<a href='(.*?)' target=\"_blank\">").findall( str(content)) sj = """5,6,7,8,9""" sjs = set(sj.split(',')) print("444") for sjss in sjs: sjs = sjss time.sleep(int(sjs)) aaaa = 0 for ac in a: str2 = filter(str.isdigit, ac) # str2为filter对象 str3 = list(str2) num = "".join(str3) # location = os.getcwd() + '/fake_useragent.json' # ua = fake_useragent.UserAgent(path=location) headers[ 'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.84' url = ac try: res = ss.get(url, headers=headerss, timeout=10, cookies=cookiess, verify=False) acontent = res.content.decode('utf-8') articleContent = re.compile( '<div class="TRS_Editor">([\s\S]*?)</div>').findall( str(acontent)) if articleContent == []: for neirong in range(10): # location = os.getcwd() + '/fake_useragent.json' # ua = fake_useragent.UserAgent(path=location) # headers['User-Agent'] = ua.random res = ss.get(url, headers=headers, timeout=10, cookies=cookies, verify=False) acontent = res.content.decode('utf-8') articleContent = re.compile( '<div class="TRS_Editor">([\s\S]*?)</div>' ).findall(str(acontent)) articleContents = re.compile( '<div class="content">([\s\S]*?)</div>' ).findall(str(acontent)) if articleContents or articleContent: if articleContents: articleContent = articleContents break else: break else: print() title = re.compile( '<h2 class="tit">([\s\S]*?)</h2>').findall( str(acontent)) print(url) pubTime = re.compile('发布时间:([\s\S]*?)</em>').findall( str(acontent)) pubTimes = datetime.datetime.strptime( pubTime[0], "%Y-%m-%d %H:%M") imglist = re.compile( '<img [\s\S]*?." 
src="([\s\S]*?.)" />').findall( str(articleContent[0])) strs = articleContent[0] for im in imglist: ims = im[4:10] # chls = chl+ims+'/' # strs = strs.replace('\\', '') # 补全图片地址 # ims = im.replace('\\','') c = urllib.parse.urljoin(url, im) # 替换成已经补全的图片地址 # content = re.sub(i, c, contentText) strs = strs.replace(im, c) if articleContent: site = "中央纪律检查委员会" site_id = 1049418 push_state = 0 downloadTime = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') datass = [] datass.append( InsertOne({ "url": url, "title": title[0], "aid": num, "content": strs, "site": site, "pub_time": pubTimes, "push_state": push_state, "site_id": site_id, "download_Time": downloadTime })) insertdb(datass) else: pass except Exception as err: traceback.print_exc() pass except Exception as err: traceback.print_exc() pass
                {'$set': {
                    'poll': row['Poll']
                }}))

        person = people.find_one({
            'addressId': address['_id'],
            'name': row['Name']
        })
        rowVoted = row['Voted'] == 'Y'
        if person is None:
            peopleWrites.append(
                InsertOne({
                    'name': row['Name'],
                    'addressId': address['_id'],
                    'voted': rowVoted,
                    'created': datetime.datetime.utcnow(),
                    'updated': datetime.datetime.utcnow()
                }))
        else:
            voted = person.get('Voted')
            if voted is None or voted != rowVoted:
                peopleWrites.append(
                    UpdateOne({'_id': person['_id']}, {
                        '$set': {
                            'voted': rowVoted,
                            'updated': datetime.datetime.utcnow()
                        }
                    }))

    if len(addressWrites) > 0:
# Replace XXXX with your connection URI from the Atlas UI
client = MongoClient("mongodb+srv://analytics:[email protected]/test?retryWrites=true&w=majority")

people_raw = client.cleansing['people-raw']

batch_size = 1000
inserts = []
count = 0

# Instead of updating one document at a time, we will add the current update
# to a batch of updates, and when the current batch size reaches the batch
# size limit, send the batch updates to the server at once.
with open("./people-raw.json") as dataset:
    for line in dataset:
        inserts.append(InsertOne(loads(line)))
        count += 1

        if count == batch_size:
            people_raw.bulk_write(inserts)
            inserts = []
            count = 0

if inserts:
    people_raw.bulk_write(inserts)
    count = 0

# Confirm that 50,474 documents are in your collection before moving on
people_raw.count()  # note: on PyMongo 4+ use people_raw.count_documents({})
        continue

    for item in fed["banned"].items():
        user_id = item[0]
        ban = item[1]

        new = {
            "fed_id": fed["fed_id"],
            "user_id": user_id,
            "by": ban["by"],
            "time": ban["time"],
        }

        if "reason" in ban:
            new["reason"] = ban["reason"]
        if "banned_chats" in ban:
            new["banned_chats"] = ban["banned_chats"]

        queue.append(InsertOne(new))

    mongodb.fed_bans.bulk_write(queue)
    mongodb.feds.update_one({"fed_id": fed["fed_id"]},
                            {"$unset": {
                                "banned": 1
                            }})
    changed_feds += 1

log.info("Update done!")
log.info("Modified feds - " + str(changed_feds))
log.info("Unchanged feds - " + str(all_feds_count - changed_feds))
def handle(self, *args, **options): # Establish MongoDB connection client = settings.MONGO_CLIENT db = client[options['database']] exam_collection = db.get_collection(options['exam_collection']) tag_collection = db.get_collection(options['tag_collection']) # Test whether the uniqueness constraint is defined, create it if not (this will only happen when collection # first created) if not exam_collection.index_information().get('exam_uniqueness_constraint', None): exam_collection.create_index([ ('exam_id', DESCENDING), ('revision', DESCENDING), ], unique=True, name="exam_uniqueness_constraint") scanners = options['scanners'] years = options['years'] months = options['months'] days = options['days'] parsed_data_path = Path(options['data']) if not scanners: scanner_paths = [scanner_path for scanner_path in parsed_data_path.iterdir() if scanner_path.is_dir()] else: scanner_paths = [scanner_path for scanner_path in parsed_data_path.iterdir() if (scanner_path.is_dir() and (scanner_path.name in scanners))] for scanner_path in sorted(scanner_paths): if not years: year_paths = [year_path for year_path in scanner_path.iterdir() if year_path.is_dir()] else: year_paths = [year_path for year_path in scanner_path.iterdir() if (year_path.is_dir() and (year_path.name in years))] for year_path in sorted(year_paths): if not months: month_paths = [month_path for month_path in year_path.iterdir() if month_path.is_dir()] else: month_paths = [month_path for month_path in year_path.iterdir() if (month_path.is_dir() and (month_path.name in months))] for month_path in sorted(month_paths): if not days: day_paths = [day_path for day_path in month_path.iterdir() if day_path.is_dir()] else: day_paths = [day_path for day_path in month_path.iterdir() if (day_path.is_dir() and (day_path.name in days))] for day_path in sorted(day_paths): for exam_id in sorted([e for e in day_path.iterdir() if e.is_dir()]): for pt_dir in sorted([p for p in exam_id.iterdir() if p.is_dir()]): for session_dir in sorted([s for s in pt_dir.iterdir() if s.is_dir()]): tags_to_create = [] study_metadata_files = list(session_dir.glob("study_*_metadata.txt")) if not study_metadata_files: self.stdout.write("Error: No study metadata found in {}".format(session_dir)) continue if len(study_metadata_files) > 1: self.stdout.write("Error: Multiple study metadata " "files for {} found".format(session_dir)) continue study_meta_file = study_metadata_files[0] if not study_meta_file.is_file(): self.stdout.write("Error: Cannot load file {}".format(study_meta_file)) continue self.stdout.write("Loading data from {}".format(str(study_meta_file))) try: with open(str(study_meta_file), "rt") as sm: study_metadata = json.load(sm) except ValueError: self.stdout.write("Error: Cannot load file {}".format(study_meta_file)) continue metadata = study_metadata['metadata'] data = study_metadata['data'] dicom_data = None for subdir in data: if subdir.get('dicom_data', None): dicom_data = subdir['dicom_data'] break if not dicom_data: self.stdout.write("Error: No DICOM metadata " "for exam {}".format(study_meta_file)) continue try: exam_id = metadata['exam_id'] revision = 1 parser_version = metadata['parser_version'] filepath = metadata['gold_fpath'] checksum = metadata['gold_archive_checksum'] except KeyError: self.stdout.write("Error: Required metadata field not " "available for exam {}".format(study_meta_file)) continue try: station_name = get_fmrif_scanner(dicom_data["00081010"]["Value"][0]) except (KeyError, IndexError): station_name = None if not station_name: station_name = 
filepath.split("/")[0] try: study_instance_uid = dicom_data["0020000D"]['Value'][0] except (KeyError, IndexError): study_instance_uid = None try: study_id = dicom_data["00200010"]['Value'][0] except (KeyError, IndexError): study_id = None try: study_date = dicom_data["00080020"]['Value'][0] study_date = datetime.strptime(study_date, '%Y%m%d').date() except (KeyError, IndexError): study_date = None if not study_date: year, month, day = filepath.split("/")[1:4] study_date = "{}{}{}".format(year, month, day) study_date = datetime.strptime(study_date, '%Y%m%d').date() try: study_time = dicom_data["00080030"]['Value'][0] if "." in study_time: study_time = datetime.strptime(study_time, '%H%M%S.%f').time() else: study_time = datetime.strptime(study_time, '%H%M%S').time() except (KeyError, IndexError): study_time = None if study_time: study_datetime = datetime.combine(study_date, study_time) else: study_datetime = datetime.combine(study_date, datetime_time.min) try: study_description = dicom_data["00081030"]['Value'][0] except (KeyError, IndexError): study_description = None protocol = None # Not implemented yet try: accession_number = dicom_data["00080050"]['Value'][0] except (KeyError, IndexError): accession_number = None try: name = dicom_data["00100010"]['Value'][0]['Alphabetic'] except (KeyError, IndexError): name = None if name: name_fields = parse_pn(name) last_name = name_fields['family_name'] first_name = name_fields['given_name'] else: first_name, last_name = None, None try: patient_id = dicom_data["00100020"]['Value'][0] except (KeyError, IndexError): patient_id = None try: sex = dicom_data["00100040"]['Value'][0] except (KeyError, IndexError): sex = None try: birth_date = dicom_data["00100030"]['Value'][0] birth_date = datetime.strptime(birth_date, '%Y%m%d') except (KeyError, IndexError): birth_date = None new_exam = { 'exam_id': exam_id, 'revision': revision, 'parser_version': parser_version, 'filepath': filepath, 'checksum': checksum, 'station_name': station_name, 'study_instance_uid': study_instance_uid, 'study_id': study_id, 'study_datetime': study_datetime, 'study_description': study_description, 'protocol': protocol, 'accession_number': accession_number, 'name': name, 'last_name': last_name, 'first_name': first_name, 'patient_id': patient_id, 'sex': sex, 'birth_date': birth_date, } new_exam_id = exam_collection.insert_one(new_exam).inserted_id study_data = study_metadata['data'] mr_scans = [] for subdir in study_data: if subdir.get('dicom_data', None): mr_scans.append(subdir) self.stdout.write("Found {} mr scans".format(len(mr_scans))) for scan in mr_scans: try: scan_dicom_data = scan['dicom_data'] scan_name = scan['metadata']['gold_scan_dir'] except KeyError: self.stdout.write("Error: Missing mandatory scan metadata, " "omitting scan from exam {}".format(study_meta_file)) continue for tag, attr in scan_dicom_data.items(): vr = attr.get('vr', None) if not vr: self.stdout.write( "WARNING: No VR found for tag {} in scan {} " "of study {}. Skipping.".format(tag, scan_name, study_meta_file)) continue if vr in ['OB', 'OD', 'OF', 'OL', 'OV', 'OW', 'SQ', 'UN']: self.stdout.write( "WARNING: Tag encoding of type B64 or JSON not supported " "for querying purposes - Tag {} in scan {} " "of study {}. Skipping.".format(tag, scan_name, study_meta_file)) continue try: new_tag = self.parse_attribute(new_exam_id, tag, scan_name, attr) tags_to_create.append(InsertOne(new_tag)) except AttributeError: self.stdout.write( "Attribute value exceeds indexable size. Skipping. 
Tag {} in " "scan of study {}".format(tag, scan_name, study_meta_file) ) try: res = tag_collection.bulk_write(tags_to_create) self.stdout.write("Inserted {} tags to collection".format(res.inserted_count)) except PyMongoError as e: self.stdout.write("Error: Unable to insert scan documents " "for day ".format(day_path)) self.stdout.write(e) self.stdout.write(traceback.format_exc())
def my_job(ac):
    try:
        while (ac <= 10):
            proxy = {}
            paramss = dict(params)
            if ac == 0:
                paramss['start'] = ""
            else:
                paramss['start'] = ac * 50
            ac = ac + 1
            agentUrl = "http://47.96.91.228:82/get/"
            res = requests.get(agentUrl)
            agenContent = res.content.decode("utf-8")
            dataip = re.compile('"proxy": "(.*?)",').findall(str(agenContent))
            ip = dataip[0]
            proxy = {
                'http://:': ip,
            }
            ss.proxies = proxy
            response = ss.get('https://www.douban.com/group/',
                              headers=headers,
                              params=paramss,
                              cookies=cookies)
            htmlContent = response.content.decode("utf-8")
            urlList = re.compile(
                '<a href="https://www.douban.com/group/topic/(.*?)/"').findall(
                    str(htmlContent))
            for i in urlList:
                url = "https://www.douban.com/group/topic/" + i
                time.sleep(10)
                urlResponse = ss.get(url, headers=headers, cookies=cookies)
                bs = BeautifulSoup(urlResponse.content,
                                   'html.parser',
                                   from_encoding='utf-8')
                htmlContents = urlResponse.content.decode("utf-8")
                title = re.compile(
                    '<h1>(\s|[\r\n])*(.*?)(\s|[\r\n])*</h1>').findall(
                        str(htmlContents))
                for ti in title:
                    titles = ti[1]
                pubTimes = re.compile('"dateCreated": "(.*?)",').findall(
                    str(htmlContents))
                pubTimes = pubTimes[0]
                pubTimes = pubTimes.replace("T", " ")
                aa = bs.select('div.rich-content.topic-richtext')
                downloadTime = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                site = "豆瓣"
                siteId = 1044573
                data = []
                articleStatue = 0
                articleContent = aa[0]
                articleContent = str(articleContent)
                data.append(
                    InsertOne({
                        "url": url,
                        "title": titles,
                        "pub_time": pubTimes,
                        "content": articleContent,
                        "download_time": downloadTime,
                        "site": site,
                        "site_id": siteId,
                        "aid": i,
                        'push_state': articleStatue
                    }))
                insertdb(data)
    except Exception as err:
        traceback.print_exc()
        pass
def reducer(self, key, values):
    """
    Cleans the billing data:
        - checks gaps
        - checks overlappings
        - generates daily dataframe
        - checks outliers
        - saves results to RAW DATA and HBASE
    :param key: the device
    :param values: the information
    :return:
    """
    # create dataframe with the values:
    df = pd.DataFrame.from_records(
        values, columns=["ts_ini", "ts_end", "value", "energytype", "source"])
    # group it by source and energyType
    source_group = df.groupby('source')
    for source, df_source_group in source_group:
        etype_group = df_source_group.groupby('energytype')
        for etype, df_etype_group in etype_group:
            df_etype_group = df_etype_group.dropna(subset=["ts_ini"])
            df_etype_group = df_etype_group.set_index('ts_ini')
            df_etype_group = df_etype_group.sort_index()
            df_etype_group['ts_ini'] = df_etype_group.index

            # save billing information in raw_data
            raw_data = df_etype_group[["ts_ini", "ts_end", "value"]].to_dict('records')
            for r in raw_data:
                r.update({"device": key, "source": source, "energy_type": etype,
                          "data_type": "billing", "freq": "D"})
            ops = [InsertOne(x) for x in raw_data]
            result = self.mongo['raw_data'].bulk_write(
                [
                    DeleteMany({"device": key, "source": source, "energy_type": etype,
                                "data_type": "billing", "freq": "D"}),
                ] + ops
            )
            # self.mongo['raw_data'].update({"device": key, "source": source, "energy_type": etype, "data_type": "billing"}, {'$set': {
            #     "device": key, "source": source, "energy_type": etype, "companyId": self.companyId,
            #     "raw_data": df_etype_group[["ts_ini", "ts_end", "value"]].to_dict('records')
            #     }
            # }, upsert=True)

            # generate daily dataframe dividing by days:
            dfs = []
            for row in df_etype_group.iterrows():
                index = pd.date_range(row[1]['ts_ini'], row[1]['ts_end'])
                if index.empty:
                    continue
                df_temp = pd.DataFrame(
                    data={"index": index,
                          "value": [(float(row[1]['value']) / float(len(index)))] * len(index)},
                    index=index)
                dfs.append(df_temp)

            # join daily df and detect overlapings and gaps
            if not dfs:
                continue
            global_df = dfs[0]
            overlappings = []
            for df_temp in dfs[1:]:
                overlappings.extend(global_df.index.intersection(df_temp.index).tolist())
                global_df = global_df.append(df_temp)
            global_df.drop_duplicates(keep='last', inplace=True)

            gaps = []
            gap_last_index = global_df[global_df.index.to_series().diff() >
                                       pd.Timedelta('1 days')].index.tolist()
            for gf in gap_last_index:
                index = list(global_df.index).index(gf)
                gi = list(global_df.index)[index - 1]
                gaps.append([gi, gf])

            # max_threshold = self.config['max_threshold'][etype] * 24 if 'etype' in self.config['max_threshold'] else self.config['max_threshold']['default'] * 24
            # max_outliers_bool = dc.detect_max_threshold_outliers(global_df['value'], max_threshold)
            # global_df['value'] = dc.clean_series(global_df['value'], max_outliers_bool)
            negative_values_bool = dc.detect_min_threshold_outliers(global_df['value'], 0)
            global_df['value'] = dc.clean_series(global_df['value'], negative_values_bool)
            # znorm_bool = dc.detect_znorm_outliers(global_df['value'], 30, mode="global")
            # global_df['value'] = dc.clean_series(global_df['value'], znorm_bool)
            # max_outliers = list(global_df[max_outliers_bool].index)
            negative_outliers = list(global_df[negative_values_bool].index)
            # znorm_outliers = list(global_df[znorm_bool].index)

            clean_data = global_df.to_dict('records')
            for r in clean_data:
                r.update({"device": key, "source": source, "energy_type": etype,
                          "data_type": "billing", "freq": "D"})
            ops = [InsertOne(x) for x in clean_data]
            result = self.mongo['clean_data'].bulk_write(
                [
                    DeleteMany({"device": key, "source": source, "energy_type": etype,
                                "data_type": "billing", "freq": "D"}),
                ] + ops
            )
            self.mongo['data_quality'].update(
                {"device": key, "source": source, "energy_type": etype,
                 "data_type": "billing", "freq": "D"},
                {"$set": {
                    "overlapings": overlappings,
                    "gaps": gaps,
                    "negative_values": negative_outliers
                }},
                upsert=True)

            for row in global_df.iterrows():
                yield None, "\t".join([str(row[1]['index'].timestamp()), key,
                                       str(row[1]['value']), etype, source])
def insert_one(self, document):
    self._batch.append(InsertOne(to_mongo(document)))
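# A hedged companion sketch (not part of the original class): flushing the
# buffered operations with a single bulk_write call. The `_collection`
# attribute name is an assumption for illustration.
def flush(self):
    if self._batch:
        self._collection.bulk_write(self._batch, ordered=True)
        self._batch = []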
            }
        },
        {
            "$sort": SON([("total", -1)])
        }]

    cursor = input_collection.aggregate(pipeline)
    for record in cursor:
        sensor = {}
        sensor['measurement_id'] = record['_id']['measurement_id']
        sensor['loc'] = record['_id']['loc']
        sensor['description'] = record['_id']['description']
        sensor['source'] = record['_id']['source']
        sensor['gmt_s_date'] = gmt_s_date
        sensor['gmt_e_date'] = gmt_e_date
        sensors.append(InsertOne(sensor))

    if len(sensors) > 0:
        # delete old sensor records in input collection
        query = {}
        cursor_count = output_collection.find(query).count()
        if cursor_count > 0:
            logging.info(
                "Found {} records in collection {} from {}".format(
                    cursor_count, output_collection.name, sensorSource))
            output_collection.remove(query)
            logging.info(
                "Deleted {} sensors from collection {}".format(
                    sensorSource, output_collection.name))

        # store the sensors into the output collection
def sync_labels(self, labels):
    # Create labels
    bulk = []
    l_coll = self.mongo_db["labels"]
    current_labels = {ll["name"]: ll["_id"] for ll in l_coll.find()}
    for label in labels:
        if label in current_labels:
            bulk += [
                UpdateOne(
                    {"_id": current_labels[label]},
                    {"$set": {setting: True for setting in labels[label]}},
                )
            ]
        else:
            doc = {
                # "_id": bson.ObjectId(),
                "name": label,
                "description": "",
                "bg_color1": 8359053,
                "fg_color1": 16777215,
                "bg_color2": 8359053,
                "fg_color2": 16777215,
                "is_protected": False,
                # Label scope
                "enable_agent": False,
                "enable_service": False,
                "enable_serviceprofile": False,
                "enable_managedobject": False,
                "enable_managedobjectprofile": False,
                "enable_administrativedomain": False,
                "enable_authprofile": False,
                "enable_commandsnippet": False,
                # "enable_allocationgroup": False,
                "enable_networksegment": False,
                "enable_object": False,
                "enable_objectmodel": False,
                "enable_platform": False,
                "enable_resourcegroup": False,
                "enable_sensorprofile": False,
                # CRM
                "enable_subscriber": False,
                "enable_subscriberprofile": False,
                "enable_supplier": False,
                "enable_supplierprofile": False,
                # DNS
                "enable_dnszone": False,
                "enable_dnszonerecord": False,
                # IPAM
                "enable_ipaddress": False,
                "enable_addressprofile": False,
                "enable_ipaddressrange": False,
                "enable_ipprefix": False,
                "enable_prefixprofile": False,
                "enable_vrf": False,
                "enable_vrfgroup": False,
                # VC
                "enable_vc": False,
                "enable_vlan": False,
                "enable_vlanprofile": False,
                "enable_vpn": False,
                "enable_vpnprofile": False,
                # Exposition scope
                "expose_metric": False,
                "expose_datastream": False,
            }
            for setting in labels[label]:
                doc[setting] = True
            bulk += [InsertOne(doc)]
    if bulk:
        l_coll.bulk_write(bulk, ordered=True)
def flush(self):
    """Insert all buffered records into the Mongo collection.

    Note:
        Log records are inserted in chronological order into the database.
        The first insert failure that occurs aborts the remaining insert
        operations. All log records inserted successfully will be removed
        from the buffer.
    """
    self.acquire()
    try:
        if not self.buffer:
            return
        bulk_result = self.client[self.database][self.collection].bulk_write(
            [
                InsertOne({
                    'datetime': datetime.utcfromtimestamp(record.created),
                    'processName': record.processName,
                    'processId': record.process,
                    'threadName': record.threadName,
                    'threadId': record.thread,
                    'pathname': record.pathname,
                    'filename': record.filename,
                    'module': record.module,
                    'funcName': record.funcName,
                    'lineno': record.lineno,
                    'msg': record.msg,
                    'levelname': record.levelname,
                    'levelno': record.levelno,
                    # 'funcargs': record.args,
                }) for record in self.buffer
            ],
            ordered=True)
        if not bulk_result.acknowledged:
            # Handle error here.
            pass
        self.buffer[:bulk_result.inserted_count] = []
    except pymongo.errors.BulkWriteError as bwe:
        # Handle the exception here
        # Error details can be found in bwe._OperationFailure__details
        self.buffer[:bwe.details.get('nInserted', 0)] = []
    except pymongo.errors.ConnectionFailure as cf:
        # Handle the exception here
        pass
    finally:
        self.release()
import bson.json_util
from pymongo import InsertOne, MongoClient

from notes.config import CONN_URI

BATCH_SIZE = 1000  # Batch size for batch insertion

cli = MongoClient(CONN_URI)
people_raw = cli.cleansing['people-raw']

batch_insertions = []
with open('people-raw.json') as f:
    for line in f:
        line_dict = bson.json_util.loads(line)
        # Instead of inserting one document at a time, we will add the current
        # insertion to a batch of insertions, and when the current batch size
        # reaches the batch size limit, at once send the batch insertions to
        # the server.
        batch_insertions.append(InsertOne(line_dict))
        if len(batch_insertions) == BATCH_SIZE:
            people_raw.bulk_write(batch_insertions)
            print(f'Finished inserting a batch of {BATCH_SIZE} documents')
            batch_insertions = []

# Take care of the last batch of insertions
if batch_insertions:
    people_raw.bulk_write(batch_insertions)
    print(f'Finished inserting a last batch of {len(batch_insertions)} '
          f'documents')
print('Finished all the insertions.')
def process_item(self, item, spider):
    col = self.db.mDoubaninfo
    # Wrap the scraped item in a single InsertOne operation and write it.
    insert_op = InsertOne(dict(item))
    col.bulk_write([insert_op])
    return item
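# Calling bulk_write() with a single operation per item gives no batching
# benefit over insert_one(). A buffered variant is sketched below; the
# `buffer` attribute, the batch size of 500, and the close_spider hook are
# assumptions for illustration, not part of the original pipeline.
def process_item_buffered(self, item, spider):
    self.buffer.append(InsertOne(dict(item)))
    if len(self.buffer) >= 500:
        self.db.mDoubaninfo.bulk_write(self.buffer, ordered=False)
        self.buffer = []
    return item

def close_spider(self, spider):
    # Flush whatever is left in the buffer when the spider finishes.
    if self.buffer:
        self.db.mDoubaninfo.bulk_write(self.buffer, ordered=False)
        self.buffer = []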
async def async_update_matches_by_protocol_no(matchengine: MatchEngine, protocol_no: str):
    """
    Update trial matches by diff'ing the newly created trial matches against
    existing matches in the db. Delete matches by adding {is_disabled: true}
    and insert all new matches.
    """
    matches_by_sample_id = matchengine.matches.get(protocol_no, dict())
    updated_time = datetime.datetime.now()
    for matches in matches_by_sample_id.values():
        for match in matches:
            match['_updated'] = updated_time
    if protocol_no not in matchengine.matches or protocol_no not in matchengine._trials_to_match_on:
        log.info(
            f"{matchengine.match_criteria_transform.trial_collection} {protocol_no} was not matched on, "
            f"not updating {matchengine.match_criteria_transform.trial_collection} matches"
        )
        if not matchengine.skip_run_log_entry:
            matchengine.task_q.put_nowait(RunLogUpdateTask(protocol_no))
        await matchengine.task_q.join()
        return
    log.info(f"Updating matches for {protocol_no}")
    if not matchengine.drop:
        # If no matches are found, disable all match records by sample id
        if not matchengine.matches[protocol_no]:
            for chunk in chunk_list(
                    list(matchengine.clinical_ids_for_protocol_cache[protocol_no]),
                    matchengine.chunk_size):
                matchengine.task_q.put_nowait(
                    UpdateTask([
                        UpdateMany(
                            filter={
                                matchengine.match_criteria_transform.match_trial_link_id: protocol_no,
                                'clinical_id': {'$in': chunk}
                            },
                            update={
                                '$set': {
                                    "is_disabled": True,
                                    '_updated': updated_time
                                }
                            })
                    ], protocol_no))
        else:
            # Get matches to disable and issue queries
            matches_to_disable = await get_all_except(matchengine, protocol_no,
                                                      matches_by_sample_id)
            delete_ops = await get_delete_ops(matches_to_disable, matchengine)
            matchengine.task_q.put_nowait(UpdateTask(delete_ops, protocol_no))

    for sample_id in matches_by_sample_id.keys():
        if not matchengine.drop:
            new_matches_hashes = [
                match['hash'] for match in matches_by_sample_id[sample_id]
            ]

            # get existing matches in db with identical hashes to newly found matches
            existing = await get_existing_matches(matchengine, new_matches_hashes)
            existing_hashes = {result['hash'] for result in existing}
            disabled = {
                result['hash'] for result in existing if result['is_disabled']
            }

            # insert new matches if they don't already exist. disable everything else
            matches_to_insert = get_matches_to_insert(matches_by_sample_id,
                                                      existing_hashes, sample_id)
            matches_to_disable = await get_matches_to_disable(
                matchengine, new_matches_hashes, protocol_no, sample_id)

            # flip is_disabled flag if a new match generated during run matches hash of an existing
            matches_to_mark_available = [
                m for m in matches_by_sample_id[sample_id] if m['hash'] in disabled
            ]
            ops = get_update_operations(matches_to_disable, matches_to_insert,
                                        matches_to_mark_available, matchengine)
        else:
            ops = [
                InsertOne(document=trial_match)
                for trial_match in matches_by_sample_id[sample_id]
            ]
        matchengine.task_q.put_nowait(UpdateTask(ops, protocol_no))

    if not matchengine.skip_run_log_entry:
        matchengine.task_q.put_nowait(RunLogUpdateTask(protocol_no))
    await matchengine.task_q.join()
def Insert_Data(data):
    insert_datas = []
    insert_datas.append(InsertOne(data))
    my_collection.bulk_write(insert_datas)
    insert_datas.clear()
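# For a single document the bulk machinery adds nothing; an equivalent
# one-liner (a sketch assuming the same module-level `my_collection` handle):
def insert_data_simple(data):
    my_collection.insert_one(data)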
def sync_facts(self):
    """
    Retrieve known facts and synchronize with database
    """
    self.logger.debug("Synchronizing facts")
    # Get facts from CLIPS
    self.logger.debug("Extracting facts")
    e_facts = {}  # uuid -> fact
    try:
        f = self.env.InitialFact()
    except clips.ClipsError:
        return  # No facts
    while f:
        if f.Template and f.Template.Name in self.templates:
            self.facts[f.Index] = f
            args = {}
            for k in f.Slots.keys():
                v = f.Slots[k]
                if v == clips.Nil:
                    v = None
                args[str(k)] = v
            fi = self.fcls[f.Template.Name](**args)
            e_facts[self.get_fact_uuid(fi)] = fi
        f = f.Next()
    # Get facts from database
    now = datetime.datetime.now()
    collection = ObjectFact._get_collection()
    bulk = []
    new_facts = set(e_facts)
    for f in collection.find({"object": self.object.id}):
        if f["_id"] in e_facts:
            fact = e_facts[f["_id"]]
            f_attrs = self.get_fact_attrs(fact)
            if f_attrs != f["attrs"]:
                # Changed facts
                self.logger.debug("Fact %s has been changed: %s -> %s",
                                  f["_id"], f["attrs"], f_attrs)
                bulk += [
                    UpdateOne(
                        {"_id": f["_id"]},
                        {
                            "$set": {
                                "attrs": f_attrs,
                                "changed": now,
                                "label": smart_text(fact)
                            }
                        },
                    )
                ]
            new_facts.remove(f["_id"])
        else:
            # Removed fact
            self.logger.debug("Fact %s has been removed", f["_id"])
            bulk += [DeleteOne({"_id": f["_id"]})]
    # New facts
    for f in new_facts:
        fact = e_facts[f]
        f_attrs = self.get_fact_attrs(fact)
        self.logger.debug("Creating fact %s: %s", f, f_attrs)
        bulk += [
            InsertOne({
                "_id": f,
                "object": self.object.id,
                "cls": fact.cls,
                "label": smart_text(fact),
                "attrs": f_attrs,
                "introduced": now,
                "changed": now,
            })
        ]
    if bulk:
        self.logger.debug("Committing changes to database")
        try:
            collection.bulk_write(bulk)
            self.logger.debug("Database has been synced")
        except BulkWriteError as e:
            self.logger.error("Bulk write error: '%s'", e.details)
            self.logger.error("Stopping check")
    else:
        self.logger.debug("Nothing changed")
def my_job():
    for iu in range(126):
        iu = iu + 1
        paramss = dict(params)
        paramss['page'] = iu
        response = requests.get('https://m.dutenews.com/index/ajax/content',
                                headers=headers,
                                params=paramss,
                                cookies=cookies)
        content = response.content.decode('unicode-escape')
        contents = str(content)
        url = re.compile('getUrl\((.*?),\)').findall(contents)
        url = list(set(url))
        for i in url:
            try:
                urls = 'https://plus.dutenews.com/api/v2/feeds/' + i + '/comments'
                # urls = 'https://plus.dutenews.com/api/v2/feeds/17639/comments'
                articleUrl = 'https://page.dutenews.com/H5/sns/#/imgText?id=' + i
                response = requests.get(urls)
                content = response.content.decode('utf-8')
                title = re.compile('"feed_content":"(.*?)",').findall(content)
                titles = title[0].encode('utf-8').decode('unicode_escape')
                if len(title) == 0:
                    title = re.compile('"title":"(.*?)",').findall(str(content))
                pubTime = re.compile('"created_at":"(.*?)",').findall(str(content))
                fmt = '%Y-%m-%dT%H:%M:%SZ'
                t = datetime.datetime.strptime(pubTime[0], fmt)
                # Convert to UTC+8 (China Standard Time).  # was: 东八区
                t += datetime.timedelta(hours=8)
                getobj = GetValue2(content)
                contenttext = getobj.get_values('images')
                videotext = getobj.get_values('video')
                articlecontent = ''
                if contenttext is None:
                    print()
                else:
                    imgs = ''
                    contenttexts = re.compile("'url': '(.*?)',").findall(str(contenttext))
                    for im in contenttexts:
                        imgs += "<br><img src=\'" + im + "\'></img>"
                    articlecontent += titles + imgs
                if videotext is None:
                    print()
                else:
                    videos = ''
                    aa = GetValue2(videotext['resource'])
                    videourl = aa.get_values('url')
                    videos += "<br><video src='" + videourl + "' controls=" "></video>"
                    articlecontent += titles + videos
                site = "读特-鹏友圈"
                siteId = 1048212
                data = []
                articleStatue = 0
                downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                data.append(
                    InsertOne({
                        "url": articleUrl,
                        "title": titles,
                        "pub_time": t,
                        "content": articlecontent,
                        "download_time": downloadTime,
                        "site": site,
                        "site_id": siteId,
                        "aid": i,
                        'push_state': articleStatue,
                    }))
                insertdb(data)
            except Exception as err:
                import traceback
                traceback.print_exc()
                pass
def importReference(self,filename): self.logg.info('Starting Reference Import') if filename == None: self.logg.info('Filename Not Set | Beginning File Search') fileDir = os.path.dirname(os.path.abspath(__file__)) regex = re.compile('^Geography\_\d{8}\_\B(to)_\d{8}\_\B(from)\_\d{8}(.txt.gz)') for root, dirs, files in os.walk(fileDir): for file in files: if regex.match(file): self.logg.info('Compatible File Found') filename = os.path.join(os.path.abspath(root), file) # Reference File is .tsv self.logg.info('Starting Reading Reference File') my_cols = ["TYPE","A", "B", "C", "D", "E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z"] # Initialise docType Counters totalImportedDocuments = [0,0,0,0,0,0,0,0] bulkCounter = 0 # Open file & read into pd.DataFrame with gzip.open(filename, "rt", encoding="cp1252") as f: df = pd.read_csv(f, sep="\t",names=my_cols) self.logg.info('Importing Reference into MongoDB') operations = [] for row in df.itertuples(index=False): # Reset out_file out_file = None if row[0] == 'PIF': # REFTYPE : Document Specification out_file = { "docType": "PIF", "fileVersion": row[1], "sourceSystem": row[2], "TOCid": row[3], "timetableStartDate": row[4], "timetableEndDate": row[5], "cycleType": row[6], "cycleStage": row[7], "creationDate": row[8], "fileSequenceNumber": row[9], } totalImportedDocuments[0] += 1 elif row[0] == "REF": # REFTYPE : Reference Code out_file = { "docType": "REF", "actionCode": row[1], "codeType": row[2], "description": row[3] } totalImportedDocuments[1] += 1 elif row[0] == "TLD": # REFTYPE : Timing Load out_file = { "docType": "TLD", "actionCode": row[1], "tractionType": row[2], "trailingLoad": row[3], "speed": row[4], "raGauge": row[5], "description": row[6], "ITPSPowerType": row[7], "ITPSLoad": row[8], "limitingSpeed": row[9], } totalImportedDocuments[2] += 1 elif row[0] == "LOC": # REFTYPE : Geographical Data out_file = { "docType": "LOC", "actionCode": row[1], "TIPLOC": row[2], "locationName": row[3], "startDate": row[4], "endDate": row[5], "northing": row[6], "easting": row[7], "timingType": row[8], "zone": row[9], "STANOX": row[10], "offNetwork": row[11], "forceLPB": row[12], } totalImportedDocuments[3] += 1 elif row[0] == "PLT": # REFTYPE : Platform out_file = { "docType": "PLT", "actionCode": row[1], "locationCode": row[2], "platformID": row[3], "startDate": row[4], "endDate": row[5], "length": row[6], "powerSupplyType": row[7], "DDOPassenger": row[8], "DDONonPassenger": row[9], } totalImportedDocuments[4] += 1 elif row[0] == "NWK": # REFTYPE : Network Link out_file = { "docType": "NWK", "actionCode": row[1], "originLocation": row[2], "destinationLocation": row[3], "lineCode": row[4], "lineDescription": row[5], "startDate": row[6], "endDate": row[7], "initialDirection": row[8], "finalDirection": row[9], "DDOPassenger": row[10], "DDONonPassenger": row[11], "RETB": row[12], "zone": row[13], "reversible": row[14], "powerSupplyType": row[15], "RA": row[16], "maxTrainLength": row[17], } totalImportedDocuments[5] += 1 elif row[0] == "TLK": # REFTYPE : Timing Link out_file = { "docType": "NWK", "actionCode": row[1], "originLocation": row[2], "destinationLocation": row[3], "lineCode": row[4], "tractionType": row[5], "trailingLoad": row[6], "speed": row[7], "RA": row[8], "entrySpeed": row[9], "exitSpeed": row[10], "startDate": row[11], "endDate": row[12], "secRunTime": row[13], "description": row[14], } totalImportedDocuments[6] += 1 else: out_file = { "docType": "DEL" } totalImportedDocuments[7] += 1 # Append Copy of 
Dictionary to List operations.append( InsertOne(out_file.copy()) ) if ((sum(totalImportedDocuments) % self.inst['standard-bulk-size']) == 0): try: self.mongodb['reference'].bulk_write(operations) except BulkWriteError as bwe: self.logg.error(bwe.details) else: # Reset Bulk Storage operations = [] # Increment Counter bulkCounter +=1 self.logg.info('REFERENCE Progress | {:.0%} | {} Inserts'.format(sum(totalImportedDocuments)/1.2e6, bulkCounter)) self.logg.info('Completed Reference Import Successfully')
def _refresh_object(cls, managed_object):
    from noc.sa.models.managedobject import ManagedObject

    def to_dict(v):
        return dict((r["profile"], r["summary"]) for r in v)

    def to_list(v):
        return [{"profile": k, "summary": v[k]} for k in sorted(v)]

    if hasattr(managed_object, "id"):
        managed_object = managed_object.id
    coll = ServiceSummary._get_collection()
    bulk = []
    # Get existing summary
    old_summary = dict(
        (x["interface"], x) for x in coll.find(
            {"managed_object": managed_object},
            {
                "_id": 1,
                "interface": 1,
                "service": 1,
                "subscriber": 1
            },
            comment="[servicesummary._refresh_object] Refresh summary of services for managed object"
        ))
    # Get actual summary
    new_summary = ServiceSummary.build_summary_for_object(managed_object)
    # Merge summaries
    for iface in old_summary:
        if iface not in new_summary:
            # Stale, delete
            bulk += [DeleteOne({"_id": old_summary[iface]["_id"]})]
            continue
        oi = old_summary[iface]
        old_services = to_dict(oi["service"])
        old_subs = to_dict(oi["subscriber"])
        ni = new_summary[iface]
        if old_services != ni["service"] or old_subs != ni["subscriber"]:
            # Changed, update
            bulk += [
                UpdateOne({"_id": oi["_id"]}, {
                    "$set": {
                        "service": to_list(ni["service"]),
                        "subscriber": to_list(ni["subscriber"])
                    }
                })
            ]
        # Mark as processed
        del new_summary[iface]
    # Process new items
    bulk += [
        InsertOne({
            "managed_object": managed_object,
            "interface": iface,
            "service": to_list(new_summary[iface]["service"]),
            "subscriber": to_list(new_summary[iface]["subscriber"])
        }) for iface in new_summary
    ]
    if bulk:
        logger.info("Committing changes to database")
        try:
            r = coll.bulk_write(bulk, ordered=False)
            logger.info("Database has been synced")
            logger.info("Modify: %d, Deleted: %d", r.modified_count, r.deleted_count)
        except BulkWriteError as e:
            logger.error("Bulk write error: '%s'", e.details)
            logger.error("Stopping check")
    mo = ManagedObject.get_by_id(managed_object)
    mo.segment.update_summary()
def post(self):
    try:
        args = self.parser.parse(self.params_check, request)
        request_body = request.get_json()
        snapshot_name = args['snapshot_name']
        operation = args['operation']

        if operation == "import":
            filter_dict = {"snapshot_name": snapshot_name}
            if check_duplicated(current_app.mongo.db.snapshot, **filter_dict):
                return JsonRes(
                    info={},
                    usr_err_mes="Duplicated Snapshot name, please change another name!",
                    status=False,
                    data=[],
                    code=400)

            # create snapshot instance
            snapshot_instance = {'snapshot_name': snapshot_name}
            # snapshot_res = current_app.mongo.db.snapshot.insert_one(snapshot_instance)
            current_app.mongo.db.snapshot.insert_one(snapshot_instance)

            # create topo instances
            topo_instance = {'snapshot_name': snapshot_name}
            if 'nodes' in request_body.keys():
                topo_instance['nodes'] = request_body['nodes']
                # topo_instance.update({
                #     'nodes': request_body['nodes']
                # })
            if 'links' in request_body.keys():
                topo_instance['links'] = request_body['links']
                # topo_instance.update({
                #     'links': request_body['links']
                # })
            # topo_res = current_app.mongo.db.topo.insert_one(topo_instance)
            current_app.mongo.db.topo.insert_one(topo_instance)

            # create policy instances
            if 'sr_policy' in request_body.keys():
                sr_policies = request_body['sr_policy']
                req_list = []
                for policy in sr_policies:
                    policy_instance = {'snapshot_name': snapshot_name}
                    policy_instance.update(policy)
                    req_list.append(InsertOne(policy_instance))
                # policies_res = current_app.mongo.db.policy.bulk_write(req_list)
                current_app.mongo.db.policy.bulk_write(req_list)

            # create global_params instances
            if 'global_params' in request_body.keys():
                params_instance = {'snapshot_name': snapshot_name}
                params_instance.update(request_body['global_params'])
                # params_res = current_app.mongo.db.params.insert_one(params_instance)
                current_app.mongo.db.params.insert_one(params_instance)

            return JsonRes(data={}, info={}, code=201)

        if operation == "export":
            if not self.snapshot_validate(snapshot_name):
                return JsonRes(
                    info={},
                    usr_err_mes="No Snapshot named %s is found! Please import snapshot first!" % snapshot_name,
                    err_code=0,
                    status=False,
                    data=[])

            topo_res = current_app.mongo.db.topo.find_one(
                {'snapshot_name': snapshot_name})
            del topo_res['snapshot_name']

            policy_res = current_app.mongo.db.policy.find(
                {'snapshot_name': snapshot_name})
            policy_list = []
            for policy in policy_res:
                del policy['snapshot_name']
                del policy['_id']
                policy_list.append(policy)

            res = {
                'nodes': topo_res['nodes'],
                'links': topo_res['links'],
                'sr_policy': policy_list
            }

            if 'with_params' in args.keys() and args['with_params']:
                params_res = current_app.mongo.db.params.find_one(
                    {'snapshot_name': snapshot_name})
                del params_res['snapshot_name']
                del params_res['_id']
                res.update({'global_params': params_res})

            return JsonRes(data=res, info={}, code=200)
    except Exception as e:
        LOG.debug(e)
        return JsonRes(info=e,
                       usr_err_mes="MongoDB Update Exception!",
                       status=False,
                       data=[],
                       code=400)
sum = 0
for doc in coll.find():
    sum = sum + int(doc['price'])
print('Total Sales Price ${:,}'.format(sum))

# Bulk writing
from pymongo import InsertOne, DeleteOne, ReplaceOne

db = client.new_dump_db
coll = db.num_coll

# upsert will input the new object even if it does not find the one it is
# replacing
requests = [
    InsertOne({'Binel': 100}),
    DeleteOne({'Binel': 100}),
    ReplaceOne({'Binel': 100}, {'Ben': 1000}, upsert=True)
]
results = coll.bulk_write(requests)
print('Final writes are: ', results.inserted_count)

# Deletions based on criteria
# Need to re-assign the original db though for this to work
db = client.new_db
coll = db.db_collection
mongo_url = inifile.get('con', 'mongo_url')
max_data_num = int(inifile.get('insert', 'bulk_max_count'))
wtime_out_millsec = int(inifile.get('insert', 'w_concern_repl_timeout_milisec'))
write_concern_opt = inifile.get('insert', 'w_concern_opt')

client = MongoClient(mongo_url)
timestamp = datetime.now().strftime("%Y/%m/%d %H:%M:%S%f")
db = client.repltestdb

print "@@@ Inserting bulk data %s with write_concern ..." % (max_data_num)

coll = db.get_collection('testcol01',
                         write_concern=WriteConcern(w=write_concern_opt,
                                                    wtimeout=wtime_out_millsec))
try:
    coll.bulk_write([
        InsertOne({
            "timestamp": timestamp,
            'id': i
        }) for i in range(max_data_num)
    ])
    print "OK."
except BulkWriteError as bwe:
    pprint(bwe.details)

data_count = db.testcol01.find({}).count()
print "count = %s" % data_count
print "Done!"
async def importfbans_func(message, fed, strings, document=None):
    global user_id
    file_type = os.path.splitext(document["file_name"])[1][1:]

    if file_type == "json":
        if document["file_size"] > 1000000:
            await message.reply(strings["big_file_json"].format(num="1"))
            return
    elif file_type == "csv":
        if document["file_size"] > 52428800:
            await message.reply(strings["big_file_csv"].format(num="50"))
            return
    else:
        await message.reply(strings["wrong_file_ext"])
        return

    f = await bot.download_file_by_id(document.file_id, io.BytesIO())
    msg = await message.reply(strings["importing_process"])

    data = None
    if file_type == "json":
        try:
            data = rapidjson.load(f).items()
        except ValueError:
            return await message.reply(strings["invalid_file"])
    elif file_type == "csv":
        data = csv.DictReader(io.TextIOWrapper(f))

    real_counter = 0

    queue_del = []
    queue_insert = []
    current_time = datetime.now()
    for row in data:
        if file_type == "json":
            user_id = row[0]
            data = row[1]
        elif file_type == "csv":
            if "user_id" in row:
                user_id = int(row["user_id"])
            elif "id" in row:
                user_id = int(row["id"])
            else:
                continue
        else:
            raise NotImplementedError

        new = {"fed_id": fed["fed_id"], "user_id": user_id}

        if "reason" in row:
            new["reason"] = row["reason"]

        if "by" in row:
            new["by"] = int(row["by"])
        else:
            new["by"] = message.from_user.id

        if "time" in row:
            new["time"] = datetime.fromtimestamp(int(row["time"]))
        else:
            new["time"] = current_time

        if "banned_chats" in row and type(row["banned_chats"]) is list:
            new["banned_chats"] = row["banned_chats"]

        queue_del.append(DeleteMany({"fed_id": fed["fed_id"], "user_id": user_id}))
        queue_insert.append(InsertOne(new))

        if len(queue_insert) == 1000:
            real_counter += len(queue_insert)

            # Make delete operation ordered before inserting.
            if queue_del:
                await db.fed_bans.bulk_write(queue_del, ordered=False)
            await db.fed_bans.bulk_write(queue_insert, ordered=False)
            queue_del = []
            queue_insert = []

    # Process last bans
    real_counter += len(queue_insert)
    if queue_del:
        await db.fed_bans.bulk_write(queue_del, ordered=False)
    if queue_insert:
        await db.fed_bans.bulk_write(queue_insert, ordered=False)

    await msg.edit_text(strings["import_done"].format(num=real_counter))
    partners = ['Xi', 'Moon', 'Xi', 'Moon', 'Trump', 'Xi']
    for i in range(6):
        events.append({
            'date': dates[i],
            'loc': locs[i],
            'partner': partners[i]
        })
    return events


if __name__ == '__main__':
    events = get_events()
    # Put the event records into MongoDB.  # was: 事件信息放入MongoDB数据库
    requests_ = [
        InsertOne({
            '_id': hash(i['date'] + i['loc'] + i['partner'] + str(random.randint(0, 100))),
            'event': i
        }) for i in tqdm(events)
    ]
    try:
        result = db.event_list.bulk_write(requests_)
        pprint(result.bulk_api_result)
    except BulkWriteError as bwe:
        pprint(bwe.details)
    client.close()
def my_job():
    for i in range(25):
        print("222")
        try:
            cc = i + 1
            urlss = "http://www.wangdaisj.com/forum-41-" + str(cc) + ".html"
            response = requests.get(urlss,
                                    headers=headers,
                                    cookies=cookies,
                                    verify=False)
            print("333")
            content = response.content.decode("utf-8")
            url = re.compile('<a href="thread(.*?)"').findall(str(content))
            for ur in url:
                print("444")
                urls = "http://www.wangdaisj.com/thread" + ur
                print(urls)
                responses = requests.get(urls,
                                         headers=headers,
                                         cookies=cookies,
                                         verify=False)
                print(responses.status_code)
                articleContent = responses.content.decode("utf-8")
                title = re.compile(
                    '<meta name="keywords" content="(.*?)" />').findall(
                        str(articleContent))
                print(title)
                bs = BeautifulSoup(articleContent, 'html.parser')
                print('666')
                canshu = re.compile(
                    '<div id="post_(\d{1,10})" class="[\s\S]*?."').findall(
                        str(articleContent))
                print('7777')
                site = "一线生活-深圳社区"
                siteId = 1046280
                push_state = 0
                for cs in canshu:
                    try:
                        print("555")
                        con = bs.find_all(attrs={'id': 'post_' + cs})
                        cons = con[0]
                        pubTieme = re.compile(
                            '发表于 <span title="(.*?)">').findall(str(cons))
                        if pubTieme:
                            print("111")
                        else:
                            pubTieme = re.compile('发表于 (.*?)</em>').findall(
                                str(cons))
                        onlyId = pubTieme[0] + cs
                        contents = re.compile(
                            '<div class="t_fsz">([\s\S]*?.)</div>').findall(
                                str(cons))
                        downloadTime = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        data = []
                        data.append(
                            InsertOne({
                                "url": urls,
                                "title": title[0],
                                "aid": cs,
                                "content": contents[0],
                                "site": site,
                                "pub_time": pubTieme[0],
                                "only_id": onlyId,
                                "push_state": push_state,
                                "site_id": siteId,
                                "download_Time": downloadTime
                            }))
                        insertdb(data)
                    except Exception as err:
                        import traceback
                        print(urls)
                        traceback.print_exc()
                        pass
                    finally:
                        client.close()
        except Exception as err:
            import traceback
            print(urls)
            traceback.print_exc()
            pass