def update_values(self, pd_obj, col_key):
    if pd_obj.empty:
        print('pd_obj empty, no data to update')
        return
    # Initial checks
    if isinstance(col_key, str):
        col_key = [col_key]
    if len(self._key_ls) != len(col_key):
        name = self._name
        full = len(self._key_ls)
        given = len(col_key)
        raise Exception(
            f'insert error: {name}. col_key must have {full} elements; '
            f'{given} were provided')
    if isinstance(pd_obj, pd.Series):
        df = pd.DataFrame(pd_obj)
    else:
        df = pd_obj.copy()
    # Reject duplicated column keys
    if df.columns.has_duplicates:
        raise Exception('inserted object cannot have duplicated keys!')
    # Integrity check: every column name must split into the expected key parts
    for col in df.columns:
        col_val_ls = col.split('.')
        if len(self._full_key_ls) - 1 != len(col_val_ls):
            raise Exception(
                f'all columns must have {len(col_key)} elements; '
                f'{col_val_ls} was provided')
    df.index = df.index.map(lambda i: i.strftime('%Y%m%d'))
    df = df.dropna(axis=0, how='all')
    df = df.dropna(axis=1, how='all')
    request_ls = []
    for col in df.columns:
        query_dd = dict(zip(col_key, col.split('.')))
        values_dd = df[col].dropna().to_dict()
        # Pull any existing entries for the incoming dates...
        date_ls = list(values_dd.keys())
        update_dd = {'$pull': {'values': {'date': {'$in': date_ls}}}}
        request_ls.append(pymongo.UpdateMany(query_dd, update_dd, upsert=True))
        # ...then push the fresh values, so re-inserts replace rather than duplicate
        new_ls = [{'date': k, 'value': v} for k, v in values_dd.items()]
        update_dd = {'$push': {'values': {'$each': new_ls}}}
        request_ls.append(pymongo.UpdateMany(query_dd, update_dd, upsert=True))
    # ordered=True guarantees each $pull lands before its paired $push
    self._get_collection().bulk_write(request_ls, ordered=True)
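# A minimal usage sketch for update_values, assuming a hypothetical
# TimeSeriesStore wrapper with _key_ls = ['ticker', 'field']; the class name,
# constructor, and sample data are illustrative, not from the original code.
import pandas as pd

idx = pd.to_datetime(['2023-01-02', '2023-01-03'])
df = pd.DataFrame({'AAPL.close': [125.0, 126.3]}, index=idx)
store = TimeSeriesStore(name='prices')  # hypothetical constructor
store.update_values(df, col_key=['ticker', 'field'])
# Each column maps to one query, e.g. {'ticker': 'AAPL', 'field': 'close'},
# with a $pull of the incoming dates followed by a $push of the new values,
# so re-running the same load is idempotent per date.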
def merge_players(self, src_player_id, target_player_id):
    # TODO this can probably be tested properly only with integration tests
    self.db.badge.bulk_write([
        pymongo.UpdateMany({'player_id': src_player_id},
                           {'$set': {'player_id': target_player_id}})
    ])
    self.db.special_score.bulk_write([
        pymongo.UpdateMany({'killer_id': src_player_id},
                           {'$set': {'killer_id': target_player_id}}),
        pymongo.UpdateMany({'victim_id': src_player_id},
                           {'$set': {'victim_id': target_player_id}})
    ])
    self.db.kill.bulk_write([
        pymongo.UpdateMany({'killer_id': src_player_id},
                           {'$set': {'killer_id': target_player_id}}),
        pymongo.UpdateMany({'victim_id': src_player_id},
                           {'$set': {'victim_id': target_player_id}})
    ])
    self.db.score.bulk_write([
        pymongo.UpdateMany({'player_id': src_player_id},
                           {'$set': {'player_id': target_player_id}})
    ])
    self.db.team_switch.bulk_write([
        pymongo.UpdateMany({'player_id': src_player_id},
                           {'$set': {'player_id': target_player_id}})
    ])
    rows = self.db.player_merge.find({
        'src_player_id': src_player_id,
        'target_player_id': target_player_id
    })
    if not list(rows):
        self.db.player_merge.insert_one({
            'src_player_id': src_player_id,
            'target_player_id': target_player_id
        })
def update_sramongo_pubmed_records(docs, collection):
    db_operations = []
    for doc in docs:
        db_operations.append(
            pymongo.UpdateMany(
                {"study.pubmed": doc.accn},
                {"$addToSet": {"papers": doc.to_mongo()}},
            )
        )
        # Write intermediate results every 500 queued operations
        if len(db_operations) > 500:
            res = collection.bulk_write(db_operations)
            logger.debug(res.bulk_api_result)
            db_operations = []
    if db_operations:
        res = collection.bulk_write(db_operations)
        logger.debug(res.bulk_api_result)
def update_sramongo_biosample_records(docs, collection):
    db_operations = []
    for doc in docs:
        db_operations.append(
            pymongo.UpdateMany(
                {"sample.biosample": doc.accn},
                {"$set": {"BioSample": doc.to_mongo()}},
            )
        )
        # Write intermediate results every 500 queued operations
        if len(db_operations) > 500:
            res = collection.bulk_write(db_operations)
            logger.debug(res.bulk_api_result)
            db_operations = []
    if db_operations:
        res = collection.bulk_write(db_operations)
        logger.debug(res.bulk_api_result)
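# The two updaters above differ only in their filter and update documents; the
# batching is identical. A generic sketch of that flush-every-N pattern,
# assuming only pymongo; the helper name and default batch size are
# illustrative, not from the original code.
import pymongo

def bulk_write_chunked(collection, operations, batch_size=500):
    """Submit requests in batches so no single bulk_write grows unbounded."""
    batch = []
    for op in operations:
        batch.append(op)
        if len(batch) >= batch_size:
            collection.bulk_write(batch)
            batch = []
    if batch:  # flush the remainder
        collection.bulk_write(batch)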
def add_field():
    col1 = connectTable("qiuzh", "mag_papers")
    # Map the Chinese discipline names used in the source files to English
    # labels. A dict lookup replaces the original chained str.replace calls,
    # which were buggy: "地学" was substituted before "地学天文", so the longer
    # name could never match.
    field_name_en = {
        "地学": "GEOGRAPHY",
        "地学天文": "ASTRONOMY",
        "工程技术": "ENGINEERING",
        "管理科学": "MANAGEMENT",
        "化学": "CHEMISTRY",
        "环境科学与生态学": "ENVIRONMENTAL SCIENCES",
        "农林科学": "AGRONOMY",
        "社会科学": "SOCIAL SCIENCE",
        "生物": "BIOLOGY",
        "数学": "MATHEMATICS",
        "物理": "PHYSICS",
        "医学": "MEDICINE",
        "综合性期刊": "MULTIDISCIPLINARY SCIENCES",
    }
    for field in field_name_en:
        operation = []
        print(field)
        with open(
                "C://Users//qzh//PycharmProjects//MAG//JournalDetailsWithID//"
                + field + ".txt", "r", encoding="gbk") as journal_detail:
            for line in journal_detail:
                a_journal_detail = json.loads(line)
                journal_ID = a_journal_detail[8]
                # Fall back to the raw name if it is not in the map, matching
                # the old replace-chain behaviour for unknown fields.
                journal_field = field_name_en.get(a_journal_detail[6][0],
                                                  a_journal_detail[6][0])
                journal_level = a_journal_detail[6][1]
                operation.append(
                    pymongo.UpdateMany(
                        {"venue.id": journal_ID},
                        {"$set": {"field": journal_field,
                                  "level": journal_level}}))
        print(len(operation))
        if operation:  # bulk_write raises InvalidOperation on an empty list
            col1.bulk_write(operation, ordered=False)
age_bins = {
    '<1': [0, 1],
    '1-5': [1, 5],
    '6-10': [6, 10],
    '11-17': [11, 17],
    '18-34': [18, 34],
    '35-49': [35, 49],
    '50-64': [50, 64],
    '65+': [65, 300],
}

# Cycle through the age bins to perform the groupings. Note that $gte/$lte are
# both inclusive, so the '<1' and '1-5' ranges overlap at age 1, and records on
# that boundary end up labelled by whichever bin is processed last.
for bin_name, (min_age, max_age) in age_bins.items():
    # Detail the requested change to make
    requests = [
        pymongo.UpdateMany(
            {args.age_variable: {"$gte": min_age, "$lte": max_age}},
            {"$set": {args.bin_variable: bin_name}},
        )
    ]
    # Perform the operation
    result = db[args.collection_name].bulk_write(requests)
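# If the boundary overlap matters, a hedged alternative is half-open intervals
# matched with $gte/$lt, so every age falls into exactly one bin. A sketch
# reusing the same args/db objects as above; the edge values are illustrative.
half_open_bins = {
    '<1': [0, 1], '1-5': [1, 6], '6-10': [6, 11], '11-17': [11, 18],
    '18-34': [18, 35], '35-49': [35, 50], '50-64': [50, 65], '65+': [65, 301],
}
requests = [
    pymongo.UpdateMany(
        {args.age_variable: {"$gte": lo, "$lt": hi}},
        {"$set": {args.bin_variable: bin_name}},
    )
    for bin_name, (lo, hi) in half_open_bins.items()
]
db[args.collection_name].bulk_write(requests, ordered=False)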
def merge_players(self, src_player_id, target_player_id):
    # TODO this can probably be tested properly only with integration tests
    self.db.badge.bulk_write([
        pymongo.UpdateMany(
            {"player_id": src_player_id},
            {"$set": {"player_id": target_player_id}},
        )
    ])
    self.db.special_score.bulk_write([
        pymongo.UpdateMany(
            {"killer_id": src_player_id},
            {"$set": {"killer_id": target_player_id}},
        ),
        pymongo.UpdateMany(
            {"victim_id": src_player_id},
            {"$set": {"victim_id": target_player_id}},
        ),
    ])
    self.db.kill.bulk_write([
        pymongo.UpdateMany(
            {"killer_id": src_player_id},
            {"$set": {"killer_id": target_player_id}},
        ),
        pymongo.UpdateMany(
            {"victim_id": src_player_id},
            {"$set": {"victim_id": target_player_id}},
        ),
    ])
    self.db.score.bulk_write([
        pymongo.UpdateMany(
            {"player_id": src_player_id},
            {"$set": {"player_id": target_player_id}},
        )
    ])
    self.db.team_switch.bulk_write([
        pymongo.UpdateMany(
            {"player_id": src_player_id},
            {"$set": {"player_id": target_player_id}},
        )
    ])
    self.db.player_stats.bulk_write([
        pymongo.UpdateMany(
            {"player_id": src_player_id},
            {"$set": {"player_id": target_player_id}},
        )
    ])
    rows = self.db.player_merge.find({
        "src_player_id": src_player_id,
        "target_player_id": target_player_id,
    })
    if not list(rows):
        self.db.player_merge.insert_one({
            "src_player_id": src_player_id,
            "target_player_id": target_player_id,
        })
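# Nothing ties the per-collection merges above together: a failure midway
# leaves the rename half-applied. On a replica set, a hedged sketch (written
# as if inside merge_players) of running the same updates in one
# multi-document transaction; bulk_write accepts a session, and 'client' is
# assumed to be the MongoClient behind self.db.
with client.start_session() as session:
    with session.start_transaction():
        for coll, key in [(self.db.badge, "player_id"),
                          (self.db.score, "player_id"),
                          (self.db.team_switch, "player_id"),
                          (self.db.player_stats, "player_id")]:
            coll.bulk_write(
                [pymongo.UpdateMany({key: src_player_id},
                                    {"$set": {key: target_player_id}})],
                session=session)
        # ...kill and special_score would update killer_id/victim_id the same way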
def save_annotations(
    dsFolder: types.GirderModel,
    upsert_list: Iterable[dict],
    delete_list: Iterable[int],
    user: types.GirderUserModel,
    description="save",
    overwrite=False,
):
    """
    Annotations are lazy-deleted: instead of removing documents, REVISION_DELETED
    is stamped with the revision that retired them.
    """
    datasetId = dsFolder['_id']
    expire_operations = []  # Mark existing records as deleted
    expire_result = {}
    insert_operations = []  # Insert new records
    insert_result = {}
    new_revision = get_last_revision(dsFolder) + 1
    delete_annotation_update = {'$set': {REVISION_DELETED: new_revision}}

    if overwrite:
        query = {DATASET: datasetId, REVISION_DELETED: {'$exists': False}}
        expire_result = (AnnotationItem().collection.bulk_write(
            [pymongo.UpdateMany(query, delete_annotation_update)]
        ).bulk_api_result)

    for track_id in delete_list:
        filter = {
            TRACKID: track_id,
            DATASET: datasetId,
            REVISION_DELETED: {'$exists': False},
        }
        # UpdateMany for safety, UpdateOne would also work
        expire_operations.append(
            pymongo.UpdateMany(filter, delete_annotation_update))

    for newdict in upsert_list:
        newdict.update({DATASET: datasetId, REVISION_CREATED: new_revision})
        newdict.pop(REVISION_DELETED, None)
        filter = {
            TRACKID: newdict['trackId'],
            DATASET: datasetId,
            REVISION_DELETED: {'$exists': False},
        }
        if not overwrite:
            # UpdateMany for safety, UpdateOne would also work
            expire_operations.append(
                pymongo.UpdateMany(filter, delete_annotation_update))
        insert_operations.append(pymongo.InsertOne(newdict))

    # ordered=False allows fast parallel writes
    if len(expire_operations):
        expire_result = (AnnotationItem().collection.bulk_write(
            expire_operations, ordered=False).bulk_api_result)
    if len(insert_operations):
        insert_result = (AnnotationItem().collection.bulk_write(
            insert_operations, ordered=False).bulk_api_result)

    additions = insert_result.get('nInserted', 0)
    deletions = expire_result.get('nModified', 0)

    if additions or deletions:
        # Write the revision to the log
        log_entry = models.RevisionLog(
            dataset=datasetId,
            author_name=user['login'],
            author_id=user['_id'],
            revision=new_revision,
            additions=additions,
            deletions=deletions,
            description=description,
        )
        RevisionLogItem().create(log_entry)

    return {"updated": additions, "deleted": deletions}
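# Under the lazy-delete scheme above, the "live" annotations are simply the
# documents without a REVISION_DELETED stamp. A minimal read-side sketch using
# the same DATASET/REVISION_DELETED constants; the function name is
# illustrative, not part of the original module.
def load_live_annotations(collection, dataset_id):
    """Fetch only annotations that no revision has retired yet."""
    return list(collection.find(
        {DATASET: dataset_id, REVISION_DELETED: {'$exists': False}}))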
def retweetFilterCleansing():
    masterData = []
    rawData = []
    retweetCountUpdate = []
    dataA = db.retweet_raw_data.find({"addData": "incomplete"})
    for item in dataA:
        status = item['data']['retweeted_status']
        try:
            strTime = addHours(status['created_at'])
            getDate = strTime.split(" ")
            getHour = getDate[3].split(":")
            month = getMonth(getDate[1])
            # Extended tweets carry the full text and entities under
            # 'extended_tweet'; classic tweets keep them at the top level.
            if "extended_tweet" in status:
                text = status['extended_tweet']['full_text']
                hashtags = status['extended_tweet']['entities']['hashtags']
            else:
                text = status['text']
                hashtags = status['entities']['hashtags']
            master_doc = {
                'university': item['university'],
                'keyword': item['keyword'],
                'id_str': status['id_str'],
                'created_at': strTime,
                'text': text,
                'user_id': status['user']['id_str'],
                'user_name': status['user']['name'],
                'user_img': status['user']['profile_image_url_https'],
                'user_followers': status['user']['followers_count'],
                'retweet_count': status['retweet_count'],
                'favorite_count': status['favorite_count'],
                'hashtags': hashtags,
                'retweet_1Day': 0,
                'timeCreate': dt.datetime(int(getDate[5]), month,
                                          int(getDate[2]), int(getHour[0]),
                                          int(getHour[1])),
                'timeUpdate': dt.datetime.today(),
            }
            # Tweets with media also record the first image URL.
            if "extended_entities" in status:
                master_doc['checkImg'] = True
                master_doc['img'] = (
                    status['extended_entities']['media'][0]['media_url_https'])
            db.master_data.insert_one(master_doc)
            db.retweet_update_data.insert_one({
                'university': item['university'],
                'id_str': status['id_str'],
                'retweet_count': status['retweet_count'],
                'favorite_count': status['favorite_count'],
                'state_rt': 0,
                'state_check': 0,
                'timeUpdate': dt.datetime.today(),
            })
            db.retweet_state_data.insert_one({
                'university': item['university'],
                'id_str': status['id_str'],
                'retweet_count': status['retweet_count'],
                'favorite_count': status['favorite_count'],
                'timeUpdate': dt.datetime.today() - dt.timedelta(minutes=1),
            })
            print('create_to_master')
            rawData.append(pymongo.UpdateOne(
                {'id_str': item['id_str']},
                {'$set': {'addData': 'complete'}}, upsert=True))
        except pymongo.errors.DuplicateKeyError:
            # The source tweet is already in master_data: refresh its counters.
            # The same update goes to both master_data and retweet_update_data.
            count_update = pymongo.UpdateOne(
                {'id_str': status['id_str']},
                {'$set': {
                    'retweet_count': status['retweet_count'],
                    'favorite_count': status['favorite_count'],
                    'timeUpdate': dt.datetime.today(),
                }}, upsert=True)
            masterData.append(count_update)
            retweetCountUpdate.append(count_update)
            print('update_to_master')
            rawData.append(pymongo.UpdateMany(
                {'id_str': item['id_str']},
                {'$set': {'addData': 'complete'}}, upsert=True))
    if len(masterData) > 0:
        db.master_data.bulk_write(masterData, ordered=False)
        db.retweet_update_data.bulk_write(retweetCountUpdate, ordered=False)
    if len(rawData) > 0:
        db.retweet_raw_data.bulk_write(rawData, ordered=False)
def retweetFilterCleansing():
    masterData = []
    rawData = []
    retweetCountUpdate = []
    dataA = db.retweet_raw_data.find({"addData": "incomplete"})
    for item in dataA:
        status = item['data']['retweeted_status']
        try:
            strTime = addHours(status['created_at'])
            # Extended tweets carry the full text and entities under
            # 'extended_tweet'; classic tweets keep them at the top level.
            if "extended_tweet" in status:
                text = status['extended_tweet']['full_text']
                hashtags = status['extended_tweet']['entities']['hashtags']
            else:
                text = status['text']
                hashtags = status['entities']['hashtags']
            db.master_data.insert_one({
                'university': item['university'],
                'keyword': item['keyword'],
                'id_str': status['id_str'],
                'created_at': strTime,
                'text': text,
                'user_id': status['user']['id_str'],
                'user_name': status['user']['name'],
                'user_screen-name': status['user']['screen_name'],
                'user_followers': status['user']['followers_count'],
                'retweet_count': status['retweet_count'],
                'favorite_count': status['favorite_count'],
                'hashtags': hashtags,
                'timeUpdate': dt.datetime.today(),
            })
            db.retweet_update_data.insert_one({
                'university': item['university'],
                'id_str': status['id_str'],
                'retweet_count': 0,
                'favorite_count': 0,
                'timeUpdate': dt.datetime.today(),
            })
            print('create_to_master')
            rawData.append(pymongo.UpdateOne(
                {'id_str': item['id_str']},
                {'$set': {'addData': 'complete'}}, upsert=True))
        except pymongo.errors.DuplicateKeyError:
            # Already in master_data: refresh the user fields and counters.
            masterData.append(pymongo.UpdateOne(
                {'id_str': status['id_str']},
                {'$set': {
                    'user_name': status['user']['name'],
                    'user_screen-name': status['user']['screen_name'],
                    'user_followers': status['user']['followers_count'],
                    'retweet_count': status['retweet_count'],
                    'favorite_count': status['favorite_count'],
                    'timeUpdate': dt.datetime.today(),
                }}, upsert=True))
            # Accumulate the delta since the last master snapshot into
            # retweet_update_data. Both find_one calls read the pre-bulk
            # state, since the queued updates only run at the end.
            idUpdate = db.retweet_update_data.find_one(
                {'id_str': status['id_str']})
            masterId = db.master_data.find_one({'id_str': status['id_str']})
            retweet = status['retweet_count'] - masterId['retweet_count']
            favourite = status['favorite_count'] - masterId['favorite_count']
            retweetCountUpdate.append(pymongo.UpdateOne(
                {'id_str': status['id_str']},
                {'$set': {
                    'retweet_count': idUpdate['retweet_count'] + retweet,
                    'favorite_count': idUpdate['favorite_count'] + favourite,
                    'timeUpdate': dt.datetime.today(),
                }}, upsert=True))
            print('update_to_master')
            rawData.append(pymongo.UpdateMany(
                {'id_str': item['id_str']},
                {'$set': {'addData': 'complete'}}, upsert=True))
    if len(masterData) > 0:
        db.master_data.bulk_write(masterData, ordered=False)
        db.retweet_update_data.bulk_write(retweetCountUpdate, ordered=False)
    if len(rawData) > 0:
        db.retweet_raw_data.bulk_write(rawData, ordered=False)
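# The duplicate-key branch above reads the current counters with find_one,
# computes a delta in Python, then queues a $set: two processes racing on the
# same tweet can clobber each other's update. A hedged alternative under the
# same schema is to let MongoDB apply the delta atomically with $inc; variable
# names follow the rewrite above and are otherwise illustrative.
retweet = status['retweet_count'] - masterId['retweet_count']
favourite = status['favorite_count'] - masterId['favorite_count']
retweetCountUpdate.append(pymongo.UpdateOne(
    {'id_str': status['id_str']},
    {'$inc': {'retweet_count': retweet, 'favorite_count': favourite},
     '$set': {'timeUpdate': dt.datetime.today()}},
    upsert=True))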