def summarize_query_alignments(self):
    """For each query, summarize the number and quality of alignments

    THIS METHOD IS TOO SLOW!
    """
    # Disabled: too slow. See get_alignment_summary() for the aggregation-based version.
    return

    query_maps = self.get_query_maps()
    with switch_collection(Alignment, self.alignment_collection) as A, \
         switch_collection(Map, self.map_collection) as M:
        n = query_maps.count()
        for i, q in enumerate(query_maps):
            print 'working on map %i of %i %s' % (i + 1, n, q.name)
            aln_summary_doc = MapAlignmentSummary()
            query_alns = A.objects.filter(query_id=q.name).order_by('total_score_rescaled')
            aln_count = query_alns.count()
            aln_summary_doc.aln_count = aln_count
            if aln_count > 0:
                best_aln = query_alns[0]
                # Cast to AlignmentEmbedded
                aln_summary_doc.best_aln = AlignmentEmbedded(**best_aln._data)
                aln_summary_doc.best_m_score = best_aln.m_score
                aln_summary_doc.best_query_miss_rate = best_aln.query_miss_rate
                aln_summary_doc.best_ref_miss_rate = best_aln.ref_miss_rate
                aln_summary_doc.best_query_scaling_factor = best_aln.query_scaling_factor
            q.alignment_summary = aln_summary_doc
            q.save()
def update_symptom_codes():
    """Update symptom codes from the old format to the new format.

    Key difference: the symptom code is no longer used as a primary key,
    since the codes have been deprecated.
    """
    from dcmetrometrics.eles import models, defs
    from mongoengine.context_managers import switch_db, switch_collection

    # Fish out symptom codes in the old format from the symptom_codes collection
    # and back them up to the symptom_codes_old collection.
    with switch_collection(models.SymptomCodeOld, "symptom_codes") as SymptomCodeOld:
        old_symptoms = list(SymptomCodeOld.objects)
        for s in old_symptoms:
            # Make a backup of the old symptom codes
            s.switch_collection('symptom_codes_old')
            s.save()  # Save to the backup collection

    # Remove the symptom collection - out with the old, in with the new!
    models.SymptomCode.drop_collection()

    with switch_collection(models.SymptomCodeOld, "symptom_codes_old") as SymptomCodeOld:
        s_old = list(SymptomCodeOld.objects)
        for s in s_old:
            s_new = s.make_new_format()
            if not s_new.category:
                s_new.category = defs.symptomToCategory[s_new.description]
            print "saving: ", s_new
            s_new.save()
def process_one_file(filepath, dir_path, filename, cover_mode):
    '''
    Process a regular file.

    If a record for the file already exists in the db, overwrite or skip it
    according to cover_mode; if it does not exist in the db, add it.
    '''
    global Mybucket
    dirId = getDirId(dir_path)  # Get the id of the file's parent directory
    if isFileExists(filename, dirId):  # The file already exists in the db
        if cover_mode:
            with switch_collection(Mybucket, collection_name) as Mybucket:
                for u in Mybucket.objects(Q(na=filename) & Q(did=dirId)):
                    # Delete the old record and its object
                    object_to_delete = getObjectId(filename, dirId)  # Object name
                    delete_object(str(object_to_delete))  # Delete the rados object
                    u.delete()
                size = os.path.getsize(filepath)  # File size in bytes
                Mybucket(na=filename, fod=True, did=dirId, si=size).save()  # Add the new record
                obj_name = getObjectId(filename, dirId)
                with open(filepath, 'rb') as fo:
                    storeToRados(str(obj_name), fo, size)  # Write to rados
    else:
        size = os.path.getsize(filepath)  # File size in bytes
        with switch_collection(Mybucket, collection_name) as Mybucket:
            Mybucket(na=filename, fod=True, did=dirId, si=size).save()  # Add the new record
            obj_name = getObjectId(filename, dirId)
            with open(filepath, 'rb') as fo:
                storeToRados(str(obj_name), fo, size)  # Write to rados
def test_switch_collection_context_manager(self):
    connect('mongoenginetest')
    register_connection('testdb-1', 'mongoenginetest2')

    class Group(Document):
        name = StringField()

    Group.drop_collection()
    with switch_collection(Group, 'group1') as Group:
        Group.drop_collection()

    Group(name="hello - group").save()
    self.assertEqual(1, Group.objects.count())

    with switch_collection(Group, 'group1') as Group:
        self.assertEqual(0, Group.objects.count())
        Group(name="hello - group1").save()
        self.assertEqual(1, Group.objects.count())
        Group.drop_collection()
        self.assertEqual(0, Group.objects.count())

    self.assertEqual(1, Group.objects.count())
def test_switch_collection_context_manager(self):
    clear_document_registry()
    connect("mongoenginetest")
    register_connection(alias="testdb-1", db="mongoenginetest2")

    class Group(Document):
        name = StringField()

    Group.drop_collection()  # drops in default

    with switch_collection(Group, "group1") as Group:
        Group.drop_collection()  # drops in group1

    Group(name="hello - group").save()
    assert 1 == Group.objects.count()

    with switch_collection(Group, "group1") as Group:
        assert 0 == Group.objects.count()
        Group(name="hello - group1").save()
        assert 1 == Group.objects.count()
        Group.drop_collection()
        assert 0 == Group.objects.count()

    assert 1 == Group.objects.count()
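# A minimal standalone sketch of the semantics the two tests above pin down:
# switch_collection mutates the class in place for the duration of the `with`
# block and restores the default collection on exit; the `as` target is the
# same class object, so the binding is optional. The names here ('demo_db',
# Event, 'event_archive') are hypothetical.
from mongoengine import Document, StringField, connect
from mongoengine.context_managers import switch_collection

connect('demo_db')

class Event(Document):
    name = StringField()

Event(name='live').save()  # goes to the default 'event' collection
with switch_collection(Event, 'event_archive'):
    Event(name='archived').save()  # goes to 'event_archive', no `as` needed
    assert Event.objects.count() == 1
assert Event.objects(name='live').count() == 1  # default collection restored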
def delete(self, *args, **kwargs):
    """Delete the experiment instance and related collections"""
    with switch_collection(Alignment, self.alignment_collection) as A:
        A.drop_collection()
    with switch_collection(Map, self.map_collection) as M:
        M.drop_collection()
    Document.delete(self, *args, **kwargs)
def get_data_rumor(ID):
    with switch_collection(Stock, 'TRD_T') as StockS:
        data = StockS.objects(Stkcd=ID).all()
        date = [x['Trddt'] for x in data]
        data_temp = [[
            x['Opnprc'], x['Hiprc'], x['Loprc'], x['Clsprc'], x['Dnshrtrd'],
            x['Dnvaltrd'], x['Dsmvosd'], x['Dsmvtll'], x['Dretwd'], x['Adjprcwd']
        ] for x in data]
        data_form = form_data(data_temp)
        x_train = []
        y_train = []
        length = len(data_form)
        with switch_collection(Rumor, 'TRD_rumor') as RumorS:
            rumor = RumorS.objects(Stkcd=ID).all()
            rumor_dict = {}
            for item in rumor:
                rumor_dict.setdefault(item['Qdate'].replace('/', '-'), [
                    item['QLabel'], item['QPositive'], item['QNegtive'],
                    item['ALabel'], item['APositive'], item['ANegtive']
                ])
            for i in range(length):
                if i < length - 5:
                    temp_time = []
                    for j in range(5):
                        daily = copy.deepcopy(data_form[i + j])
                        if date[i + j] in rumor_dict:
                            daily.extend(rumor_dict[date[i + j]])
                        else:
                            daily.extend([0] * 6)  # no rumor features for this day
                        temp_time.append(daily)
                    x_train.append(temp_time)
                    # Label: 0 if the close falls from day i+4 to day i+5, else 1
                    if data_temp[i + 4][3] > data_temp[i + 5][3]:
                        y_train.append(0)
                    else:
                        y_train.append(1)
    return x_train, y_train
def load(filename=DEFAULT_GEOZONES_FILE, drop=False):
    '''
    Load a geozones archive from <filename>

    <filename> can be either a local path or a remote URL.
    '''
    ts = datetime.now().isoformat().replace('-', '').replace(':', '').split('.')[0]
    prefix = 'geozones-{0}'.format(ts)
    if filename.startswith('http'):
        log.info('Downloading GeoZones bundle: %s', filename)
        # Use tmp.open to make sure that the directory exists in FS
        with tmp.open(GEOZONE_FILENAME, 'wb') as newfile:
            newfile.write(requests.get(filename).content)
        filename = tmp.path(GEOZONE_FILENAME)

    log.info('Extracting GeoZones bundle')
    with handle_error(prefix):
        with contextlib.closing(lzma.LZMAFile(filename)) as xz:
            with tarfile.open(fileobj=xz) as f:
                f.extractall(tmp.path(prefix))

    log.info('Loading GeoZones levels')

    log.info('Loading levels.msgpack')
    levels_filepath = tmp.path(prefix + '/levels.msgpack')
    if drop and GeoLevel.objects.count():
        name = '_'.join((GeoLevel._get_collection_name(), ts))
        target = GeoLevel._get_collection_name()
        with switch_collection(GeoLevel, name):
            with handle_error(prefix, GeoLevel):
                total = load_levels(GeoLevel, levels_filepath)
                GeoLevel.objects._collection.rename(target, dropTarget=True)
    else:
        with handle_error(prefix):
            total = load_levels(GeoLevel, levels_filepath)
    log.info('Loaded {total} levels'.format(total=total))

    log.info('Loading zones.msgpack')
    zones_filepath = tmp.path(prefix + '/zones.msgpack')
    if drop and GeoZone.objects.count():
        name = '_'.join((GeoZone._get_collection_name(), ts))
        target = GeoZone._get_collection_name()
        with switch_collection(GeoZone, name):
            with handle_error(prefix, GeoZone):
                total = load_zones(GeoZone, zones_filepath)
                GeoZone.objects._collection.rename(target, dropTarget=True)
    else:
        with handle_error(prefix):
            total = load_zones(GeoZone, zones_filepath)
    log.info('Loaded {total} zones'.format(total=total))

    cleanup(prefix)
def get_data(ID):
    with switch_collection(Stock, 'TRD_old') as StockS:
        data = StockS.objects(Stkcd=ID).all()
        data_temp = [[
            x['Opnprc'], x['Hiprc'], x['Loprc'], x['Clsprc'], x['Dnshrtrd'],
            x['Dnvaltrd'], x['Dsmvosd'], x['Dsmvtll'], x['Dretwd'], x['Dretnd'],
            x['Adjprcwd'], x['Adjprcnd'], x['Markettype'], x['Trdsta']
        ] for x in data]
        data_form = form_data(data_temp)
        x_train = []
        y_train = []
        length = len(data_form)
        for i in range(length):
            if i < length - 10:
                temp_time = []
                for j in range(5):
                    temp_time.append(data_form[i + j])
                x_train.append(temp_time)
                # Label: compare the close sums of days 0-4 and days 5-9
                # (an earlier variant compared day i+5's close to its open)
                if (data_temp[i + 0][3] + data_temp[i + 1][3] + data_temp[i + 2][3]
                        + data_temp[i + 3][3] + data_temp[i + 4][3]) < (
                        data_temp[i + 5][3] + data_temp[i + 6][3] + data_temp[i + 7][3]
                        + data_temp[i + 8][3] + data_temp[i + 9][3]):
                    y_train.append(0)
                else:
                    y_train.append(1)
    return x_train, y_train
def insert_security_code(self, market, file_name, path):
    database = DatabaseName.INDEX_KLINE_DAILY.value
    with MongoConnect(database):
        print(path + file_name + '\n')
        kline_daily_data = pd.read_csv(path + file_name, encoding='unicode_escape')
        code = file_name.split('.')[0]
        # Map the exchange's composite-index codes to their market codes
        code_transfer_dict = {'999999': '000001', '999998': '000002', '999997': '000003',
                              '999996': '000004', '999995': '000005', '999994': '000006',
                              '999993': '000007', '999992': '000008', '999991': '000010',
                              '999990': '000011', '999989': '000012', '999988': '000013',
                              '999987': '000016', '999986': '000015', '000300': '000300'}
        if market == 'SH':
            if code in code_transfer_dict.keys():
                code = code_transfer_dict[code]
            else:
                code = '00' + code[2:]
        security_code = code + '.' + market
        # The source csv misspells these headers ('volumw', 'turover'); rename them
        kline_daily_data = kline_daily_data.reindex(
            columns=['date', 'open', 'high', 'low', 'close', 'volumw', 'turover',
                     'match_items', 'interest'])
        kline_daily_data.rename(columns={'volumw': 'volume', 'turover': 'amount'}, inplace=True)
        with switch_collection(Kline, security_code) as KlineDaily_security_code:
            doc_list = []
            for index, row in kline_daily_data.iterrows():
                date_int = row['date']
                if not np.isnan(date_int):  # check before int(): int(nan) raises
                    time_tag = datetime.strptime(str(int(date_int)), "%Y%m%d")
                    doc = KlineDaily_security_code(time_tag=time_tag, pre_close=None,
                                                   open=int(row['open']), high=int(row['high']),
                                                   low=int(row['low']), close=int(row['close']),
                                                   volume=int(row['volume']),
                                                   amount=int(row['amount']),
                                                   match_items=int(row['match_items']),
                                                   interest=int(row['interest']))
                    doc_list.append(doc)
            KlineDaily_security_code.objects.insert(doc_list)
def get_alignment_summary(self):
    with switch_collection(Alignment, self.alignment_collection) as A:
        A.ensure_indexes()
        res = A.objects.aggregate(
            {'$sort': {'query_id': 1, 'total_score_rescaled': 1}},
            {'$group': {
                '_id': '$query_id',
                'aln_count': {'$sum': 1},
                'best_score': {'$first': '$total_score_rescaled'},
                'best_m_score': {'$first': '$m_score'},
                'best_query_miss_rate': {'$first': '$query_miss_rate'},
                'best_ref_miss_rate': {'$first': '$ref_miss_rate'},
                'best_query_scaling_factor': {'$first': '$query_scaling_factor'},
            }},
            allowDiskUse=True)

    def fix_d(d):
        # Remap _id to query_id
        d['query_id'] = d['_id']
        del d['_id']
        return d

    return [fix_d(d) for d in res]
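# Note on the aggregate() call style above: passing pipeline stages as
# separate positional arguments works on older mongoengine releases; newer
# releases expect the pipeline as a single list. A hedged equivalent of the
# call inside the `with` block (same stages, list form):
pipeline = [
    {'$sort': {'query_id': 1, 'total_score_rescaled': 1}},
    {'$group': {
        '_id': '$query_id',
        'aln_count': {'$sum': 1},
        'best_score': {'$first': '$total_score_rescaled'},
        'best_m_score': {'$first': '$m_score'},
        'best_query_miss_rate': {'$first': '$query_miss_rate'},
        'best_ref_miss_rate': {'$first': '$ref_miss_rate'},
        'best_query_scaling_factor': {'$first': '$query_scaling_factor'},
    }},
]
res = A.objects.aggregate(pipeline, allowDiskUse=True)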
def density_near_query(output_path, dataset: DatasetEnum, ids_file):
    with open(ids_file, 'r') as f:
        ids = json.load(f)
    ds_ids = ids[dataset.value]

    # Radii in kilometres for each density column
    thresholds = {
        "density_10m": 0.01,
        "density_100m": 0.1,
        "density_500m": 0.5,
        "density_1km": 1,
    }
    # Build the record type once rather than on every iteration
    DensityRecord = make_dataclass("DensityRecord", [("photo_id", int),
                                                     ("density_10m", int),
                                                     ("density_100m", int),
                                                     ("density_500m", int),
                                                     ("density_1km", int)])

    density_records = []
    for photo_id in tqdm(ds_ids):
        q_photo = FlickrPhoto.objects(photo_id=photo_id).first()
        coords = q_photo.geo.coordinates
        densities = dict()
        with switch_collection(FlickrPhoto, "flickr.db1") as DbFlickrPhoto:
            for density_type, radius in thresholds.items():
                densities[density_type] = DbFlickrPhoto.count_photos_in_radius(coords, radius)
        density_records.append(DensityRecord(photo_id, **densities))

    df = pd.DataFrame(density_records)
    df.to_csv(output_path, index=False)
def get_histories(self, cname, **kwargs):
    cn = self.get_collection_name(cname)
    with switch_collection(History, cn) as _history:
        cursor = _history.objects.filter(**kwargs)
        result = cursor.order_by('-time')
    return result
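# The queryset returned by get_histories is lazy and outlives the context
# manager. mongoengine binds the underlying pymongo collection when the
# queryset is created, so this usually still reads from the switched
# collection; still, a defensive variant can materialize the results while
# the switch is unambiguously active. A sketch (same hypothetical History
# model as above):
def get_histories_eager(self, cname, **kwargs):
    cn = self.get_collection_name(cname)
    with switch_collection(History, cn) as _history:
        # Force evaluation inside the `with` block.
        return list(_history.objects.filter(**kwargs).order_by('-time'))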
def update_index_data(self, end=None):
    """
    Export index daily kline data from mongo to per-field local hdf5 files.

    :param end: cut-off datetime for the export (defaults to now)
    :return: None
    """
    # A datetime.now() default argument would be evaluated once at import
    # time, so resolve the default at call time instead.
    if end is None:
        end = datetime.now()
    get_collection_list = GetCollectionList()
    index_list = get_collection_list.get_index_list()
    self.end = end
    database = DatabaseName.INDEX_KLINE_DAILY.value
    with MongoConnect(database):
        index_data_dict = {}
        for index_code in index_list:
            with switch_collection(Kline, index_code) as KlineDaily_index_code:
                security_code_data = KlineDaily_index_code.objects(time_tag__lte=self.end).as_pymongo()
                security_code_data_df = pd.DataFrame(list(security_code_data)).reindex(columns=self.field)
                security_code_data_df.set_index(["time_tag"], inplace=True)
                index_data_dict[index_code] = security_code_data_df
        field_data_dict = {}
        for i in self.field:
            if i != 'time_tag':
                field_data_pd = pd.DataFrame({key: value[i] for key, value in index_data_dict.items()})
                # Divide the raw open/high/low/close by 10000
                if i in ['open', 'high', 'low', 'close']:
                    field_data_dict[i] = field_data_pd.div(10000)
                else:
                    field_data_dict[i] = field_data_pd
        folder_name = LocalDataFolderName.MARKET_DATA.value
        sub_folder_name = LocalDataFolderName.KLINE_DAILY.value
        sub_sub_folder_name = LocalDataFolderName.INDEX.value
        for field in self.field:
            if field not in ['time_tag', 'interest']:
                path = LocalDataPath.path + folder_name + '/' + sub_folder_name + '/' + sub_sub_folder_name + '/'
                data_name = field
                save_data_to_hdf5(path, data_name, pd.DataFrame(field_data_dict[field]))
def get_summary(self):
    """Return a summary of this experiment"""
    ret = {}
    ret['name'] = self.name
    ret['description'] = self.description
    ret['created'] = self.created
    ret['num_query_maps'] = len(self.get_query_map_ids())
    ret['num_ref_maps'] = len(self.get_ref_map_ids())

    with switch_collection(Alignment, self.alignment_collection) as A:
        A.ensure_indexes()
        ret['num_alignments'] = A.objects.count()

    # Get a list of query_ids with the number of alignments for each query,
    # via an aggregation instead of counting in memory.
    ret['aligned_queries'] = self.get_alignment_summary()

    return ret
def save_tick_data(self, ticks: List[TickData], collection_name: str = None) -> bool:
    """"""
    for tick in ticks:
        tick.datetime = convert_tz(tick.datetime)

        d = tick.__dict__
        d["exchange"] = d["exchange"].value
        d["interval"] = d["interval"].value
        d.pop("gateway_name")
        d.pop("vt_symbol")
        param = to_update_param(d)

        if not collection_name:
            DbTickData.objects(
                symbol=d["symbol"],
                exchange=d["exchange"],
                datetime=d["datetime"],
            ).update_one(upsert=True, **param)
        else:
            with switch_collection(DbTickData, collection_name):
                DbTickData.objects(
                    symbol=d["symbol"],
                    exchange=d["exchange"],
                    datetime=d["datetime"],
                ).update_one(upsert=True, **param)

    return True
def post_data(self, data: typing.Union[typing.MutableMapping[str, str],
                                       typing.List[typing.MutableMapping[str, str]]]):
    def create_document(row: typing.MutableMapping[str, str]):
        kwargs = {key: _type_convert(val) for key, val in row.items()}
        # Can't store field 'id' in document - rename it
        if 'id' in kwargs:
            kwargs[self.id_field_alias] = kwargs.pop('id')
        return kwargs

    # Put data in collection belonging to this data source
    with context_managers.switch_collection(CsvRow, self.location) as collection:
        collection = collection._get_collection()
        try:
            # Data is a dictionary - a single row
            collection.insert_one(create_document(data))
        except AttributeError:
            # Data is a list of dictionaries - multiple rows
            documents = (create_document(row) for row in data)
            collection.insert_many(documents)
def add_dialog_one(self, uurl, ddialog, col_name):
    save_url = uurl
    save_dialog = ddialog
    dialog_obj = Dialog(url=save_url, dialog=save_dialog)
    # Select the target collection here
    with switch_collection(Dialog, col_name):
        return dialog_obj.save()
def get_response(self, params: typing.Optional[typing.Mapping[str, str]] = None):
    # TODO accept parameters provided twice as an inclusive OR
    if params is None:
        params = {}
    params = {key: _type_convert(val) for key, val in params.items()}

    with context_managers.switch_collection(CsvRow, self.location) as collection:
        records = collection.objects.filter(**params).exclude('_id')
        data = list(records.as_pymongo())

    # Couldn't store field 'id' in document - recover it
    for item in data:
        try:
            item['id'] = item.pop(self.id_field_alias)
        except KeyError:
            pass

    return JsonResponse({
        'status': 'success',
        'data': data,
    })
def switch_collection(self, collection_name, keep_created=True):
    """
    Temporarily switch the collection for a document instance.

    Only really useful for archiving off data and calling `save()`::

        user = User.objects.get(id=user_id)
        user.switch_collection('old-users')
        user.save()

    :param str collection_name: The collection name to use for saving the
        document
    :param bool keep_created: keep self._created value after switching
        collection, else it is reset to True

    .. seealso::
        Use :class:`~mongoengine.context_managers.switch_db`
        if you need to read from another database
    """
    with switch_collection(self.__class__, collection_name) as cls:
        collection = cls._get_collection()
    self._get_collection = lambda: collection
    self._collection = collection
    self._created = True if not keep_created else self._created
    self.__objects = self._qs
    self.__objects._collection_obj = collection
    return self
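# A usage sketch for the keep_created flag above (the User model and user_id
# are hypothetical): with keep_created=False, _created is reset to True, so
# the next save() inserts into the archive collection even though the
# document already carries a primary key from the original collection.
user = User.objects.get(id=user_id)
user.switch_collection('old-users', keep_created=False)
user.save()  # inserts into 'old-users'; the source document is untouched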
def log_cleanplus():
    # Log cleanplus DB entry
    with switch_collection(Cleanplus, __colcleanplus) as CCleanplus:
        num_saved, num_unsaved = 0, 0
        unsaved = []
        for c in cleanplus:
            data = CCleanplus(
                service=c['service'],
                year=c['year'],
                department=c['department'],
                team=c['team'],
                start_date=c['start_date'],
                end_date=c['end_date'],
                budget_summary=c['budget_summary'],
                budget_assigned=c['budget_assigned'],
                budget_current=c['budget_assigned'],
                budget_contract=c['budget_contract'],
                budget_spent=c['budget_spent']
            )
            try:
                data.save()
                num_saved += 1
            except Exception:
                unsaved.append(c)
                num_unsaved += 1
    with open('error/unsaved_cleanplus.json', 'w') as f:
        json.dump(unsaved, f)
    print "CLEANPLUS: Logged", num_saved, "items,", num_unsaved, "unsaved items, total:", num_saved + num_unsaved
def load_tick_data(
    self,
    symbol: str,
    exchange: Exchange,
    start: datetime,
    end: datetime,
    collection_name: str = None,
) -> Sequence[TickData]:
    if collection_name is None:
        s = DbTickData.objects(
            symbol=symbol,
            exchange=exchange.value,
            datetime__gte=start,
            datetime__lte=end,
        )
    else:
        with switch_collection(DbTickData, collection_name):
            s = DbTickData.objects(
                symbol=symbol,
                exchange=exchange.value,
                datetime__gte=start,
                datetime__lte=end,
            )
    data = [db_tick.to_tick() for db_tick in s]
    return data
def load_bar_data(
    self,
    symbol: str,
    exchange: Exchange,
    interval: Interval,
    start: datetime,
    end: datetime,
    collection_name: str = None,
) -> Sequence[BarData]:
    if collection_name is None:
        s = DbBarData.objects(
            symbol=symbol,
            exchange=exchange.value,
            interval=interval.value,
            datetime__gte=start,
            datetime__lte=end,
        )
    else:
        with switch_collection(DbBarData, collection_name):
            s = DbBarData.objects(
                symbol=symbol,
                exchange=exchange.value,
                interval=interval.value,
                datetime__gte=start,
                datetime__lte=end,
            )
    data = [db_bar.to_bar() for db_bar in s]
    return data
def load_tick_data(
    self,
    symbol: str,
    exchange: Exchange,
    start: datetime,
    end: datetime,
    collection_name: str = None,
) -> List[TickData]:
    """"""
    if not collection_name:
        s: QuerySet = DbTickData.objects(
            symbol=symbol,
            exchange=exchange.value,
            datetime__gte=convert_tz(start),
            datetime__lte=convert_tz(end),
        )
    else:
        # Switch DbTickData, not DbBarData, when loading ticks
        with switch_collection(DbTickData, collection_name):
            s: QuerySet = DbTickData.objects(
                symbol=symbol,
                exchange=exchange.value,
                datetime__gte=convert_tz(start),
                datetime__lte=convert_tz(end),
            )

    vt_symbol = f"{symbol}.{exchange.value}"
    ticks: List[TickData] = []
    for db_tick in s:
        db_tick.datetime = DB_TZ.localize(db_tick.datetime)
        db_tick.exchange = Exchange(db_tick.exchange)
        db_tick.gateway_name = "DB"
        db_tick.vt_symbol = vt_symbol
        ticks.append(db_tick)

    return ticks
def load_user(username):
    with switch_collection(User, 'users') as toGet:
        # .get() would raise DoesNotExist instead of returning None,
        # so use .first() to make the None check meaningful.
        user = toGet.objects(username__exact=username).first()
    return user
def process_one_path(path, cover_mode, system_mode):
    '''
    Process the given directory, adding metadata for the directory and for
    all files and subdirectories under it to the db.
    '''
    global recursive_flag
    global Mybucket
    recursive_flag = recursive_flag + 1  # One level deeper in the recursion
    if recursive_flag == 1:
        if not isDirExists(path):  # The directory does not exist in the db
            with switch_collection(Mybucket, collection_name) as Mybucket:
                Mybucket(na=path, fod=False).save()  # Create the record and add it to the db
    files = os.listdir(path)  # List all subdirectories and files under the directory
    for file in files:
        if file in break_names:  # Skip files on the skip list
            continue
        if path == '/':  # The Linux root directory '/'
            current_path = path + file
        else:
            current_path = path + '/' + file
        if isSysOrHide(system_mode, file, current_path):  # System or hidden file
            continue
        if os.path.isdir(current_path):  # A directory
            if not isDirExists(current_path):  # The directory does not exist in the db
                if isDirExists(path):  # Its parent directory exists in the db
                    parentId = getDirId(path)  # Get the parent directory id
                    with switch_collection(Mybucket, collection_name) as Mybucket:
                        Mybucket(na=current_path, fod=False, did=parentId).save()  # Create the record
                else:
                    print("Error: no parent path")
                    sys.exit()
            process_one_path(current_path, cover_mode, system_mode)  # Recurse into the subdirectory
        elif os.path.isfile(current_path):  # A regular file
            process_one_file(current_path, path, file, cover_mode)  # Process the file
        else:
            print("Warning: ", current_path, " is not a file or path")
    recursive_flag = recursive_flag - 1  # This recursion level is done
def log_budgetspider():
    # Clear previous unmatched record
    with open("error/unmatched.tsv", 'w') as f:
        pass

    # Small opengov objects (names only; assumed sorted for bsearch below)
    sopengov = []
    for i in opengov:
        sopengov.append(i['name'])

    # Binary search
    def bsearch(a, x, lo=0, hi=None):
        hi = hi or len(a)
        pos = bisect_left(a, x, lo, hi)
        return (pos if pos != hi and a[pos] == x else -1)

    # Log budgetspider DB entry
    with switch_collection(Budgetspider, __colbudgetspider) as CBudgetspider:
        num_unmatched, num_saved, num_unsaved, num_past = 0, 0, 0, 0
        unmatched, unsaved = [], []
        for c in cleanplus:
            if int(c['year']) < 2010:
                num_past += 1
                continue
            search_idx = bsearch(sopengov, c['service'])
            if search_idx != -1 and c['service'] == opengov[search_idx]['name']:
                data = CBudgetspider(
                    service=c['service'],
                    year=c['year'],
                    start_date=c['start_date'],
                    end_date=c['end_date'],
                    department=c['department'],
                    team=c['team'],
                    category_one=opengov[search_idx]['level_one'],
                    category_two=opengov[search_idx]['level_two'],
                    category_three=opengov[search_idx]['level_three'],
                    budget_summary=c['budget_summary'],
                    budget_assigned=c['budget_assigned'],
                    budget_current=c['budget_current'],
                    budget_contract=c['budget_contract'],
                    budget_spent=c['budget_spent']
                )
                try:
                    data.save()
                    num_saved += 1
                except Exception:
                    num_unsaved += 1
                    unsaved.append((search_idx, c, opengov[search_idx]))
            else:
                with open("error/unmatched.tsv", 'a') as f:
                    err = "\t".join((c['service'], c['year'], c['department'], c['team'], c['budget_summary'])).encode('utf-8')
                    f.write(err + '\n')
                unmatched.append(c)
                num_unmatched += 1
    with open("error/unsaved_budgetspider.json", 'w') as f:
        json.dump(unsaved, f)
    with open("error/unmatched_budgetspider.json", 'w') as f:
        json.dump(unmatched, f)
    print "BUDGETSPIDER: Logged", num_saved, "items,", num_unsaved, "unsaved items,", num_unmatched, "unmatched items,", num_past, "2008-09 data, total:", num_saved + num_unsaved + num_unmatched + num_past
def last_entry(coll, site):
    with switch_collection(Article, coll) as article:
        data = article.objects(site=site).limit(1).order_by("-date")
        if data:
            date = data[0].date
        else:
            date = dt.datetime.utcnow() - dt.timedelta(days=7)
    return date
def switch_collection(self, cls):
    """
    Switches to the chosen collection using Mongoengine's switch_collection.
    """
    if self.collection:
        with switch_collection(cls, self.collection) as new_cls:
            yield new_cls
    else:
        yield cls
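# The method above is a generator, so it is presumably wrapped with
# contextlib.contextmanager somewhere in the surrounding class. A hedged
# sketch of how such a router could be declared and used (CollectionRouter
# and Article are hypothetical names):
from contextlib import contextmanager
from mongoengine.context_managers import switch_collection

class CollectionRouter(object):
    """Route a document class to an optional override collection."""

    def __init__(self, collection=None):
        self.collection = collection  # None means: keep the default collection

    @contextmanager
    def switch_collection(self, cls):
        if self.collection:
            with switch_collection(cls, self.collection) as new_cls:
                yield new_cls
        else:
            yield cls

# Usage is the same whether or not an override is configured:
router = CollectionRouter('articles_2020')
with router.switch_collection(Article) as ArticleCls:
    latest = ArticleCls.objects.order_by('-date').first()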
def _get_data_with_process_pool(self, database, security_list, process_manager_dict, security_list_i):
    with MongoConnect(database):
        thread_data_dict = {}
        for stock in security_list:
            with switch_collection(Kline, stock) as KlineDaily_security_code:
                security_code_data = KlineDaily_security_code.objects(time_tag__lte=self.end).as_pymongo()
                security_code_data_df = pd.DataFrame(list(security_code_data)).reindex(columns=self.field)
                security_code_data_df.set_index(["time_tag"], inplace=True)
                thread_data_dict[stock] = security_code_data_df.reindex(self.calendar_SZ).fillna(method='ffill')
        process_manager_dict[security_list_i] = thread_data_dict
def save_articles(coll, articles):
    if len(articles) < 1:
        return
    with switch_collection(Article, coll):
        for a in articles:
            try:
                a.save()
            except NotUniqueError:
                LOGGER[coll].error("Duplicate article: %s." % a.title)
def get_data_emotion(ID):
    with switch_collection(Stock, 'TRD_old') as StockS:
        data = StockS.objects(Stkcd=ID).all()
        date = [x['Trddt'] for x in data]
        data_temp = [[
            x['Opnprc'], x['Hiprc'], x['Loprc'], x['Clsprc'], x['Dnshrtrd'],
            x['Dnvaltrd'], x['Dsmvosd'], x['Dsmvtll'], x['Dretwd'], x['Dretnd'],
            x['Adjprcwd'], x['Adjprcnd'], x['Markettype'], x['Trdsta']
        ] for x in data]
        data_form = form_data(data_temp)
        x_train = []
        y_train = []
        length = len(data_form)
        with switch_collection(Rumor, 'TRD_rumor') as RumorS:
            rumor = RumorS.objects(Stkcd=ID).all()
            rumor_dict = {}
            for item in rumor:
                rumor_dict.setdefault(item['Qdate'].replace('/', '-'), item['QLabel'])
            for i in range(length):
                if i < length - 5:
                    temp_time = []
                    for j in range(5):
                        daily = copy.deepcopy(data_form[i + j])
                        if date[i + j] in rumor_dict:
                            # Rumor sentiment feature: 1 = positive label, -1 = negative
                            daily.append(1 if rumor_dict[date[i + j]] else -1)
                        else:
                            daily.append(0)
                        temp_time.append(daily)
                    x_train.append(temp_time)
                    # Label: 0 if day i+5 closes below its open, else 1
                    if data_temp[i + 5][3] < data_temp[i + 5][0]:
                        y_train.append(0)
                    else:
                        y_train.append(1)
    return x_train, y_train
def isDirExists(dir_path):
    '''
    Check whether metadata for dir_path already exists in the database.

    Returns True if it exists, False otherwise.
    '''
    global Mybucket
    with switch_collection(Mybucket, collection_name) as Mybucket:
        # Query by directory path
        return Mybucket.objects(na=dir_path).count() > 0
def returnSites():
    username = session['user_id']
    with switch_collection(User, 'users') as toGet:
        userObj = toGet.objects.get(username__exact=username)
    return jsonify(list(map(
        lambda site: {"content": site.content.hex(), "id": site.id},
        userObj.sites)))
def addsites(id):
    with switch_collection(User, 'users') as toAdd:
        user = toAdd.objects.get(username__exact=session['user_id'])
        info = SiteInfo(id=id, content=request.get_data())
        updated = toAdd.objects(
            id=user.id, sites__id=id).update(set__sites__S__content=info.content)
        if not updated:
            toAdd.objects(id=user.id).update_one(push__sites=info)
        user.save(validate=True)
    return jsonify({"success": "updated" if updated else "new"})
def add_problem_one(self, data, col_name):
    # The record fields arrive '#'-separated: doctor#hospital#date#url
    doctor_name, hospital, date, url = data.split('#')[:4]
    problem_obj = Problem(doctor=doctor_name, hospital=hospital, date=date, url=url)
    # Select the target collection here
    with switch_collection(Problem, col_name):
        return problem_obj.save()
def insert_security_code(self, market, file_name, path):
    with MongoConnect(self.database):
        print(path + file_name + '\n')
        kline_daily_data = pd.read_csv(path + file_name, encoding='unicode_escape')
        security_code = file_name.split('.')[0] + '.' + market
        if is_security_type(security_code, 'EXTRA_STOCK_A'):
            # The source csv misspells these headers ('volumw', 'turover'); rename them
            kline_daily_data = kline_daily_data.reindex(
                columns=['date', 'open', 'high', 'low', 'close', 'volumw', 'turover',
                         'match_items', 'interest'])
            kline_daily_data.rename(columns={'volumw': 'volume', 'turover': 'amount'}, inplace=True)
            kline_daily_data = kline_daily_data[kline_daily_data.date >= 20020104]
            with switch_collection(Kline, security_code) as KlineDaily_security_code:
                doc_list = []
                security_code_data = pd.DataFrame()
                if security_code in self.data_dict.keys():
                    security_code_data = self.data_dict[security_code].set_index(["TRADE_DT"])
                    security_code_data = security_code_data.fillna(0)
                for index, row in kline_daily_data.iterrows():
                    date_int = row['date']
                    if not np.isnan(date_int):  # check before int(): int(nan) raises
                        date_int = int(date_int)
                        try:
                            pre_close = int(10000 * security_code_data.loc[date_int, 'S_DQ_PRECLOSE'])
                        except KeyError:
                            pre_close = None
                        time_tag = datetime.strptime(str(date_int), "%Y%m%d")
                        doc = KlineDaily_security_code(time_tag=time_tag, pre_close=pre_close,
                                                       open=int(row['open']), high=int(row['high']),
                                                       low=int(row['low']), close=int(row['close']),
                                                       volume=int(row['volume']),
                                                       amount=int(row['amount']),
                                                       match_items=int(row['match_items']),
                                                       interest=int(row['interest']))
                        doc_list.append(doc)
                # Backfill daily bars before 20020104 from the full csv table;
                # match_items is 0 for these rows.
                security_code_data = security_code_data[security_code_data.index < 20020104]
                for index, row in security_code_data.iterrows():
                    if row['S_DQ_AMOUNT'] > 0:
                        time_tag = datetime.strptime(str(int(index)), "%Y%m%d")
                        try:
                            pre_close = int(row['S_DQ_PRECLOSE'] * 10000)
                        except KeyError:
                            pre_close = None
                        doc = KlineDaily_security_code(time_tag=time_tag, pre_close=pre_close,
                                                       open=int(row['S_DQ_OPEN'] * 10000),
                                                       high=int(row['S_DQ_HIGH'] * 10000),
                                                       low=int(row['S_DQ_LOW'] * 10000),
                                                       close=int(row['S_DQ_CLOSE'] * 10000),
                                                       volume=int(row['S_DQ_VOLUME'] * 100),
                                                       amount=int(row['S_DQ_AMOUNT'] * 1000),
                                                       match_items=0, interest=0)
                        doc_list.append(doc)
                KlineDaily_security_code.objects.insert(doc_list)
def isFileExists(filename, dirId):
    '''
    Check whether metadata for the file already exists in the database.

    Returns True if it exists, False otherwise.
    '''
    global Mybucket
    with switch_collection(Mybucket, collection_name) as Mybucket:
        # Query by file name and parent directory
        return Mybucket.objects(Q(na=filename) & Q(did=dirId) & Q(sds=False)).count() > 0
def find_service(_service_name):
    # Normalize names before comparing: strip punctuation and spaces
    def norm(s):
        return utf8(re.sub("[~*()'\". -]", "", s))

    print "budgetspider"
    with switch_collection(Budgetspider, __colbudgetspider) as CBudgetspider:
        for i in CBudgetspider.objects.all():
            if norm(i["service"]) == norm(_service_name):
                for pr in map(utf8, (i["service"], i["year"], i["department"], i["category_one"], i["category_two"])):
                    print pr,
                print

    print "opengov"
    with switch_collection(Opengov, __colopengov) as COpengov:
        for i in COpengov.objects():
            if norm(i["name"]) == norm(_service_name):
                for pr in map(utf8, (i["name"], i["level_one"], i["level_two"])):
                    print pr,
                print

    print "cleanplus"
    with switch_collection(Cleanplus, __colcleanplus) as CCleanplus:
        for i in CCleanplus.objects(service=_service_name):
            if norm(i["service"]) == norm(_service_name):
                for pr in map(utf8, (i["service"], i["year"], i["department"])):
                    print pr,
def insert():
    error = None
    # Generate password hash, 12 rounds
    pwHash = bcrypt.generate_password_hash(request.form['password'])
    newUser = User(username=request.form['username'],
                   email=request.form['email'],
                   password=pwHash)
    with switch_collection(User, 'users') as toGet:
        try:
            if toGet.objects.get(username__exact=str(request.form['username'])):
                raise BadRequest('Registration Error, please try again.')
        except DoesNotExist:
            with switch_collection(User, 'users') as toAdd:
                newUser.secretKey = pyotp.random_base32()
                newUser.save(validate=True)
                totp = pyotp.TOTP(newUser.secretKey)
                uri = totp.provisioning_uri(request.form['email'], issuer_name='Kepyer.pro')
                session['verify'] = newUser.username
                # totp uri, can be used to generate QR code
                return uri
def update_unit_statuses():
    """Update unit statuses to reference the new symptom collection."""
    from dcmetrometrics.eles import models
    from mongoengine.context_managers import switch_db, switch_collection

    d2r = dict()  # Symptom description to record
    for s in models.SymptomCode.objects:
        d2r[s.description] = s

    # Fish out UnitStatus in the old format from the escalator_statuses collection
    # and back them up to the escalator_statuses_old collection.
    print """Exporting from collection escalator_statuses, assuming records are in the old format.
If successful, will backup to collection escalator_statuses_old..."""
    try:
        with switch_collection(models.UnitStatusOld, "escalator_statuses") as UnitStatusOld:
            n = UnitStatusOld.objects.count()
            for i, s in enumerate(UnitStatusOld.objects):
                # Make a backup of the old unit statuses
                print "Backing up unit status %i of %i (%.2f %%)" % (i, n, float(i) / n * 100.0)
                s.switch_collection('escalator_statuses_old')
                s.save()  # Save to the backup collection
    except Exception as e:
        print "Caught Exception!\n"
        print str(e)
        return

    # Save unit statuses in the new format.
    n = models.UnitStatusOld.objects.count()
    for i, s_old in enumerate(models.UnitStatusOld.objects):
        print 'Reformatting unit status %i of %i (%.2f %%)' % (i, n, float(i) / n * 100.0)
        s_new = s_old.to_new_format()
        s_new.pk = s_old.pk
        s_new.symptom = d2r[s_old.symptom.description]
        s_new.save()
def log_opengov():
    # Log opengov DB entry
    with switch_collection(Opengov, __colopengov) as COpengov:
        num_saved, num_unsaved = 0, 0
        unsaved = []
        for o in opengov:
            data = COpengov(
                service=o['name'],
                category_one=o['level_one'],
                category_two=o['level_two'],
                category_three=o['level_three']
            )
            try:
                data.save()
                num_saved += 1
            except Exception:
                unsaved.append(o)
                num_unsaved += 1
    with open('error/unsaved_opengov.json', 'w') as f:
        json.dump(unsaved, f)
    print "OPENGOV: Logged", num_saved, "items,", num_unsaved, "unsaved items, total:", num_saved + num_unsaved
def switch_collection(self, collection_name):
    """
    Temporarily switch the collection for a document instance.

    Only really useful for archiving off data and calling `save()`::

        user = User.objects.get(id=user_id)
        user.switch_collection('old-users')
        user.save()

    If you need to read from another database see
    :class:`~mongoengine.context_managers.switch_db`

    :param collection_name: The collection name to use for saving the document
    """
    with switch_collection(self.__class__, collection_name) as cls:
        collection = cls._get_collection()
    self._get_collection = lambda: collection
    self._collection = collection
    self._created = True
    self.__objects = self._qs
    self.__objects._collection_obj = collection
    return self
def get_query_map_ids(self):
    """Get a list of query map ids"""
    # Select the distinct names of maps with type 'query'
    with switch_collection(Map, self.map_collection) as M:
        M.ensure_indexes()
        return M.objects.filter(type='query').distinct('name')
def get_ref_maps(self):
    """Get the reference maps"""
    # Select the maps with type 'reference'
    with switch_collection(Map, self.map_collection) as M:
        M.ensure_indexes()
        return M.objects.filter(type='reference')
def read_data_from_database_for___uid_or_uname_list():
    uid_or_uname_list = []
    this_uid_list = []
    this_nickname_list = []

    # Collect user info from weibo posts
    weibo_collection_name = []
    print "start single weibo"
    global Single_weibo_with_more_info_store
    for one_collection in weibo_collection_name:
        with switch_collection(Single_weibo_with_more_info_store, one_collection) as Single_weibo_with_more_info_store:
            for one_weibo in Single_weibo_with_more_info_store.objects:
                this_uid_list.append(one_weibo["uid"])
                this_uid_list.append(one_weibo["come_from_user_id"])
                this_nickname_list.extend(chuli_at_info(one_weibo["at_info"]))
                this_nickname_list.extend(chuli_at_info(one_weibo["retweet_reason_at_info"]))

    # Collect user info from comments
    print "start comment"
    comment_collections = []
    global Single_comment_store
    for one_collection in comment_collections:
        with switch_collection(Single_comment_store, one_collection) as Single_comment_store:
            for one_comment in Single_comment_store.objects:
                this_uid_list.append(one_comment["uid"])
                this_nickname_list.extend(chuli_at_info(one_comment["at_info"]))

    # Collect user info from reposts
    print "start repost"
    repost_collections = []
    repost_collections.append("zhuanjiyin_nohashtag_original_single_repost_2016_with_more_info_repair")
    global Single_repost_store
    for one_collection in repost_collections:
        with switch_collection(Single_repost_store, one_collection) as Single_repost_store:
            for one_comment in Single_repost_store.objects:
                this_uid_list.append(one_comment["uid"])
                this_nickname_list.extend(chuli_at_info(one_comment["at_info"]))

    uid_or_uname_list.extend(list(set(this_uid_list)))
    uid_or_uname_list.extend(list(set(this_nickname_list)))
    uid_or_uname_list = list(set(uid_or_uname_list))

    random.shuffle(uid_or_uname_list)
    print len(uid_or_uname_list)
    return uid_or_uname_list
def get_maps(self):
    with switch_collection(Map, self.map_collection) as M:
        M.ensure_indexes()
        return M.objects
par.add_argument("dest") par.add_argument("--src_uri", type=str, default = "mongodb://127.0.0.1/test") par.add_argument("--dest_uri", type=str, default = "mongodb://127.0.0.1/test") par.add_argument("--init", type=str, default = "") par.add_argument("--query", type=str, default = "{}") par.add_argument("--key", type=str, default = "_id") par.add_argument("--verbose", type=int, default = 0) config = par.parse_args() query = json.loads(config.query) sys.path.insert(0, config.module_abspath) module = importlib.import_module(config.module) cb = getattr(module, config.function) init = getattr(module, config.init) if config.init else None source_db = pymongo.MongoClient(config.src_uri).get_default_database() source = source_db[config.source] dest_db = pymongo.MongoClient(config.dest_uri).get_default_database() dest = dest_db[config.dest] connectMongoEngine(dest) hk_colname = source.name + '_' + dest.name switch_collection(housekeep, hk_colname).__enter__() # print "DEBUG start worker", os.getpid() do_chunks(init, cb, source, dest, query, config.key, config.verbose) # print "DEBUG end worker", os.getpid()
def get_alignments(self):
    with switch_collection(Alignment, self.alignment_collection) as A:
        # A.ensure_indexes()
        return A.objects
def ensure_related_indexes(self):
    with switch_collection(Alignment, self.alignment_collection) as A:
        A.ensure_indexes()
    with switch_collection(Map, self.map_collection) as M:
        M.ensure_indexes()
def get_service():
    with switch_collection(Budgetspider, __colbudgetspider) as CBudgetspider:
        for i in CBudgetspider.objects(year="2013", category_one="일반공공행정", category_two="재정금융"):
            print utf8(i["category_three"]), utf8(i["service"]), i["budget_assigned"]
def calc_sum():
    years = ["2014"]
    with switch_collection(Budgetspider, __colbudgetspider) as CBudgetspider:
        for y in years:
            total_assigned = 0
            total_summary = 0
            categories = []
            budgets_assigned = []
            budgets_summary = []
            services = []
            for i in CBudgetspider.objects(year=y):
                name = (i["category_one"], i["category_two"], i["category_three"])
                if name in categories:
                    budgets_assigned[categories.index(name)] += i["budget_assigned"]
                    budgets_summary[categories.index(name)] += i["budget_summary"]
                else:
                    categories.append(name)
                    budgets_assigned.append(i["budget_assigned"])
                    budgets_summary.append(i["budget_summary"])
                total_assigned += i["budget_assigned"]
                total_summary += i["budget_summary"]
            with open("output/category_three_" + y + ".tsv", "w") as f:
                f.write("\t".join((utf8("category_one"), utf8("category_two"),
                                   utf8("category_three"), utf8("assigned"),
                                   utf8("summary"), utf8("num_services"))) + "\n")
            with open("output/services_" + y + ".tsv", "w") as f:
                f.write("\t".join((utf8("service"), utf8("category_one"),
                                   utf8("category_two"), utf8("category_three"),
                                   utf8("assigned"), utf8("summary"))) + "\n")
            for i in range(len(categories)):
                if categories[i] is not None:
                    with open("output/category_three_" + y + ".tsv", "a") as f:
                        num_services = CBudgetspider.objects(
                            year=y,
                            category_one=utf8(categories[i][0]),
                            category_two=utf8(categories[i][1]),
                            category_three=utf8(categories[i][2]),
                        ).count()
                        f.write("\t".join((utf8(categories[i][0]), utf8(categories[i][1]),
                                           utf8(categories[i][2]), str(budgets_assigned[i]),
                                           str(budgets_summary[i]), str(num_services))) + "\n")
                    with open("output/services_" + y + ".tsv", "a") as f:
                        for b in CBudgetspider.objects(
                            year=y,
                            category_one=utf8(categories[i][0]),
                            category_two=utf8(categories[i][1]),
                            category_three=utf8(categories[i][2]),
                        ):
                            f.write("\t".join((utf8(b["service"]), utf8(categories[i][0]),
                                               utf8(categories[i][1]), utf8(categories[i][2]),
                                               str(b["budget_assigned"]),
                                               str(b["budget_summary"]))) + "\n")
            print "TOTAL", y, total_assigned, total_summary, len(CBudgetspider.objects(year=y)), "services"