def _recreate_user_timeline(self, consumer_id):
    """
    Rebuild a consumer's cached timeline (used after a server restart or
    when a new user logs in).

    :return: True on success
    """
    content = (self._relations
               .select(self._dataset.item_id, self._dataset.timestamp)
               .join(self._dataset,
                     on=(self._relations.producer_id == self._dataset.producer_id))
               .where(self._relations.consumer_id == consumer_id)
               .order_by(self._dataset.timestamp.desc())
               .limit(self._max_cache)
               .namedtuples())

    consumer_feed = self.create_cache_name(consumer_id)
    for chunk in chunked(content, 400):
        redis.zadd(consumer_feed,
                   dict((c.item_id, c.timestamp) for c in chunk))

    if not self._include_actor:
        return True

    # Also include the user's own items in their feed.
    content = (self._dataset
               .select(self._dataset.item_id, self._dataset.timestamp)
               .where(self._dataset.producer_id == consumer_id))

    pipe = redis.pipeline()
    for chunk in chunked(content, 400):
        pipe.zadd(consumer_feed,
                  dict((c.item_id, c.timestamp) for c in chunk))
    pipe.execute()
    return True
def extract(self, gen: Generator):
    """
    Save the content of ``gen`` to the DB.

    :param gen: DomainIntel generator object
    """
    temp = []
    for item in gen:
        # Skip domains that are already stored.
        if MalwareDomains.select().where(MalwareDomains.domain == item.domain).exists():
            continue
        if len(temp) == self.CHUNK_SIZE:
            # Flush the buffer in sub-batches of 200 rows.
            with db.atomic():
                for batch in chunked(temp, 200):
                    rows = [{"domain": d.domain,
                             "type": d.type,
                             "reference": d.reference} for d in batch]
                    MalwareDomains.insert_many(rows).execute()
            temp.clear()
        temp.append(item)

    if temp:
        with db.atomic():
            for batch in chunked(temp, 200):
                rows = [{"domain": d.domain,
                         "type": d.type,
                         "reference": d.reference} for d in batch]
                MalwareDomains.insert_many(rows).execute()
def db_consume_data(files: List[str]):
    rows = []
    tables = defaultdict(list)

    db_connect()
    db_drop_tables()
    db_create_tables()

    with profiler(f"Read files: {files}"):
        for f in files:
            def pool_init(pid):
                global PID
                PID = pid
            # Ugly reinitialisation of the pool due to PID value propagation
            pool = multiprocessing.Pool(PROC_NR, pool_init, (len(f),))
            rows.extend(filter(None, fd_consume_data(f, pool)))

    for k, v in rows:
        tables[k].append(v)

    with tqdm(total=len(rows), desc="Insert records") as t:
        with profiler("Write to db"):
            for k in tables.keys():
                with profiler(f" {k}/{len(tables[k])}"):
                    with DB.atomic():
                        for batch in chunked(tables[k], DBBATCH):
                            globals()[k].insert_many(batch).execute()
                            t.update(len(batch))

    db_close()
def etl_music_bands(db, *, chunk_size: int = 512, limit: Optional[int] = None) -> int:
    """
    ETL music bands into our databases.

    :param db: The database in which to ETL the bands.
    :param chunk_size: The number of records in each insert batch.
    :param limit: The maximum number of records to fetch from wikidata.
        This is included for debugging purposes.
    :return: The number of records that were ETLed.
    """
    query = """
    SELECT DISTINCT ?wikidata_id ?name
    WHERE {
        ?wikidata_id wdt:P31/wdt:P279* wd:Q2088357.
        ?wikidata_id rdfs:label ?name.
        FILTER (LANG(?name) = 'en')
    }
    """
    if limit:
        query = f"{query}\nLIMIT {limit}"

    bands = query_wikidata(query)
    with db.atomic():
        for batch in chunked(bands, chunk_size):
            MusicBand.insert_many(batch).execute()
    return len(bands)
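# Illustrative usage sketch (not from the original snippet): assumes `db` is an
# initialised peewee database bound to MusicBand and that query_wikidata() returns
# dicts keyed by MusicBand's column names.
def _example_etl_music_bands():
    inserted = etl_music_bands(db, chunk_size=256, limit=1000)  # small debug run
    print(f"ETLed {inserted} music bands")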
def import_probes(product, probes):
    data = []
    for probe in probes:
        definition = probe.definition
        description = definition["description"]
        data.append({
            "product": product,
            "name": probe.name,
            "type": probe.type,
            "description": description,
            "definition": definition,
            "index": fn.to_tsvector("english", " ".join([probe.name, description])),
        })

    with db.atomic():
        for batch in chunked(data, 100):
            (Probes
             .insert_many(batch)
             .on_conflict(
                 conflict_target=[Probes.product, Probes.name],
                 update={
                     Probes.description: EXCLUDED.description,
                     Probes.definition: EXCLUDED.definition,
                     Probes.index: EXCLUDED.index,
                 },
             )
             .execute())

    log("Imported {n:,} probes for {product}".format(n=len(probes), product=product))
def create_sentences(db, sentence_file=const.SOMETHING_LABELS_FILENAME):
    sentences_list = word_processing.read_csv_label_sentences(sentence_file)
    sentence_dicts = [{'sentence': sentence} for sentence in sentences_list]
    with db.atomic():
        for batch in chunked(sentence_dicts, 100):
            SentenceSQL.insert_many(batch).execute()
    return SentenceSQL.select()
def parse_petition(petitions):
    for petition in petitions:
        # get_petition_status(petition)
        data = []
        petition_url = 'https://petition.president.gov.ua/petition/' + str(petition)
        pages = [petition_url + '/votes/' + str(i)
                 for i in range(1, max_page(petition_url) + 1)]
        for page in pages:
            html = requests.get(page).text
            soup = BeautifulSoup(html, 'lxml')
            rows = soup.find_all('div', class_=re.compile(r'^table_row$'))
            for r in rows:
                position_number = r.find('div', class_='table_cell number').string.replace('.', '')
                username = r.find('div', class_='table_cell name').string
                day, month, year = r.find('div', class_='table_cell date').string.split(' ')
                new_month = m.parse(month)[0].inflect({'nomn'}).word.title()
                sign_date = datetime.strptime(' '.join([day, new_month, year]), '%d %B %Y')
                data.append((petition, position_number, username, sign_date))

        if Vote.select().where(Vote.petition == petition):
            print(f'petition {petition} was in db with',
                  Vote.delete().where(Vote.petition == petition).execute(),
                  'rows')

        with db.atomic():
            # By default SQLite limits the number of bound variables in a SQL query to 999
            for batch in chunked(data, 249):
                Vote.insert_many(batch,
                                 fields=['petition', 'position_number',
                                         'username', 'sign_date']).execute()

        status = get_petition_status(petition)
        Petition.update(status=status).where(Petition.petition_id == petition).execute()

    if petitions:
        set_gender()
def save_all(objs: List["DbBarData"], progress_bar_dict=None): """ save a list of objects, update if exists. """ dicts = [i.to_dict() for i in objs] with db.atomic(): if driver is Driver.POSTGRESQL: for bar in dicts: DbBarData.insert(bar).on_conflict( update=bar, conflict_target=( DbBarData.symbol, DbBarData.exchange, DbBarData.interval, DbBarData.datetime, ), ).execute() else: total_sz = len(dicts) loaded = 0 for c in chunked(dicts, 50): DbBarData.insert_many(c).on_conflict_replace().execute() if 'save_progress_bar' in progress_bar_dict: loaded += 50 percent_saved = min(round(100 * loaded / total_sz, 2), 100) QApplication.processEvents() progress_bar_dict['save_progress_bar'].setValue(percent_saved) elif 'web_progress' in progress_bar_dict: loaded += 50 percent_saved = min(round(100 * loaded / total_sz, 2), 100) progress_bar_dict['web_progress'].write_message({'progress': percent_saved}) print(f"web progress: {percent_saved}")
def bootstrap_tasks(config):
    for (name, tasks) in config["tasks"].items():
        TaskGroup.create(name=name)
        with db.atomic():
            for batch in chunked(tasks, 100):
                Task.insert_many(
                    [{"path": task, "group": name} for task in batch]
                ).execute()
def insert_many_safe(cls, models, batch_size=100):
    with cls._meta.database.atomic() as transaction:
        try:
            # Insert each batch; roll back the whole transaction on failure.
            for batch in chunked(models, batch_size):
                cls.insert_many(batch).execute()
        except PeeweeException as pwex:
            transaction.rollback()
            print(pwex)
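# Illustrative usage sketch (not from the original snippet): assumes insert_many_safe
# is exposed as a classmethod on a peewee model; `Device` is a hypothetical model
# with a single `name` field.
def _example_insert_many_safe():
    rows = [{"name": f"device-{i}"} for i in range(1000)]
    Device.insert_many_safe(rows, batch_size=100)  # batches of 100 inside one transaction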
def batch_insert(cls, dict_list, nums=1000):
    """Batch-create records."""
    if not dict_list:
        return
    for batch in chunked(dict_list, nums):
        cls.insert_many(batch).execute()
    print('insert {} rows:{}'.format(cls.__name__.lower(), len(dict_list)))
    return
def save_all(objs: List["DbBarData"]): """ save a list of objects, update if exists. """ dicts = [i.to_dict() for i in objs] with db.atomic(): for c in chunked(dicts, 60): DbBarData.insert_many(c).on_conflict_replace().execute()
def bootstrap_tasks(config):
    for (name, tasks) in config["tasks"].items():
        TaskGroup.insert(name=name).on_conflict("ignore").execute()
        with db.atomic():
            for batch in chunked(tasks, 100):
                query = Task.insert_many(
                    [{"path": task, "group": name} for task in batch]
                ).on_conflict("ignore")
                query.execute()
def insert_data(self, data, table_class):
    """
    Insert data into the database in chunks using peewee functions.

    :param table_class: Model class that should store the given data in the database
    :param data: List of persons to be stored in the database
    """
    with self.db.atomic():
        for batch in chunked(data, 5):
            table_class.insert_many(batch).execute()
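# Illustrative usage sketch (not from the original snippet): `Person` is a hypothetical
# peewee model with a `name` field, and `importer` is an instance of the class that
# defines insert_data() above.
def _example_insert_data(importer):
    people = [{"name": "Ada"}, {"name": "Grace"}, {"name": "Linus"}]
    importer.insert_data(people, Person)  # inserted in chunks of 5 rows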
def sqlite_format(items, filepath):
    db = SqliteDatabase(filepath.as_posix())
    db_proxy.initialize(db)
    db.create_tables((Hotel,))
    with db.atomic():
        Hotel.delete().execute()
        for batch in chunked(items, 900):
            Hotel.insert_many(batch).execute()
def _generate_work(result):
    """
    Fetch work in chunks, loop and yield one at a time for display.
    """
    for work_set in chunked(result, 100):
        for work in work_set:
            yield work
def _save_selenium_redis(entries: typing.Union[Link, typing.List[Link]], single: bool = False,
                         score: typing.Optional[float] = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`selenium` database.

    The function updates the ``queue_selenium`` database.

    Args:
        entries: Links to be added to the :mod:`selenium` database.
            It can be either an *iterable* of links, or a single link
            string (if ``single`` is set to :data:`True`).
        single: Indicate if ``entries`` is an *iterable* of links or a
            single link string.
        score: Score for the Redis sorted set.
        nx: Forces ``ZADD`` to only create new elements and not to
            update scores for elements that already exist.
        xx: Forces ``ZADD`` to only update scores of elements that
            already exist. New elements will not be added.

    When ``entries`` is a list of :class:`~darc.link.Link` instances,
    we try to perform a *bulk* update to ease memory consumption.
    The *bulk* size is defined by :data:`~darc.db.BULK_SIZE`.

    Notes:
        The ``entries`` will be dumped through :mod:`pickle` so that
        :mod:`darc` does not need to parse them again.

    """
    if not entries:
        return
    if score is None:
        score = time.time()

    if not single:
        if typing.TYPE_CHECKING:
            entries = typing.cast(typing.List[Link], entries)

        for chunk in peewee.chunked(entries, BULK_SIZE):
            pool = list(filter(lambda link: isinstance(link, Link), chunk))
            for link in pool:
                _redis_command('set', link.name, pickle.dumps(link), nx=True)
            mapping = {
                link.name: score for link in pool
            }
            with _redis_get_lock('lock_queue_selenium'):  # type: ignore
                _redis_command('zadd', 'queue_selenium', mapping, nx=nx, xx=xx)
        return

    if typing.TYPE_CHECKING:
        entries = typing.cast(Link, entries)

    _redis_command('set', entries.name, pickle.dumps(entries), nx=True)
    mapping = {
        entries.name: score,
    }
    with _redis_get_lock('lock_queue_selenium'):  # type: ignore
        _redis_command('zadd', 'queue_selenium', mapping, nx=nx, xx=xx)
def dodaj_dane(dane):
    # For each model, read rows from '<plik>.txt' (semicolon-separated) and
    # bulk-insert them, skipping the first (primary-key) field.
    for model, plik in dane.items():
        pola = [pole for pole in model._meta.fields]
        pola.pop(0)
        wpisy = dane_z_pliku(plik + '.txt', ';')
        print(wpisy)
        with baza.atomic():
            for batch in chunked(wpisy, 100):
                model.insert_many(batch, fields=pola).execute()
        baza.commit()
def _save_requests_redis(entries: 'Union[Link, List[Link]]', single: bool = False,
                         score: 'Optional[float]' = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`requests` database.

    The function updates the ``queue_requests`` database.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single link
            string (if ``single`` is set to :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links or a
            single link string.
        score: Score for the Redis sorted set.
        nx: Forces ``ZADD`` to only create new elements and not to
            update scores for elements that already exist.
        xx: Forces ``ZADD`` to only update scores of elements that
            already exist. New elements will not be added.

    """
    if score is None:
        score = time.time()

    if not single:
        if TYPE_CHECKING:
            entries = cast('List[Link]', entries)

        for chunk in peewee.chunked(entries, BULK_SIZE):
            pool = list(filter(lambda link: isinstance(link, Link), chunk))  # type: List[Link]
            for link in pool:
                _redis_command('set', link.name, pickle.dumps(link), nx=True)
            with _redis_get_lock('queue_requests'):
                _redis_command('zadd', 'queue_requests',
                               {link.name: score for link in pool}, nx=nx, xx=xx)
        return None

    if TYPE_CHECKING:
        entries = cast('Link', entries)

    _redis_command('set', entries.name, pickle.dumps(entries), nx=True)
    with _redis_get_lock('queue_requests'):
        _redis_command('zadd', 'queue_requests', {
            entries.name: score,
        }, nx=nx, xx=xx)
    return None
def reload_default(cls):
    if not KyogreDB._db:
        return
    try:
        cls.delete().execute()
    except:
        pass
    with open('data/pkmn_data.json', 'r') as f:
        pkmn_data = json.load(f)
    with KyogreDB._db.atomic():
        for chunk in chunked(pkmn_data, 50):
            cls.insert_many(chunk).execute()
def save_bar_data(self, bars: List[BarData]) -> bool: """""" # Store key parameters bar = bars[0] symbol = bar.symbol exchange = bar.exchange interval = bar.interval # Convert bar object to dict and adjust timezone data = [] for bar in bars: bar.datetime = convert_tz(bar.datetime) d = bar.__dict__ d["exchange"] = d["exchange"].value d["interval"] = d["interval"].value d.pop("gateway_name") d.pop("vt_symbol") data.append(d) # Upsert data into database with self.db.atomic(): for c in chunked(data, 50): DbBarData.insert_many(c).on_conflict_replace().execute() # Update bar overview overview: DbBarOverview = DbBarOverview.get_or_none( DbBarOverview.symbol == symbol, DbBarOverview.exchange == exchange.value, DbBarOverview.interval == interval.value, ) if not overview: overview = DbBarOverview() overview.symbol = symbol overview.exchange = exchange.value overview.interval = interval.value overview.start = bars[0].datetime overview.end = bars[-1].datetime overview.count = len(bars) else: overview.start = min(bars[0].datetime, overview.start) overview.end = max(bars[-1].datetime, overview.end) s: ModelSelect = DbBarData.select().where( (DbBarData.symbol == symbol) & (DbBarData.exchange == exchange.value) & (DbBarData.interval == interval.value) ) overview.count = s.count() overview.save()
def viewscsvtosql(path, datatype=1):
    # Read the data from the CSV file
    df = pd.read_csv(path)
    df['time'] = pd.to_datetime(df['time'])
    df = df.fillna('')  # Fill NA values

    # Iterate over the rows and build the records
    news_data = []
    for index, row in df.iterrows():
        tmp = {}
        tmp['viewid'] = row['id']
        tmp['personname'] = row['person_name']
        if row['country'] == 'N':
            tmp['country'] = ""
        else:
            tmp['country'] = row['country']
        tmp['orgname'] = row['org_name']
        tmp['pos'] = row['pos']
        tmp['verb'] = row['verb']
        tmp['viewpoint'] = row['viewpoint']
        tmp['newsid'] = row['news_id']
        tmp['sentiment'] = row['sentiment']
        tmp['original_text'] = row['original_text']
        tmp['time'] = datetime.strftime(row['time'], '%Y-%m-%d %H:%M:%S')  # Format the time string
        news_data.append(tmp)

    # Write the data to MySQL
    mysql_db.connect()

    # Insert the views data
    if not ViewsInfo.table_exists():  # Create the table if it does not exist
        ViewsInfo.create_table()
    # else:  # Commented out after debugging; switch to incremental updates
    #     ViewsInfo.delete().execute()  # Clear the table before each full refresh

    '''
    # Insert slice by slice
    slice_size = 300  # Slice size
    nslices = math.floor(len(news_data) / slice_size)
    for i in range(0, nslices):
        with mysql_db.atomic():
            ViewsInfo.insert_many(news_data[i * slice_size: (i + 1) * slice_size]).execute()  # Bulk insert
    # Insert the data from the last slice
    with mysql_db.atomic():
        ViewsInfo.insert_many(news_data[nslices * slice_size:]).execute()  # Bulk insert
    '''

    with mysql_db.atomic():
        for batch in chunked(news_data, 300):  # 300 rows at a time
            ViewsInfo.insert_many(batch).on_conflict_ignore().execute()  # Bulk insert; skip rows with duplicate primary keys

    mysql_db.close()
    return
def resolve_vods_by_canon_ids(ids):
    vods = []
    for batch in peewee.chunked(ids, 100):
        results = list(
            PerfVODModel.select().where(PerfVODModel.canon_id << batch))
        if results:
            vmap = {v.canon_id: v for v in results}
        else:
            vmap = {}
        for canon_id in batch:
            vods.append(vmap.get(canon_id))
    return [PerfVOD.from_database_model(v) if v else None for v in vods]
def load_status_updates(filename, status_collection):
    '''
    Opens a CSV file with status data and adds it to an existing
    instance of UserStatusCollection.

    Requirements:
    - If a status_id already exists, it will ignore it and continue to
      the next.
    - Returns False if there are any errors (such as empty fields in
      the source CSV file).
    - Otherwise, it returns True.
    '''
    logger.debug('Entering load_status_updates(%s, %s)', filename, status_collection)
    db_fields = ('status_id', 'user_id', 'status_text')
    status_data = []
    status_ids = set()

    if not filename.endswith('.csv'):
        logger.error('UserStatus files must be in csv format')
        return False

    with open(filename, newline='') as statusfile:
        logger.info('Loading statuses from %s', filename)
        data = csv.reader(statusfile)
        try:
            next(data)  # Eat header data
        except StopIteration:
            logger.error('%s does not contain data', filename)
            return False
        for row in data:
            # There should be three parameters
            if len(row) != 3:
                logger.error('UserStatus needs three parameters')
                return False
            # Empty strings, 0, and None are all considered empty fields
            for item in row:
                if not item:
                    logger.error('UserStatus can not have empty fields')
                    return False
            # Filter out duplicate status_ids
            if row[0] not in status_ids:
                # Should only be called with three non-empty parameters
                logger.debug('Adding %s to collection', row)
                status_data.append(row)
                status_ids.add(row[0])

    logger.info('Saving collection in database')
    Status = status_collection.table  # pylint: disable=C0103
    with status_collection.database.transaction():
        for batch in chunked(status_data, 100):
            Status.insert_many(batch, fields=db_fields).execute()
    return True
def update_category_videos(self, cid):
    _channels_by_cat_id = self.api_request(self.base_url, {"get_videos_by_cat_id": cid})
    videos = []
    streams = []
    for c in _channels_by_cat_id:
        videos.append({
            "_id": c.get("id"),
            "cid": c.get("cid"),
            "title": c.get("video_title"),
            "thumbnail": self.image_url(c.get("video_thumbnail_b")),
        })
        for s in c.get("stream_list"):
            streams.append({
                "_id": s.get("vod_stream_id"),
                "channel_id": c.get("id"),
                "name": s.get("name"),
                "stream_url": s.get("stream_url"),
                "token": s.get("token"),
                "agent": s.get("agent"),
                "referer": s.get("referer_vod"),
            })
    with db.atomic():
        if len(streams) > 2:
            # Data fetch was successful; delete the old data.
            cat_channels = [
                v._id for v in Video.select().where(Video.cid == cid)
            ]
            VodStream.delete().where(
                VodStream.channel_id.in_(cat_channels)).execute()
            Video.delete().where(Video.cid == cid).execute()
        for batch in pw.chunked(videos, 100):
            Video.replace_many(batch).execute()
        for batch in pw.chunked(streams, 100):
            VodStream.replace_many(batch).execute()
def update_category_channels(self, cid):
    _channels_by_cat_id = self.api_request(self.base_url, {"get_channels_by_cat_id": cid})
    channels = []
    streams = []
    for c in _channels_by_cat_id:
        channels.append({
            "_id": c.get("id"),
            "cid": c.get("cid"),
            "title": c.get("channel_title"),
            "thumbnail": self.image_url(c.get("channel_thumbnail")),
        })
        for s in c.get("stream_list"):
            streams.append({
                "_id": s.get("stream_id"),
                "channel_id": c.get("id"),
                "name": s.get("name"),
                "stream_url": s.get("stream_url"),
                "token": s.get("token"),
                "agent": s.get("agent"),
                "referer": s.get("referer"),
            })
    with db.atomic():
        if len(streams) > 2:
            # Data fetch was successful; delete the old data.
            cat_channels = [
                c._id for c in Channel.select().where(Channel.cid == cid)
            ]
            Stream.delete().where(
                Stream.channel_id.in_(cat_channels)).execute()
            Channel.delete().where(Channel.cid == cid).execute()
        for batch in pw.chunked(channels, 100):
            Channel.replace_many(batch).execute()
        for batch in pw.chunked(streams, 100):
            Stream.replace_many(batch).execute()
def move_df_to_db(imported_data: pd.DataFrame, future_download: bool):
    print("entering move_df_to_db")
    bars = []
    count = 0
    time_consuming_start = time()
    tmpsymbol = None
    start_time = None

    for row in imported_data.itertuples():
        bar = BarData(
            datetime=datetime.fromtimestamp(row.datetime),  # Convert the timestamp to a plain datetime (not datetime64[ns])
            symbol=row.symbol,
            exchange=row.exchange,
            interval=row.interval,
            open_price=row.open_price,
            high_price=row.high_price,
            low_price=row.low_price,
            close_price=row.close_price,
            # open_interest=row.open_interest,
            volume=row.volume,
            gateway_name="DB",
        )
        if not tmpsymbol:
            tmpsymbol = bar.symbol

        if future_download:
            # Night session runs 21:00 - 2:30: shift the date back by one day
            if bar.datetime.time() >= dtime(21, 0) or bar.datetime.time() <= dtime(2, 30):
                bar.datetime -= timedelta(days=1)
            # For the other times, subtract one minute ???
            bar.datetime -= timedelta(minutes=1)

        if not start_time:
            start_time = bar.datetime

        bars.append(bar)

        # Keep some statistics
        count += 1
        end_time = bar.datetime

    # Insert into the database in batches
    for bar_data in chunked(bars, 10000):
        database_manager.save_bar_data(bar_data)

    time_consuming_end = time()
    print(f'Loaded TDX (Tongdaxin) minute data for {tmpsymbol}: start {start_time}, '
          f'end {end_time}, {count} records, '
          f'took {round(time_consuming_end - time_consuming_start, 3)}s')
def populate(experiments, experiment_teams):
    """
    Populates the database with experiment data.

    Args:
        experiments (list[dict]): A list of dictionaries containing information on experiments.
        experiment_teams (list[exp_db_populator.data_types.ExperimentTeamData]): A list containing
            the users for all new experiments.
    """
    if not experiments or not experiment_teams:
        raise KeyError("Experiment without team or vice versa")

    for batch in chunked(experiments, 100):
        Experiment.insert_many(batch).on_conflict_replace().execute()

    teams_update = [{
        Experimentteams.experimentid: exp_team.rb_number,
        Experimentteams.roleid: exp_team.role_id,
        Experimentteams.startdate: exp_team.start_date,
        Experimentteams.userid: exp_team.user.user_id
    } for exp_team in experiment_teams]

    for batch in chunked(teams_update, 100):
        Experimentteams.insert_many(batch).on_conflict_ignore().execute()
def bulk_insert(cls, data: Iterable[dict]) -> int: """ BaseModel.bulk_insert(row_dict: dict) -> int Performs an atomic bulk insertion of the data provided in row_dicts and returns the number of rows inserted :param data: sequence of dictionary objects mapping field names to values to insert :return int: The number of rows inserted :raise peewee.PeeweeException: if bulk insertion fails """ rows_inserted = 0 with cls._meta.database.atomic() as transaction: for batch in chunked(data, 100): rows_inserted += cls.insert_many(batch).execute() return rows_inserted
def copy_db():
    # Name_2, Petition_2, User
    # db.drop_tables([Peticia], safe=True)
    # db.create_tables([Peticia], safe=True)

    # db.create_tables([Name], safe=True)
    # query = Name_2.select()
    # data = []
    # for i in query:
    #     data.append((i.username, i.gender))
    #
    # with db.atomic():
    #     # by default SQLite limits the number of bound variables in a SQL query to 999
    #     for batch in chunked(data, 450):
    #         Name.insert_many(batch, fields=[Name.username, Name.gender]).execute()

    # db.create_tables([Petition], safe=True)
    # query = Petition_2.select()
    # for i in query:
    #     Petition.create(
    #         petition_id=i.petition_id,
    #         status=i.status,
    #         title=i.title,
    #         article=i.article,
    #         answer=i.answer
    #     )

    # db.create_tables([Vote], safe=True)
    for i in User.select(User.petition_id).distinct():
        data = []
        # p = Petition.get(petition_id=i.petition_id).petition_id
        p = i.petition_id
        query = User.select().where(User.petition == i.petition_id)
        for user in query:
            data.append((p, user.position_number, user.username, user.sign_date, user.gender))
        with db.atomic():
            # by default SQLite limits the number of bound variables in a SQL query to 999
            for batch in chunked(data, 198):
                # User.insert_many(batch, fields=[User.petition, User.position_number,
                #                                 User.username, User.sign_date, User.gender]).execute()
                Vote.insert_many(batch).execute()