def prepare(self):
    """Resolve missing file encodings and seed the aux collection.

    For every data file whose encoding lookup yields an empty string, the
    file is passed to ``read_file`` (presumably this detects and records the
    encoding -- confirm against its definition).  If that happened for any
    file, the ``fs`` connection is closed and reopened.  Afterwards one aux
    entry per data file is inserted, each starting at offset 0 with an empty
    header and ``tr_id`` 0.

    Returns:
        The result of the ``db_session``-wrapped inner function: it returns
        ``None`` when there are no data files and ``False`` after seeding.
    """
    encoding_resolved = False
    for path, _year in self.fs.data_files:
        if get_file_encoding(path) != "":
            continue
        print_flush(f"Determining encoding of file '{path}': ", end="")
        read_file(path)
        print_flush("done!")
        encoding_resolved = True

    if encoding_resolved:
        # Reconnect after at least one encoding was determined.
        self.fs.disconnect()
        self.fs.connect()

    @db_session
    def _prepare(session, db):
        aux_collection = self.get_aux_collection(session, db)
        if len(self.fs.data_files) == 0:
            return None

        seed_entries = []
        for path, year in self.fs.data_files:
            seed_entries.append({
                "file_name": path,
                "year": year,
                "file_seek": 0,
                "header": "",
                "tr_id": 0,
            })
        DbOperation(session, db).insert_data(
            aux_collection,
            seed_entries,
            "FILL AUX COLLECTION",
            use_session=False,
        )
        return False

    return _prepare(self)
def _insert_rows(session, db):
    """Insert the current batch and advance or retire the file checkpoint.

    Reads ``self``, ``rows``, ``end``, ``file``, ``file_name``, ``year``,
    ``header_text`` and ``entry`` from the enclosing scope.

    The batch in ``rows`` is always inserted into the target collection.
    If ``end`` is false, ``entry`` is updated in place (new offset,
    ``tr_id`` + 1) and a fresh aux entry carrying those values is inserted.
    If ``end`` is true, the progress line is overwritten with "done!" and
    ``entry`` is deleted from the aux collection.
    """
    aux_collection = self.get_aux_collection(session, db)
    target_collection = self.get_target_collection(session, db)
    DbOperation(session, db).insert_data(target_collection, rows,
                                         "INSERT ROWS")

    if not end:
        position = file.tell()
        entry["file_seek"] = position
        entry["tr_id"] += 1
        checkpoint = {
            "file_name": file_name,
            "year": year,
            "file_seek": position,
            "header": header_text,
            "tr_id": entry["tr_id"],
        }
        DbOperation(session, db).insert_data(aux_collection, [checkpoint],
                                             "UPDATE FILE SEEK")
        return

    # Blank out the old progress text first, then print the final status.
    line_prefix = f"\r\x1b[1K\rPopulating from file '{file_name}' ({year}): "
    print_flush(line_prefix + " " * 35, end="")
    print_flush(line_prefix + "done!")
    DbOperation(session, db).delete_data(aux_collection, entry,
                                         "DROP AUX COLLECTION")
def main():
    """Run the interactive populate loop until ``handle_state`` says stop."""
    for banner in ("\n\n\n", "Populate script started\n"):
        print_flush(banner)

    use_env_files()
    populate = Populate()
    with populate:
        keep_going = handle_state(populate)
        while keep_going:
            # Blank line between consecutive prompts.
            print_flush()
            keep_going = handle_state(populate)
    # NOTE(review): assumed to run after the context manager exits; the
    # original layout was lost, so confirm the intended placement.
    print_flush("\nPopulate script stopped")
def do_query(self):
    """Write the per-region maximum physics score for 2019 and 2020 to CSV.

    Aggregates the target collection over documents whose ``year`` is 2019
    or 2020 and whose ``PHYSTESTSTATUS`` equals "Зараховано", grouping by
    (``year``, ``REGNAME``) and taking ``$max`` of ``PHYSBALL100``.  The
    result is written to ``query_result.csv`` inside ``self.fs.query_folder``
    with one line per region; a year without data is written as ``None``.

    The file is written as UTF-8 explicitly: region names are not ASCII, and
    the platform default encoding may be unable to represent them.
    """
    print_flush("Executing query...", end="")

    @db_session
    def _do_query(session, db):
        target_collection = self.get_target_collection(session, db)
        pipeline = [{
            "$match": {
                "$or": [{
                    "year": 2019
                }, {
                    "year": 2020
                }],
                "PHYSTESTSTATUS": "Зараховано"
            }
        }, {
            "$group": {
                "_id": {
                    "year": "$year",
                    "REGNAME": "$REGNAME",
                },
                # NOTE(review): the value is converted with
                # replace(",", ".") below, which suggests it is stored as a
                # string; $max would then compare lexicographically --
                # confirm all values have the same width.
                "max_ball": {
                    "$max": "$PHYSBALL100"
                }
            }
        }]

        stats = {}
        for group in target_collection.aggregate(pipeline):
            key = group["_id"]
            # Scores use a decimal comma; normalise before parsing.
            score = float(group["max_ball"].replace(",", "."))
            stats.setdefault(key["REGNAME"], {})[key["year"]] = score
        return stats

    stats = _do_query(self)

    result_path = os.path.join(self.fs.query_folder, "query_result.csv")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write("Region,MaxPhysBall100_2019,MaxPhysBall100_2020\n")
        for region, by_year in stats.items():
            f.write(f"{region},{by_year.get(2019)},{by_year.get(2020)}\n")
    print_flush(" done!")
def handle_state(populate):
    """Show the menu matching the current population state and act on it.

    The state is read once from ``populate.get_state()``.  Before running a
    chosen action (and again after a confirmation prompt) the state is
    reloaded; if it no longer matches the state the menu was built for, the
    function returns True without running the action.

    Returns:
        False when the user picks "exit" or the state is not one of
        "clear", "finished" or "interrupted"; True when the state changed
        underneath the menu; otherwise whatever the invoked action
        (``reload``, ``start``, ``resume``, ...) returns.
    """
    state = populate.get_state()

    def state_changed():
        # Reload, then report whether the state differs from the one the
        # current menu was built for.
        reload(populate)
        return populate.get_state() != state

    if state == "clear":
        while True:
            choice = ask_variants("Db is clear.\n", {
                "r": "reload state",
                "s": "start population",
                "e": "exit",
            })
            if choice == "e":
                return False
            if choice == "r":
                return reload(populate)
            if choice == "s":
                if state_changed():
                    return True
                return start(populate)
            print_flush()

    if state == "finished":
        while True:
            choice = ask_variants(
                "Looks like db is populated.\n", {
                    "r": "reload state",
                    "q": "execute test query",
                    "d": "drop db",
                    "e": "exit",
                })
            if choice == "e":
                return False
            if choice == "r":
                return reload(populate)
            if choice == "q":
                if state_changed():
                    return True
                # The query does not leave the menu; prompt again afterwards.
                populate.do_query()
            elif choice == "d":
                if state_changed():
                    return True
                if ask_confirm():
                    if state_changed():
                        return True
                    return drop_finished(populate)
            print_flush()

    if state == "interrupted":
        while True:
            choice = ask_variants(
                "Looks like the population was interrupted.\n", {
                    "r": "reload state",
                    "c": "continue population",
                    "f": "assume population was finished",
                    "d": "drop db",
                    "e": "exit"
                })
            if choice == "e":
                return False
            if choice == "r":
                # NOTE(review): unlike the other menus this reloads twice
                # when the state is unchanged; kept as in the original.
                if state_changed():
                    return True
                return reload(populate)
            if choice == "c":
                if state_changed():
                    return True
                return resume(populate)
            if choice == "f":
                if state_changed():
                    return True
                if ask_confirm():
                    if state_changed():
                        return True
                    return assume_finished(populate)
            elif choice == "d":
                if state_changed():
                    return True
                if ask_confirm():
                    if state_changed():
                        return True
                    return drop_interrupted(populate)
            print_flush()

    return False
def clear_artifacts(populate):
    """Announce the cleanup, drop the aux data via ``populate`` and return True."""
    print_flush("Clearing artifacts...")
    populate.drop_aux()
    return True
def drop_finished(populate):
    """Announce the drop, remove the target data via ``populate`` and return True."""
    print_flush("Dropping...")
    populate.drop_target()
    return True
def start(self):
    """Load the most recently checkpointed data file into the target collection.

    Each aux entry is a checkpoint for one data file: file name, year, byte
    offset, header line and a ``tr_id`` counter.  The entry with the highest
    ``tr_id`` is picked and its file is read from the saved offset in
    batches of 1000 records.  After every batch the rows are inserted, a new
    checkpoint is written and older checkpoints for that file are deleted.
    When the file is exhausted all of its checkpoints are removed and the
    method calls itself to continue with the next pending file.

    A ``{"dummy": 0}`` document is inserted into the target collection if
    absent and removed again once no checkpoints remain (presumably a marker
    that population is in progress -- confirm against ``get_state``).

    Returns:
        True once no aux entries remain and the aux collection was dropped.

    Raises:
        ValueError: if the file ends in the middle of a record, i.e. the
            accumulated text never reaches the header's field count.
    """

    @db_session
    def _get_latest_entry(session, db):
        target_collection = self.get_target_collection(session, db)
        if target_collection.find_one({"dummy": 0}) is None:
            DbOperation(session, db).insert_data(target_collection, [{
                "dummy": 0
            }], "INSERT TARGET DUMMY", use_session=False)
        aux_collection = self.get_aux_collection(session, db)
        # find_one with a sort returns the top entry or None; unlike
        # Cursor.count() it is available in both PyMongo 3.x and 4.x.
        return aux_collection.find_one(sort=[("tr_id", pymongo.DESCENDING)])

    entry = _get_latest_entry(self)
    if entry is None:
        # Nothing left to load: remove the marker and the aux collection.
        @db_session
        def _delete_dummy(session, db):
            target_collection = self.get_target_collection(session, db)
            dummy = target_collection.find_one({"dummy": 0})
            if dummy is not None:
                DbOperation(session, db).delete_data(target_collection, dummy,
                                                     "INSERT TARGET DUMMY")

        _delete_dummy(self)
        self.drop_aux()
        return True

    file_name = entry["file_name"]
    year = entry["year"]
    file_seek = entry["file_seek"]
    header_text = entry["header"]
    file_size = get_file_size(file_name)

    print_flush(f"Populating from file '{file_name}' ({year}): ", end='')
    with open(file_name, "r",
              encoding=get_file_encoding(file_name)) as file:
        if file_seek == 0:
            # First visit to this file: consume the header line and persist
            # it together with the offset of the first data record.
            entry["header"] = header_text = file.readline().strip()
            entry["file_seek"] = file.tell()

            @db_session
            def _set_header_text(session, db):
                aux_collection = self.get_aux_collection(session, db)
                DbOperation(session, db).update_data(aux_collection, entry,
                                                     "SAVE AUX HEADER")

            _set_header_text(self)
        else:
            file.seek(file_seek)

        header = [h.upper() for h in strip_arr(header_text.split(';'))]
        batch_size = 1000

        def _read_record():
            """Return the next record's fields, or None at end of file.

            Physical lines are concatenated until the field count matches
            the header, so a record broken across lines is re-joined.
            """
            pending = ""
            while True:
                raw = file.readline()
                if not raw:
                    # readline() returns "" only at end of file; testing the
                    # raw value keeps blank lines from looking like EOF.
                    if pending:
                        raise ValueError(
                            f"Unexpected end of file '{file_name}': "
                            f"incomplete record {pending!r}")
                    return None
                pending += raw.strip()
                if not pending:
                    # Blank line between records.
                    continue
                fields = [strip(value) for value in pending.split(';')]
                if len(fields) == len(header):
                    return fields

        while True:
            seek = entry["file_seek"]
            # An empty file has nothing left to read; avoid dividing by zero.
            progress = seek / file_size if file_size else 1.0
            print_flush(
                f"\rPopulating from file '{file_name}' ({year}): "
                f"{format_file_size(seek)} / {format_file_size(file_size)} "
                f"({progress:.2%})",
                end="")

            end = False
            rows = []
            for _ in range(batch_size):
                fields = _read_record()
                if fields is None:
                    end = True
                    break
                row = dict(zip(header, fields))
                row["year"] = year
                rows.append(row)

            @db_session
            def _insert_rows(session, db):
                aux_collection = self.get_aux_collection(session, db)
                target_collection = self.get_target_collection(session, db)
                DbOperation(session, db).insert_data(target_collection, rows,
                                                     "INSERT ROWS")
                if end:
                    # Blank out the progress text, then print the status.
                    print_flush(
                        f"\r\x1b[1K\rPopulating from file '{file_name}' ({year}): {' ' * 35}",
                        end="")
                    print_flush(
                        f"\r\x1b[1K\rPopulating from file '{file_name}' ({year}): done!"
                    )
                    DbOperation(session, db).delete_data(
                        aux_collection, entry, "DROP AUX COLLECTION")
                else:
                    # Write a new checkpoint with a higher tr_id.
                    entry["file_seek"] = file.tell()
                    entry["tr_id"] = entry["tr_id"] + 1
                    DbOperation(session, db).insert_data(
                        aux_collection, [{
                            "file_name": file_name,
                            "year": year,
                            "file_seek": entry["file_seek"],
                            "header": header_text,
                            "tr_id": entry["tr_id"]
                        }], "UPDATE FILE SEEK")

            _insert_rows(self)

            @db_session
            def _remove_old_aux(session, db):
                aux_collection = self.get_aux_collection(session, db)
                # Finished file: remove every checkpoint for it.  Otherwise
                # keep only the checkpoint that was just written.
                if end:
                    stale_filter = {"file_name": file_name}
                else:
                    stale_filter = {
                        "file_name": file_name,
                        "tr_id": {
                            "$ne": entry["tr_id"]
                        }
                    }
                for old_entry in aux_collection.find(stale_filter):
                    DbOperation(session, db).delete_data(aux_collection,
                                                         old_entry)

            _remove_old_aux(self)

            if end:
                # Continue with the next pending file, if any.
                return self.start()