Example #1
class DataGenClient:
    """This class is used to connect to the DataGenServer and build chunks.

    Parameters
    ----------
    host : str
        Name of the server's host.
    port : int
        Server's port number.
    target_dir : str, optional
        The client's working directory. Clients must have different working
        directories.
    chunks_per_req : int, optional
        The number of chunks wanted per request from the server.

    Note
    ----
    This class is used to connect to the DataGenServer and uses the information
    the server provides to generate appropriate fake chunks while reporting
    what chunks have been created and registered with the ingest system back to
    the server.
    """
    def __init__(self,
                 host,
                 port,
                 retry=False,
                 target_dir='fakeData',
                 chunks_per_req=1):
        self._host = host
        self._port = port
        self._name = "-1"
        self._retry = retry  # Retry connection if true
        self._target_dir = os.path.abspath(target_dir)
        self._chunksPerReq = chunks_per_req
        self._gen_arg_str = None  # Arguments from the server for the generator.
        self._cl_conn = None  # DataGenConnection
        self._cfg_file_name = 'gencfg.py'  # name of the local config file for the generator
        self._cfg_file_contents = None  # contents of the config file.
        self._pt_cfg_dir = os.path.join(
            self._target_dir,
            'partitionCfgs')  # sub-dir for partitioner configs
        self._pt_cfg_dict = None  # Dictionary that stores partitioner config files.
        self._pregen_dir = os.path.join(
            self._target_dir,
            'pregenerated')  # sub-dir for pre-generated files
        self.makeDir(self._target_dir)
        self.makeDir(self._pt_cfg_dir)
        self.makeDir(self._pregen_dir)

        # Values set from the transferred self._cfg_file_contents (see _readDatagenConfig)
        self._spec = None  # spec from exec(self._cfg_file_contents)
        self._directors = None  # directors from exec(self._cfg_file_contents)
        self._chunker = None  # chunker from exec(self._cfg_file_contents)
        self._edge_width = None  # float, width of edges in edge only generation.
        # DataGenerator, cannot be initialized until '_spec' received from server
        self._data_gen = None
        self._objects = None  # int number of objects set
        self._visits = None  # int number of visits
        self._seed = None  # int random number seed

        # Ingest values
        self._ingest = None
        self._skip_ingest = True
        self._db_name = ''
        self._transaction_id = -1
        self._keep_csv = True  # keep intermediate files for debugging

        # timing information
        self._timing_dict = TimingDict()

    def _setIngest(self, ingest_dict):
        """Create ingest object from ingest_dict values.

        Parameters
        ----------
        ingest_dict : dictionary
            Dictionary containing information about the ingest system.
            'host' : str, ingest system host name.
            'port' : int, ingest port number.
            'auth' : str, ingest authorization.
            'db'   : str, name of the database being created.
            'skip' : bool, true if ingest is being skipped.
            'keep' : bool, true if intermediate files should be kept.

        Note
        ----
        The keys in ingest_dict should match those in servRespInit and clientRespInit.
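
        For illustration only, with made-up values, an ingest_dict might
        look like::

            {'host': 'ingest-host', 'port': 25080, 'auth': '',
             'db': 'fakedb', 'skip': False, 'keep': True}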
        """
        ingd = ingest_dict
        self._ingest = DataIngest(ingd['host'], ingd['port'], ingd['auth'])
        self._skip_ingest = ingd['skip']
        self._db_name = ingd['db']
        self._keep_csv = ingd['keep']

    def createFileName(self,
                       chunk_id,
                       table_name,
                       ext,
                       edge_only=False,
                       use_targ_path=False):
        """Create a consistent file name given the input parameters.

        Parameters
        ----------
        chunk_id : int
            Chunk id number.
        table_name : str
            Name of the table related to this file.
        ext : str
            Extension of the file.
        edge_only : bool, optional
            If True, the name indicates the file only contains information
            about objects near the edges of the chunk needed for overlap.
            If False, the name indicates the file contains all objects in
            the chunk.
        use_targ_path : bool, optional
            If True, the file name will start with self._target_dir

        Returns
        -------
        fn : str
            The name to use for the file.
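
        Examples
        --------
        Illustrative only; the chunk id and table name below are arbitrary
        and client is assumed to be an existing DataGenClient instance.

        >>> client.createFileName(100, 'Object', 'csv')
        'chunk100_CT_Object.csv'
        >>> client.createFileName(100, 'Object', 'csv', edge_only=True)
        'chunk100_EO_Object.csv'
        >>> client.createFileName(100, '*', 'csv')
        'chunk100_*.csv'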
        """
        typeStr = "EO_" if edge_only else "CT_"
        # If the table_name is a wildcard, don't use typeStr
        if table_name == '*':
            typeStr = ''
        fn = 'chunk' + str(chunk_id) + '_' + typeStr + table_name + '.' + ext
        if use_targ_path:
            fn = os.path.join(self._target_dir, fn)
        return fn

    def makeDir(self, dir_name):
        """Make a directory catching the already exists exception.

        Parameters
        ----------
        dir_name : str
            Name of the directory to create.

        Return
        ------
        success : bool
            True if directory was created or already existed.
        """
        try:
            os.mkdir(dir_name)
        except OSError as err:
            if err.errno != errno.EEXIST:
                print("ERROR directory creation", dir_name, err)
                return False
        return True

    def removeFile(self, f_name):
        """Return True if the file f_name was removed, false otherwise.
        """
        print("removing file", f_name)
        try:
            os.remove(f_name)
        except OSError as err:
            print("ERROR remove failed", f_name, err)
            return False
        return True

    def runProcess(self, cmd, cwd=None):
        """Run a process.

        Parameters
        ----------
        cmd : str
            The command to be run. This includes all command line arguments.
        cwd : str, optional
            The current working directory for the command.
            If this is None, cwd will be set to self._target_dir before
            running cmd.

        Return
        ------
        process.returncode : int
            Process dependent, but non-zero usually indicates failure.
        out_str : str
            Process terminal output.
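
        Examples
        --------
        Illustrative only; the command and working directory are arbitrary,
        and client is assumed to be an existing DataGenClient instance::

            rc, out = client.runProcess("echo hello", cwd="/tmp")
            if rc != 0:
                print("command failed:", out)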
        """
        if not cwd:
            cwd = self._target_dir
        print("cwd", cwd, "cmd=", cmd)
        process = subprocess.run(cmd,
                                 cwd=cwd,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        out_str = str(process.stdout)
        if process.returncode != 0:
            print("out=", out_str)
        return process.returncode, out_str

    def _readDatagenConfig(self):
        """ Create a Chunker and spec using the same configuration file as the datagen.py.
        """
        # Load the python configuration file used to generate the synthetic data.
        # spec defines tables and columns.
        # chunker defines the partitioning scheme
        # edge_width should be at least as wide as the partitioning overlap.
        spec_globals = {}
        exec(self._cfg_file_contents, spec_globals)
        assert 'spec' in spec_globals, "Specification file must define a variable 'spec'."
        assert 'directors' in spec_globals, "Specification file must define a variable 'directors'."
        assert 'chunker' in spec_globals, "Specification file must define a variable 'chunker'."
        assert 'edge_width' in spec_globals, "Specification file must define a variable 'edge_width'."
        self._spec = spec_globals['spec']
        self._directors = spec_globals['directors']
        self._chunker = spec_globals['chunker']
        self._edge_width = spec_globals['edge_width']
        print("_cfgFileContents=", self._cfg_file_contents)
        print("_spec=", self._spec)
        self._data_gen = DataGenerator(self._spec,
                                       self._chunker,
                                       pregen_dir=self._pregen_dir)

    def findCsvInTargetDir(self, chunk_id, neighbor_chunks):
        """Find files required csv to generate overlap for chunk_id.

        Parameters
        ----------
        chunk_id : int
            The chunk which needs an overlap table.
        neighbor_chunks : list of int
            A list of all the chunk_ids for the chunks next to chunk_id.

        Return
        ------
        success : bool
            True if the required csv files for chunk_id were found.
        foundCsv : list of str
            A list of csv files for chunk_id and its neighbor chunks.
        neededEdgeOnly : list of int
            A list of chunk ids that need to be generated edge only.

        Note
        ----
        To generate the overlap file, there must be a complete csv
        file for chunk_id and either complete or edge only csv files
        for all of its neighbors.
        """
        success = True
        foundCsv = []
        neededEdgeOnly = []
        # Find the relevant files in self._targetDir.
        findCsv = self.createFileName(chunk_id, '*', 'csv', use_targ_path=True)
        # TODO: python3 has better version of glob that would make this code cleaner.
        chunkCsvPaths = glob.glob(findCsv)
        print("findCsv=", findCsv, " paths=", chunkCsvPaths)
        # Remove the path and add the files to a list
        chunkCsvFiles = []
        for fn in chunkCsvPaths:
            chunkCsvFiles.append(os.path.basename(fn))
        print("chunkCsvFiles=", chunkCsvFiles)
        # These cannot be edgeOnly, and there must be one for
        # each entry in self._spec.
        for tblName in self._spec:
            if "from_file" in self._spec[tblName]:
                print("skipping pregenerated", tblName)
                continue
            fn = self.createFileName(chunk_id,
                                     tblName,
                                     'csv',
                                     edge_only=False,
                                     use_targ_path=False)
            if fn not in chunkCsvFiles:
                print("Failed to find ", fn, "in", chunkCsvFiles)
                success = False
                return success, foundCsv, neededEdgeOnly
            foundCsv.append(fn)
        # remove chunkId from neighborChunks, if it is there
        nbChunks = [val for val in neighbor_chunks if val != chunk_id]
        # See if the neighbor chunk csv files exist. They can be complete or edge only.
        for nbCh in nbChunks:
            allTablesFound = True
            completeFound = False
            eOFound = False
            for tblName in self._spec:
                fn = self.createFileName(nbCh,
                                         tblName,
                                         'csv',
                                         edge_only=False,
                                         use_targ_path=False)
                fnEO = self.createFileName(nbCh,
                                           tblName,
                                           'csv',
                                           edge_only=True,
                                           use_targ_path=False)
                if fn in chunkCsvFiles:
                    foundCsv.append(fn)
                    completeFound = True
                elif fnEO in chunkCsvFiles:
                    foundCsv.append(fnEO)
                    eOFound = True
                else:
                    allTablesFound = False
            if completeFound and eOFound:
                print(
                    "WARN both complete and edgeOnly files found for neighbor chunk=",
                    nbCh)
                # That shouldn't have happened, maybe left over from previous run.
                # Remove all of the files for nbCh and make sure it is added
                # to neededEdgeOnly.
                if not self.removeFilesForChunk(
                        nbCh, edge_only=True, complete=True):
                    print("WARN failed to remove files for nbCh=", nbCh)
                allTablesFound = False
            if not allTablesFound:
                neededEdgeOnly.append(nbCh)
        return success, foundCsv, neededEdgeOnly

    def removeFilesForChunk(self, chunk_id, edge_only=False, complete=False):
        """Remove files for chunk_id from the target directory.

        Parameters
        ----------
        chunk_id : int
            Chunk id number for the csv and parquet files to remove.
        edge_only : bool
            When True, remove edge only csv and parquet files.
        complete : bool
            When True, remove complete csv and parquet files.

        Returns
        -------
        success : bool
            True if all files were removed.
        """
        print("removeFilesForChunk", chunk_id, edge_only, complete)
        for tblName in self._spec:
            fList = []
            if edge_only:
                fList.append(
                    self.createFileName(chunk_id,
                                        tblName,
                                        'csv',
                                        edge_only=True,
                                        use_targ_path=True))
                fList.append(
                    self.createFileName(chunk_id,
                                        tblName,
                                        'parquet',
                                        edge_only=True,
                                        use_targ_path=True))
            if complete:
                fList.append(
                    self.createFileName(chunk_id,
                                        tblName,
                                        'csv',
                                        edge_only=False,
                                        use_targ_path=True))
                fList.append(
                    self.createFileName(chunk_id,
                                        tblName,
                                        'parquet',
                                        edge_only=False,
                                        use_targ_path=True))
            for fn in fList:
                if os.path.exists(fn):
                    print(f"chunk_id={chunk_id} removing fn={fn}")
                    if not self.removeFile(fn):
                        print("ERROR remove failed", fn)
                        return False
        return True

    def _fillChunkDir(self, chunk_id, neighbor_chunks):
        """Create and fill a directory with all the csv files needed for the
        partitioner to make chunk_id.

        Parameters
        ----------
        chunk_id : int
            Chunk id for the chunk that needs an overlap table.
        neighbor_chunks : list of int
            Chunk id numbers for the chunks next to chunk_id.

        Return
        ------
        success : bool
            True indicates success.

        Note
        ----
        The directory is self._target_dir/<chunk_id> and should contain
        all the csv files needed for the partitioner to build chunk and
        overlap files for ingest. One file for each table in each chunk.
        """
        print("fillChunkDir chunkId=", chunk_id, neighbor_chunks)
        # If the chunk directory already exists, empty it.
        if not os.path.exists(self._target_dir):
            print("ERROR targetDirectory does not exist.")
            return False
        chunkIdStr = str(chunk_id)
        if chunkIdStr == "":
            print("ERROR chunkIdStr is empty")
            return False
        dirName = os.path.join(self._target_dir, str(chunk_id))
        if os.path.exists(dirName):
            # It shouldn't exist, delete it
            shutil.rmtree(dirName)
        if not self.makeDir(dirName):
            print("ERROR directory creation", dirName)
            return False
        cList = neighbor_chunks.copy()
        if chunk_id not in cList:
            cList.append(chunk_id)
        for cId in cList:
            # Only the 'CT' or 'EO' csv files should exist, so hard link
            # all csv files for the chunks.
            pattern = 'chunk' + str(cId) + '_*.csv'
            pattern = os.path.join(self._target_dir, pattern)
            fList = glob.glob(pattern)
            for fn in fList:
                linkName = os.path.basename(fn)
                linkName = os.path.join(self._target_dir, str(chunk_id),
                                        linkName)
                try:
                    os.link(fn, linkName)
                except OSError as err:
                    if err.errno != errno.EEXIST:
                        print("ERROR fillChunkDir link failed", fn, linkName)
                        return False
        return True

    def _datGenChunk(self, chunk_id, edge_only):
        """Generate the tables for chunk_id (edge only tables if edge_only
        is True) and write each table to a csv file in the target directory.
        """
        row_counts = {"CcdVisit": self._visits, "Object": self._objects}

        # ForcedSource count is defined by visits and objects.
        if ("ForcedSource" in self._spec):
            row_counts["ForcedSource"] = None

        self._data_gen.timingdict = TimingDict()
        tables = self._data_gen.make_chunk(chunk_id,
                                           edge_width=self._edge_width,
                                           edge_only=edge_only)
        self._data_gen.timingdict.increment()
        self._timing_dict.combine(self._data_gen.timingdict)
        print("tables=", tables)

        for table_name, table in tables.items():
            edge_type = "EO" if edge_only else "CT"
            fname = "chunk{:d}_{:s}_{:s}.csv".format(chunk_id, edge_type,
                                                     table_name)
            fname = os.path.join(self._target_dir, fname)
            table.to_csv(fname, header=False, index=False)

    def _generateChunk(self, chunk_id, edge_only=False):
        """Generate the csv files for a chunk.

        Parameters
        ----------
        chunk_id : int
            Chunk id number of the chunk to generate.
        edge_only : bool, optional
            True - generate an edge only chunk. The edge only chunk will not
                be created if a complete chunk already exists.
            False - generate a complete chunk, deleting any existing csv
                files for that chunk id first.

        Return
        ------
        status : str
            'success' if the chunk was made successfully
            'failed'  if a valid version of the chunk could not be made
            'exists'  if a full set of csv files for the chunk already existed
                      and edge_only=True
        """
        print(f"generateChunk chunk_id={chunk_id} edge_only={edge_only}")
        if edge_only:
            # Check for existing csv files. If a full set of complete files or
            # a full set of edge only files is found, return 'exists'.
            # If there is a full set of complete files, delete the edge only files and return.
            edgeOnlyCount = 0
            completeCount = 0
            spec = self._spec
            for tblName in spec:
                fn = self.createFileName(chunk_id,
                                         tblName,
                                         'csv',
                                         edge_only=True,
                                         use_targ_path=True)
                if os.path.exists(fn):
                    edgeOnlyCount += 1
                fn = self.createFileName(chunk_id,
                                         tblName,
                                         'csv',
                                         edge_only=False,
                                         use_targ_path=True)
                if os.path.exists(fn):
                    completeCount += 1
            spec_count = 0
            for sp in spec:
                if "from_file" not in spec[sp]:
                    spec_count += 1
            print(
                f"spec_count={spec_count} edgeOnlyCount={edgeOnlyCount} completeCount={completeCount}"
            )
            if completeCount == spec_count or edgeOnlyCount == spec_count:
                print(
                    "All expected tables already exist, will not generate. chunkid=",
                    chunk_id)
                if completeCount == spec_count:
                    print("Removing extraneous edgeOnly files")
                    if not self.removeFilesForChunk(
                            chunk_id, edge_only=True, complete=False):
                        print("WARN failed to remove extraneous csv for",
                              chunk_id)
                else:  # Not a full set of complete files
                    print("Removing extraneous complete files")
                    if not self.removeFilesForChunk(
                            chunk_id, edge_only=False, complete=True):
                        print("WARN failed to remove incomplete csv for",
                              chunk_id)
                return 'exists'
        else:
            # Delete files for this chunk if they exist.
            if not self.removeFilesForChunk(
                    chunk_id, edge_only=True, complete=True):
                print("WARN failed to remove all files for chunk=", chunk_id)
        # Generate the chunk csv files.
        try:
            self._datGenChunk(chunk_id, edge_only)
        except IndexError as ie:
            print(f"ERROR Generator failed for {chunk_id} error={ie}")
            return 'failed'
        except RuntimeError as re:
            print(f"ERROR Generator failed for {chunk_id} error={re}")
            return 'failed'
        return 'success'

    def _createRecvChunks(self, chunk_recv_set):
        """Create csv files for all tables in chunk_recv_set.

        Parameters
        ----------
        chunk_recv_set : set of int
            Set of chunk ids most recently received from the server.

        Return
        ------
        created_chunks : list of int
            List of chunk ids where all csv tables were created.
        """
        created_chunks = []
        for chunk_id in chunk_recv_set:
            # Generate the csv files for the chunk
            if self._generateChunk(chunk_id, edge_only=False) != 'failed':
                self._timing_dict.increment()  # increment the count of chunks
                created_chunks.append(chunk_id)
        return created_chunks

    def _createNeighborChunks(self, created_chunks):
        """Create neighbor chunks for all created chunks as needed.

        Parameters
        ----------
        created_chunks : list of int
            List of chunk ids that have been created from the most recent
            list sent by the server.

        Return
        ------
        have_all_csv_chunks : list of int
            List of chunk ids from created_chunks where all the necessary
            neighbor chunks could be created or already existed.

        Note
        ----
        Neighbor chunks may be edge only, but the tables in created_chunks
        must be complete chunks.
        """
        chunker = self._chunker
        have_all_csv_chunks = []
        for chunk in created_chunks:
            # Find the chunks that should be next to chunk
            neighborChunks = chunker.getChunksAround(chunk, self._edge_width)
            # Find the output files for the chunk, name must match "chunk<id>_*.csv"
            foundCsv, filesCsv, neededChunks = self.findCsvInTargetDir(
                chunk, neighborChunks)
            print("foundCsv=", foundCsv, "fCsv=", filesCsv, " needed=",
                  neededChunks)
            if not foundCsv:
                print(
                    "ERROR Problems with finding essential csv for creating overlap chunk=",
                    chunk, filesCsv)
                continue
            # Create edgeOnly neededChunks
            createdAllNeeded = True
            for nCh in neededChunks:
                genResult = self._generateChunk(nCh, edge_only=True)
                if genResult == 'failed':
                    print("ERROR Failed to generate chunk", nCh)
                    createdAllNeeded = False
                    continue
            if createdAllNeeded:
                print("Created all needed edgeOnly for ", nCh)
                # Put hardlinks to all the files needed for a chunk in
                # a specific directory for the partioner to use to create
                # the overlap tables and so on.
                if self._fillChunkDir(chunk, neighborChunks):
                    have_all_csv_chunks.append(chunk)
        return have_all_csv_chunks

    def _createOverlapTables(self, chunkId):
        """Create ingest files and pass them to the ingest system.

        Parameters
        ----------
        chunkId : int
            Chunk id number for which overlap and ingest files are created.

        Returns
        -------
        chunks_added : bool
            True if chunks were added to the transaction for ingest.

        Note
        ----
        This needs to be done for each table in the chunk which has a
        matching partitioner configuration file. The partitioner creates
        files for all input chunks, chunk_id and all of its neighbors.
        Once all the ingest and overlap files for the target chunk
        have been made, all the extra files are deleted to prevent the
        possible ingest of duplicate data.
        """
        # Everything happens in the ovlDir directory
        ovlDir = os.path.join(self._target_dir, str(chunkId))
        entries = os.listdir(ovlDir)
        files = []
        for e in entries:
            e_path = os.path.join(ovlDir, e)
            if os.path.isfile(e_path):
                fstats = os.stat(e_path)
                if fstats.st_size > 0:
                    files.append(os.path.basename(e))
                else:
                    print(f"file {e_path} has size zero {fstats.st_size}")
            else:
                print("not a file ", os.path.join(ovlDir, e))
        if not files:
            print("No files with data were found, nothing to partition")
            return False
        # For each configuration file in self._pt_cfg_dict, run something like this for Object chunk 0:
        # sph-partition -c (cfgdir)/Object.cfg --mr.num-workers 1 --out.dir outdirObject
        # --in.path chunk0_CT_Object.csv --in.path chunk402_CT_Object.csv
        # --in.path chunk401_CT_Object.csv --in.path chunk400_CT_Object.csv
        # --in.path chunk404_EO_Object.csv --in.path chunk403_CT_Object.csv
        # Determine which tables need to be created first.
        info_list = []
        for director, children in self._directors.items():
            # Make the director table and index
            cfg = self._pt_cfg_dict[director]
            cfg_path = cfg[0]
            index_path = self._callPartitioner(chunkId, director, cfg_path,
                                               ovlDir, files, info_list)
            # create child tables using index_path
            for child in children:
                cfg = self._pt_cfg_dict[child]
                cfg_path = cfg[0]
                if not self._callPartitioner(chunkId, child, cfg_path, ovlDir,
                                             files, info_list, index_path):
                    raise RuntimeError("Error calling partitioner")

        # Add the tables to the ingest transaction
        for info in info_list:
            print("info=", info, "0=", info[0], "1=", info[1])
            st_time = self._timing_dict.start()
            self._addChunkToTransaction(chunkId, table=info[0], f_path=info[1])
            self._timing_dict.end("ingest", st_time)
        return True

    def _callPartitioner(self,
                         chunk_id,
                         tbl_name,
                         cfg_fname,
                         ovl_dir,
                         files,
                         info_list,
                         index_path=None):
        """ Call sph-partition to create '.txt' files for ingest.

        Parameters
        ----------
        chunk_id : int
            Chunk id number.
        tbl_name : str
            Table name.
        cfg_fname : str
            Configuration file name.
        ovl_dir : str
            Overlap directory.
        files : list of str
            List of generated csv files in ovl_dir.
        info_list : list of tuple
            Output list to which (table name, file path) tuples for the
            files that should be ingested are appended.
        index_path : str, optional
            Full path of the index.txt file that should be used to determine
            what chunk child table entries belong to.
            If this is None, the current table is a director table and an
            index file should be generated.

        Returns
        -------
        index_path : str
            The full path to the index file created for this chunk, or the
            path used to determine to which chunk child table rows belong.
        """
        print(f"callPartitioner {chunk_id}, {tbl_name}, {cfg_fname}")
        st_time = self._timing_dict.start()
        # The list of --in.path files needs to be generated. It
        # needs to have all the .csv files for tbl_name.
        inCsvFiles = []
        reg = re.compile(r"chunk\w*_" + tbl_name + r"\.csv")
        for f in files:
            m = reg.match(f)
            if m:
                inCsvFiles.append(f)
        inStr = ""
        cfgFPath = os.path.join(self._pt_cfg_dir, cfg_fname)
        outDir = os.path.join(ovl_dir, "outdir" + tbl_name)

        if not inCsvFiles:
            print(f"No files with data for table {tbl_name} were found")
            self._timing_dict.end("overlap", st_time)
            return index_path

        for csv in inCsvFiles:
            inStr += " --in.path " + csv
        # If index_path empty or undefined, this must be a director table.
        index_name = f"chunk_{tbl_name.lower()}_index.txt"
        if not index_path:
            id_url = ""
            index_path = os.path.join(outDir, index_name)
        else:
            id_url = f"--part.id-url=file://{index_path}"

        # Put the pieces of the command together and call the partitioner.
        cmd = "sph-partition -c " + cfgFPath + " --mr.num-workers 1 "
        cmd += id_url + " --out.dir " + outDir + " " + inStr
        genResult, genOut = self.runProcess(cmd, cwd=ovl_dir)
        if genResult != 0:
            # Raise exception and leave data for diagnostics.
            raise RuntimeError("ERROR failed to create chunk and overlap " +
                               genOut + " cmd=" + cmd)
        # Delete the .txt files for files other than chunk_id
        # and chunk_index.txt in outDir.
        entries = os.listdir(outDir)
        reg = re.compile(r"^chunk_" + str(chunk_id) + r"(_overlap)?\.txt$")
        for ent in entries:
            fn = os.path.basename(ent)
            full_path = os.path.join(outDir, ent)
            m = reg.match(fn)
            if m:
                print("keeping ", fn, tbl_name, full_path)
                info_list.append((tbl_name, full_path))
            else:
                if fn == index_name:
                    print("keeping index ", full_path)
                else:
                    os.remove(full_path)
        self._timing_dict.end("overlap", st_time)
        return index_path

    def _addChunkToTransaction(self, chunk_id, table, f_path):
        """ Add chunk-table file to the transaction or raise a RuntimeError.

        Parameters
        ----------
        chunk_id : int
            Chunk id number of the file to add to the transaction.
        table : str
            Name of the table in the file to add to the transaction.
        f_path : str
            The full path to file to add to the transaction.

        Return
        ------
        out_str : str
            Output string from the ingest system, or the tuple (0, 'skip')
            if ingest is being skipped.

        Note
        ----
        The called functions raise RuntimeErrors if they fail.
        """
        if self._skip_ingest:
            print("skipping ingest", chunk_id, table, f_path)
            return 0, 'skip'
        t_id = self._transaction_id
        host, port = self._ingest.getChunkTargetAddr(t_id, chunk_id)
        print("Sending to", host, ":", port, "info", t_id, table, chunk_id,
              f_path)
        out_str = self._ingest.sendChunkToTarget(host, port, t_id, table,
                                                 f_path)
        print("Added to Transaction ", host, ":", port, "info", out_str)
        return out_str

    def _sendIngestedChunksToServer(self, chunks_to_send):
        """Send chunk ids back to the server until the list is empty.

        Parameters
        ----------
        chunks_to_send : list of int
            List of ingested chunk id numbers to send back to the server to
            indicate that they have been ingested. The list is destroyed as
            it is sent.

        Note
        ----
        If the initial 'chunks_to_send' list is empty, it is important to send
        it to the server to indicate there was a local problem and that the
        server should abandon this connection.
        """
        while True:
            chunks_to_send = self._cl_conn.clientReportChunksComplete(
                chunks_to_send)
            if len(chunks_to_send) == 0:
                break

    def deleteAllKeepConfig(self):
        """Since ingest is complete for this batch, delete everything
        that isn't a configuration file.
        """
        # Remove the chunk files
        path = os.path.join(self._target_dir, 'chunk*.csv')
        print("deleting ", path)
        files = glob.glob(path)
        for f in files:
            os.remove(f)
        # Remove the chunk sub directories, including their files.
        dirs = os.listdir(self._target_dir)
        for entry in dirs:
            full_path = os.path.join(self._target_dir, entry)
            if entry.isdecimal() and os.path.isdir(full_path):
                print("deleting dir", full_path)
                shutil.rmtree(full_path)

    def run(self):
        """Connect to the server and do everything until the server
        runs out of chunks for this client to generate and ingest.
        """
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            print(f"host={self._host} port={self._port}")
            connected = False
            while not connected:
                try:
                    s.connect((self._host, self._port))
                    connected = True
                except socket.error:
                    print(
                        f"socket failed to connect {self._host}:{self._port}")
                    if not self._retry:
                        exit(1)
                    time.sleep(5)
            self._cl_conn = DataGenConnection(s)
            self._cl_conn.clientReqInit()
            cri = self._cl_conn.clientRespInit()
            self._name = cri[0]
            self._objects = cri[1]
            self._visits = cri[2]
            self._seed = cri[3]
            self._cfg_file_contents = cri[4]
            ingest_dict = cri[5]
            print("ingest_dict=", ingest_dict)
            self._setIngest(ingest_dict)
            print("cfg_file_contents:\n", self._cfg_file_contents)
            print(
                f'name={self._name} objects={self._objects} visits={self._visits} seed={self._seed} '
                f'skip_ingest={self._skip_ingest}')
            # Read the datagen config file to get access to an identical chunker and spec.
            self._readDatagenConfig()
            # Write the configuration file
            fileName = os.path.join(self._target_dir, self._cfg_file_name)
            with open(fileName, "w") as fw:
                fw.write(self._cfg_file_contents)
            cfg_success, pCfgDict = self._cl_conn.clientGetFiles(
                "partition cfg")
            if not cfg_success:
                raise RuntimeError(
                    "Client failed to receive partitioner config files.")
            self._pt_cfg_dict = {}
            for cfg in pCfgDict.items():
                cfg_fname = cfg[1][0]
                # Table name should be config name with extension removed.
                ext = PurePosixPath(cfg_fname).suffix
                if ext != ".cfg":
                    raise RuntimeError(
                        f"Unexpected partitioner config file sent {cfg_fname}")
                table_name = PurePosixPath(cfg_fname).stem
                self._pt_cfg_dict[table_name] = cfg[1]
            # Write those files to the partitioner config directory
            for index, cfg_info in pCfgDict.items():
                pCfgName = os.path.join(self._pt_cfg_dir, cfg_info[0])
                print("writing config", index, "name=", pCfgName)
                with open(pCfgName, "w") as fw:
                    fw.write(cfg_info[1])

            # Read in pregenerated files
            pregen_success, pregen_dict = self._cl_conn.clientGetFiles(
                "pregen files")
            if not pregen_success:
                raise RuntimeError(
                    "Client failed to receive pregenerated files.")
            # Write pregenerated files to their directory
            for index, file_info in pregen_dict.items():
                pregen_name = os.path.join(self._pregen_dir, file_info[0])
                print("writing pregen", index, "name=", pregen_name)
                with open(pregen_name, "w") as fw:
                    fw.write(file_info[1])

            # Start creating and ingesting chunks.
            loop = True
            while loop:
                self._cl_conn.clientReqChunks(self._chunksPerReq)
                transaction_id, chunkListRecv, problem = self._cl_conn.clientRecvChunks(
                )
                self._transaction_id = transaction_id
                print("transaction_id = ", self._transaction_id)
                if problem:
                    print("WARN there was a problem with", chunkListRecv)
                chunkRecvSet = set(chunkListRecv)
                if len(chunkRecvSet) == 0:
                    # no more chunks, close the connection
                    print("No more chunks to create, exiting")
                    loop = False
                    break
                withOverlapChunks = []
                ingestedChunks = []
                # Create chunks received in the list
                st_time = self._timing_dict.start()
                createdChunks = self._createRecvChunks(chunkRecvSet)
                self._timing_dict.end("primeChunks", st_time)
                # Create edge only chunks as needed.
                st_time = self._timing_dict.start()
                haveAllCsvChunks = self._createNeighborChunks(createdChunks)
                self._timing_dict.end("neighborChunks", st_time)
                # Generate overlap tables and files for ingest (happens within
                # the transaction).
                # Start the transaction
                abort = False
                try:
                    for chunk in haveAllCsvChunks:
                        self._createOverlapTables(chunk)
                        print("created overlap for chunk", chunk)
                        withOverlapChunks.append(chunk)
                except Exception as exc:
                    # Abort the transaction if possible.
                    print("ERROR transaction failed ", exc)
                    traceback.print_exc()
                    abort = True
                if not abort:
                    for chunk in withOverlapChunks:
                        ingestedChunks.append(chunk)
                else:
                    print("Failed ingest, transaction failed or aborted ")
                # If no chunks were created, likely fatal error. Asking for more
                # chunks to create would just cause more problems.
                if len(ingestedChunks) == 0:
                    print(
                        "ERROR no chunks were successfully ingested, ending program"
                    )
                    loop = False

                # client sends timing info back to server.
                print(self._timing_dict.report())
                self._cl_conn.clientReportTiming(self._timing_dict)
                self._timing_dict = TimingDict()

                # Client sends the list of completed chunks back
                self._sendIngestedChunksToServer(ingestedChunks)

                # Remove files and directories if specified
                if not self._keep_csv:
                    self.deleteAllKeepConfig()
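
A minimal usage sketch for DataGenClient (the host name, port number, and
chunks_per_req value below are illustrative assumptions, not values taken
from the example above):

if __name__ == "__main__":
    # Connect to a DataGenServer assumed to be listening on localhost:13042,
    # retrying until the server accepts the connection.
    client = DataGenClient("localhost", 13042, retry=True,
                           target_dir="fakeData", chunks_per_req=5)
    client.run()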
Example #2
class DataGenServer:
    """This class is meant to provide clients with the information needed
    to generate chunks.

    Parameters
    ----------
    cfg_file_name : str
        The name of the server configuration file.
    chunk_logs_in : ChunkLogs
        Data from previously generated log files or user input that identifies
        which chunks should be generated.
    log_dir : str
        Directory where chunk logs will be written. If it is None, no log
        files will be written. Empty string is valid.
    skip_ingest : bool
        When true, do not try to pass generated files to the ingest system.
    skip_schema : bool
        When true, expect attempts to send schemas to ingest to fail.
    keep_csv : bool
        When true, hold onto intermediate files and directories instead of
        deleting them.

    Notes
    -----
    This class is meant to provide clients with names, fake data configuration,
    and chunks that need to be generated, while also keeping track of what has
    been generated and where. The replicator should be able to identify duplicate
    chunks and mismatched chunks, so this process will not be concerned
    with that, but will avoid generating duplicates when possible.

    cfg_file_name contains our port number and the command line arguments
    to be sent to the fake data generating program. The contents of
    fake_cfg_file_name will be copied to the clients and passed
    to the fake data generating program. Failures creating this object
    should terminate the program.
    """
    def __init__(self, cfg_file_name, chunk_logs_in, log_dir, skip_ingest,
                 skip_schema, keep_csv):
        self._cfgFileName = cfg_file_name
        # base directory for other configuration files
        self._base_cfg_dir = os.path.dirname(self._cfgFileName)
        print("base_cfg_dir=", self._base_cfg_dir)
        # Set of all chunkIds to generate. sphgeom::Chunker is used to limit
        # the list to valid chunks.
        self._skip_ingest = skip_ingest
        self._skip_schema = skip_schema
        self._keep_csv = keep_csv
        # Set to false to stop accepting and end the program
        self._loop = True
        # Sequence count, incremented to provide unique client names
        self._sequence = 1
        # lock to protect _sequence, _clients
        self._client_lock = threading.Lock()
        # Store timing data from clients
        self._timing_dict = TimingDict()
        self._times_lock = threading.Lock()

        # Read configuration to set other values.
        with open(self._cfgFileName, 'r') as cfgFile:
            self._cfg = yaml.safe_load(cfgFile)
            print("cfg", self._cfg)
        # The port number the host will listen to.
        self._port = self._cfg['server']['port']

        # The arguments that will be passed from server to
        # clients to dax_data_generator/bin/datagen.py.
        self._visits = self._cfg['fakeDataGenerator']['visits']
        self._objects = self._cfg['fakeDataGenerator']['objects']
        self._seed = self._cfg['fakeDataGenerator']['seed']
        print(
            f'port={self._port} objects={self._objects} visits={self._visits}')

        # The name and contents of the configuration file that will be passed
        # from server to clients to dax_data_generator/bin/datagen.py.
        fake_cfg_file_name = os.path.join(
            self._base_cfg_dir, self._cfg['fakeDataGenerator']['cfgFileName'])
        print("fake_cfg_file_name", fake_cfg_file_name)
        with open(fake_cfg_file_name, 'r') as file:
            self._fakeCfgData = file.read()
        print("fake_cfg_data=", self._fakeCfgData)

        # Get the directory containing partitioner configuration files.
        partioner_cfg_dir = os.path.join(self._base_cfg_dir,
                                         self._cfg['partitioner']['cfgDir'])
        print("partioner_cfg_dir=", partioner_cfg_dir)

        # Read all the files in that directory and their contents.
        self._partioner_cfg_dict = self._readPartionerCfgDir(partioner_cfg_dir)

        # Get ingest system information
        transaction_size = self._cfg['fakeDataGenerator']['transaction_size']
        self._db_name = self._cfg['ingest']['dbName']
        ingest_host = self._cfg['ingest']['host']
        ingest_port = self._cfg['ingest']['port']
        ingest_auth = self._cfg['ingest']['authKey']
        if ingest_auth is None:
            ingest_auth = ''
        self._ingest_dict = {
            'host': ingest_host,
            'port': ingest_port,
            'auth': ingest_auth,
            'db': self._db_name,
            'skip': self._skip_ingest,
            'keep': self._keep_csv
        }
        # Read ingest config files.
        self._ingest_cfg_dir = os.path.join(self._base_cfg_dir,
                                            self._cfg['ingest']['cfgDir'])
        print("ingest addr=", ingest_host, ":", ingest_port)
        print("ingest cfg dir=", self._ingest_cfg_dir)
        self._ingest = DataIngest(ingest_host, ingest_port, ingest_auth)

        # List of client connection threads
        self._client_threads = []
        # Dictionary of clients by client_id
        self._clients = {}

        # Build dictionary of info for chunks to send to workers.
        spec_globals = {}
        exec(self._fakeCfgData, spec_globals)
        assert 'spec' in spec_globals, "Specification file must define a variable 'spec'."
        assert 'chunker' in spec_globals, "Specification file must define a variable 'chunker'."
        # Determine pregenerated file directory
        pregenerated_dir = os.path.join(self._base_cfg_dir,
                                        self._cfg['pregenerated']['cfgDir'])
        # Find all tables that have "from_file" defined and put them in a list so they can be sent.
        self._pregen_file_dict = self._readPreGeneratedFiles(
            pregenerated_dir, spec_globals['spec'])
        # Read in chunker info
        chunker = spec_globals['chunker']
        self._chunk_tracking = ChunkTracking(chunker, chunk_logs_in,
                                             transaction_size, skip_ingest,
                                             skip_schema, log_dir,
                                             self._ingest_dict)

        # Track all client connections so it is possible to
        # determine when the server's job is finished.
        self._active_client_count = 0
        self._active_client_mtx = threading.Lock()

    def chunksToSendTotal(self):
        """Return the total number of chunks to send.
        """
        return self._chunk_tracking.get_chunks_to_send_total()

    def _readPartionerCfgDir(self, partioner_cfg_dir):
        """Read in all the files ending with cfg in partioner_cfg_dir.

        Parameters
        ----------
        partioner_cfg_dir : str
            The directory containing partitioner config files.

        Returns
        -------
        dictionary :
            Keys are sequential integers starting at 0
            Values are tuples of file name and file contents.

        Note
        ----
        All the files ending with '.cfg' will be read in and entries
        for them will be put in a dictionary with integer keys, and
        values being a tuple of the file name and file contents. The keys
        must be sequential and start at 0, as the clients ask for them
        by number starting at 0.
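
        For illustration only (file names and contents are made up), the
        returned dictionary has the form::

            {0: ('Object.cfg', '<contents of Object.cfg>'),
             1: ('Source.cfg', '<contents of Source.cfg>')}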
        """
        entries = os.listdir(partioner_cfg_dir)
        files = []
        for e in entries:
            if os.path.isfile(os.path.join(partioner_cfg_dir, e)):
                ext = os.path.splitext(e)[1]
                if ext == '.cfg':
                    files.append(os.path.basename(e))
        print("partitionCfg files=", files, entries)
        file_dict = {}
        index = 0
        for f in files:
            fName = os.path.join(partioner_cfg_dir, f)
            with open(fName, 'r') as file:
                file_data = file.read()
                file_dict[index] = (f, file_data)
                index += 1
        print("file_dict", file_dict)
        return file_dict

    def _readPreGeneratedFiles(self, pregenerated_dir, spec_globals):
        """ Read in pregenerated files.

        Parameters
        ----------
        pregenerated_dir : str
            Directory where all pregenerated files can be found.
        spec_globals : dictionary
            Configuration dictionary containing the specifications for
            the tables that need to be generated.

        Returns
        -------
        dict :
            Keys are sequential integers starting at 0.
            Values are tuples of file name and file contents.

        Note
        ----
        All tables with "from_file" defined in spec_globals will
        get an entry in this dictionary. Any problems finding the
        files will raise an exception and likely crash the server.
        """
        pregen_file_names = []
        for tbl in spec_globals:
            if "from_file" in spec_globals[tbl]:
                pregen_file_names.append(spec_globals[tbl]["from_file"])
        file_dict = {}
        index = 0
        for f in pregen_file_names:
            fname = os.path.join(pregenerated_dir, f)
            with open(fname, 'r') as file:
                contents = file.read()
                file_dict[index] = (f, contents)
                index += 1
        print("pregenerated rows=", len(file_dict))
        return file_dict

    def _servAccept(self):
        """Accept connections from clients, spinning up a new thread
        to handle each one. This ends when there are no more chunk ids
        to send and all threads have joined.
        """
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(('', self._port))
            s.listen()
            while self._loop:
                conn, addr = s.accept()
                print('Connected by', addr)
                if self._loop:
                    # start new thread
                    self._client_lock.acquire()
                    clientName = 'client' + str(self._sequence)
                    self._sequence += 1
                    self._client_lock.release()
                    print("starting thread", clientName, conn, addr)
                    thrd = threading.Thread(target=self._servToClient,
                                            args=(clientName, conn, addr))
                    self._client_threads.append(thrd)
                    with self._active_client_mtx:
                        self._active_client_count += 1
                    thrd.start()
        print("Accept loop shutting down")
        for j, thrd in enumerate(self._client_threads):
            print("joining thread", j)
            thrd.join()
        print("All threads joined.")

    def _servToClient(self, name, conn, addr):
        """Handle the requests of a single client.

        Parameters
        ----------
        name : string
            The client's name.
        conn : socket connection
            The socket connection to the client.
        addr : string
            The IP address of the client.

        Notes
        -----
        The requests from the client follow this pattern:
        Initialize - provide the client with its name and command line arguments,
            along with the configuration files for datagen.py, sph-partition, etc.
        Repeated until the client disconnects -
            Requests for chunkIds to generate -
                The client will disconnect if the server sends
                it an empty list of chunkIds.
            Client responds with successfully generated chunkIds.
        Any chunkIds assigned to the client but not in the list of
        completed chunks are put in LIMBO.
        """
        # Connection and communication exceptions are caught so
        # other connections can continue.
        out_of_chunks = False
        try:
            print('Connected by', addr, name, conn)
            sv_conn = DataGenConnection(conn)
            with self._client_lock:
                self._clients[name] = addr
            # receive init from client
            sv_conn.servReqInit()
            # server sending back configuration information
            sv_conn.servRespInit(name, self._objects, self._visits, self._seed,
                                 self._fakeCfgData, self._ingest_dict)
            # client requests partioner configuration files
            sv_conn.servSendFiles(self._partioner_cfg_dict)

            # Send the pregenerated files to the client
            sv_conn.servSendFiles(self._pregen_file_dict)

            # client requesting chunk list
            client_times = None
            transaction_id = -9999999  # Obviously invalid value, must be negative.
            while self._loop and not out_of_chunks:
                clientReqChunkCount = sv_conn.servRecvReqChunks()
                chunksForClient, transaction_id = self._chunk_tracking.get_chunks_for_client(
                    name, addr, clientReqChunkCount)
                sv_conn.servSendChunks(chunksForClient, transaction_id)
                if len(chunksForClient) == 0:
                    print("out of chunks to send, nothing more to send")
                    out_of_chunks = True
                    conn.close()
                else:
                    # receive timing information from client
                    client_times = sv_conn.servRecvTiming()
                    if client_times:
                        print("client times ", client_times.report())
                        with self._times_lock:
                            self._timing_dict.combine(client_times)
                    # receive completed chunks from client
                    completed_chunks = []
                    finished = False
                    while not finished:
                        completedC, finished, problem = sv_conn.servRecvChunksComplete(
                        )
                        print("serv got", completedC, finished, problem)
                        completed_chunks.extend(completedC)
                    # Pass the client results to chunk tracking
                    self._chunk_tracking.client_results(
                        transaction_id, chunksForClient, completed_chunks)
        except socket.gaierror as e:
            print("breaking connection", addr, name, "socket.gaierror:", e)
            self._chunk_tracking.abort_and_close(transaction_id)
        except socket.error as e:
            print("breaking connection", addr, name, "socket.error:", e)
            self._chunk_tracking.abort_and_close(transaction_id)
        except DataGenError as e:
            print("breaking connection", addr, name, "DataGenError:", e.msg)
            self._chunk_tracking.abort_and_close(transaction_id)

        print("_servToClient loop is done", addr, name)
        # Decrement the number of running client connections and
        # possibly end the program.
        with self._active_client_mtx:
            self._active_client_count -= 1
            if self._active_client_count == 0 and out_of_chunks:
                # Connect to our own socket to get past the accept
                # and break the loop.
                self._loop = False
                with socket.socket(socket.AF_INET,
                                   socket.SOCK_STREAM) as termSock:
                    termSock.connect(('127.0.0.1', self._port))

    def connectToIngest(self):
        """Test if ingest is available and send database info if it is.

        Return
        ------
        success : bool
            False if required information could not be sent to ingest.
            _skip_ingest and _skip_schema can reduce or eliminate
            the information that needs to be sent.
        """
        if self._skip_ingest:
            print("Skipping ingest connect")
            return True
        if not self._ingest.isIngestAlive():
            raise RuntimeError("Failed to contact ingest", self._ingest)

        if self._skip_schema:
            print("Skipping database and schema file ingest.")
            return True
        db_jfile = self._db_name + ".json"
        db_jpath = os.path.join(self._ingest_cfg_dir, db_jfile)
        print("sending db config to ingest", db_jpath)
        if not self._ingest.registerDatabase(db_jpath):
            raise RuntimeError("Failed to send database to ingest.", db_jpath,
                               self._ingest)
        # Find all of the schema files in self._ingest_cfg_dir while
        # ignoring the database config file and file names ending in '_template'.
        entries = os.listdir(self._ingest_cfg_dir)
        files = []
        for e in entries:
            # Skip '_template.json' files
            reg = re.compile(r".*_template\.json$")
            m = reg.match(e)
            if m:
                continue
            full_path = os.path.join(self._ingest_cfg_dir, e)
            if os.path.isfile(full_path):
                ext = os.path.splitext(e)[1]
                if ext == '.json':
                    fname = os.path.basename(e)
                    if not fname == db_jfile:
                        files.append(full_path)
        # Send each config file to ingest
        for f in files:
            print("Sending schema file to ingest", f)
            if not self._ingest.registerTable(f):
                raise RuntimeError("Failed to send schema file to ingest", f)
        return True

    def start(self):
        """Start the server and print the results.
        """
        print("Registering database and schema with ingest system.")
        self.connectToIngest()
        print("starting")
        self._servAccept()
        print("Done, generated ",
              self._chunk_tracking.get_total_chunks_generated())

        print(
            "chunks failed chunks:",
            self._chunk_tracking.chunksInState(
                [GenerationStage.LIMBO, GenerationStage.ASSIGNED]))
        print(self._chunk_tracking._chunk_logs.report())

        info_r = self._chunk_tracking.get_chunk_info_report()
        print(info_r)
        print("\n", self._timing_dict.report())
        # Publish database if all chunks were generated.
        if self._skip_ingest:
            print("skipping publishing")
            return
        if self._chunk_tracking.is_successful_ingest():
            print("All chunks generated and ingested, publishing",
                  self._db_name)
            success, status, r_json = self._ingest.publishDatabase(
                self._db_name)
            if success:
                print("Published", self._db_name)
            else:
                print("ERROR failed to publish", self._db_name, status, r_json)
        else:
            print("Not publishing due to incomplete data/creation/ingestion")