def check_tar_status_and_delete(db, record, days=60, dryrun=False):
    """Check run.tar status and delete the tar ball if it is older than `days`
    """
    run_num = record['run']
    if record['deletion'].get('tar') == "locked":
        LOGGER.info("%s tar ball creation in progress", run_num)
        return None
    relative_days = relative_isoformat_time(
        record['deletion'].get('timestamp_tar'))
    if relative_days > days:
        if record['deletion'].get('status') == "locked":
            LOGGER.info("Deletion of %s tar ball in progress", run_num)
            return None
        if dryrun:
            LOGGER.info("Skipping deletion of %s due to dryrun option", run_num)
            return None
        # set deletion.status = locked, update deletion.timestamp
        res = db.update_one({"run": run_num}, {
            "$set": {
                "deletion.status": "locked",
                "deletion.timestamp": generate_timestamp()
            }
        })
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        # delete tar ball
        tar_file = record['deletion'].get('tar')
        assert os.path.exists(tar_file), (
            "The tar ball {} does not exist".format(tar_file))
        try:
            os.remove(tar_file)
        except OSError as e:
            LOGGER.critical("Error: %s - %s.", e.filename, e.strerror)
        # unset deletion.tar and deletion.timestamp_tar
        res = db.update_one(
            {"run": run_num},
            {"$unset": {
                "deletion.tar": "",
                "deletion.timestamp_tar": ""
            }})
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        # set deletion.status = deleted, update deletion.timestamp
        res = db.update_one({"run": run_num}, {
            "$set": {
                "deletion.status": "deleted",
                "deletion.timestamp": generate_timestamp()
            }
        })
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        LOGGER.info("Deleted the tar ball for %s", run_num)
        return True
def put_file_into_database(self, name, file):
    # replace any existing file with the same name, then store the new one
    if self.__fs.exists({"filename": name}):
        file_to_delete_id = self.__fs.find_one({"filename": name})._id
        self.__fs.delete(file_to_delete_id)
    self.__fs.put(file, filename=name, uploadDateCET=generate_timestamp())
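# A minimal usage sketch (assumption, not from the original source): the
# class's self.__fs is taken to be a gridfs.GridFS instance backed by a
# pymongo database; the database and file names below are illustrative only.
import gridfs
from pymongo import MongoClient

fs = gridfs.GridFS(MongoClient()["filestore"])
with open("report.pdf", "rb") as fh:
    # arbitrary keyword arguments such as uploadDateCET are stored as
    # metadata on the GridFS file document
    fs.put(fh, filename="report.pdf", uploadDateCET=generate_timestamp())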
def create_run_tar(db, run_num):
    """compress the bcl directory into a tar ball
    """
    # set deletion.tar = locked, update deletion.timestamp_tar
    res = db.update_one({"run": run_num}, {
        "$set": {
            "deletion.tar": "locked",
            "deletion.timestamp_tar": generate_timestamp()
        }
    })
    assert res.modified_count == 1, (
        "Modified {} documents instead of 1".format(res.modified_count))
    # create tar ball and md5sum
    rundir = get_bcl_runfolder_for_runid(run_num)
    assert os.path.isdir(rundir), (
        "The run directory {} does not exist".format(rundir))
    run_tar = "/mnt/projects/userrig/BENCHMARK_testing/test/" + run_num + ".tar"
    LOGGER.info("compression started %s", run_tar)
    with tarfile.open(run_tar, "x") as tar:
        tar.add(rundir)
    md5sum_cmd = 'md5sum %s' % run_tar
    dest_md5sum = "/mnt/projects/userrig/BENCHMARK_testing/test/" + run_num + ".md5sum"
    assert os.path.exists(run_tar), (
        "Tar ball {} does not exist".format(run_tar))
    try:
        with open(dest_md5sum, "w") as f:
            # check_call (instead of call) so that a non-zero exit status
            # actually raises CalledProcessError and is handled below
            subprocess.check_call(md5sum_cmd, shell=True,
                                  stderr=subprocess.STDOUT, stdout=f)
        LOGGER.info("compression completed %s", run_num)
        # delete bcl directory
        ## FIXME finally
        #shutil.rmtree(rundir)
    except (subprocess.CalledProcessError, OSError) as e:
        LOGGER.fatal("The following command failed: %s (%s)", md5sum_cmd, e)
        LOGGER.fatal("Exiting")
        sys.exit(1)
    LOGGER.info("Deletion of bcl directory completed for %s", run_num)
    # set deletion.tar = filename, update deletion.timestamp_tar
    res = db.update_one({"run": run_num}, {
        "$set": {
            "deletion.tar": run_tar,
            "deletion.timestamp_tar": generate_timestamp()
        }
    })
    assert res.modified_count == 1, (
        "Modified {} documents instead of 1".format(res.modified_count))
def __init__(self, project_name, username, config_path):
    """The constructor initializes the config file, the project and the
    user name.

    :param project_name: Project name to be assigned
    :param username: Username to be attached to this project. This will
        later on be used for security purposes
    :param config_path: Path of the config
    """
    config_parser = ConfigHandler(config_path, project_name=project_name)
    self.compute_config = config_parser.get_compute_config()
    self.storage_config = config_parser.get_storage_config()
    self.queue_config = config_parser.get_queue_config()
    self.master_node_config = config_parser.get_master_node_config()
    self.compute_ports = config_parser.get_ports()
    self.project_name = project_name
    self.username = username
    self.experiment_id = generate_timestamp("experiment")
    logger.info(f"Experiment ID: {self.experiment_id}\n")
    self.experiment_dir = self.initialize_folders()
    self._initialize_bucket_structure()
    self.initialize_queue()
    self._create_completion_submission_docker_compose()
    self.completion_service_process = \
        self.initialize_completion_service()
    self.compute_managers = {}
    self.create_instances()
def from_consumer_and_token(
        oauth_consumer,
        token=None,
        callback=None,
        verifier=None,
        http_method=HTTP_METHOD,
        http_url=None,
        parameters=None,
):
    if not parameters:
        parameters = {}
    defaults = {
        "oauth_consumer_key": oauth_consumer.key,
        "oauth_timestamp": generate_timestamp(),
        "oauth_nonce": generate_nonce(),
        "oauth_version": OAuthRequest.version,
    }
    defaults.update(parameters)
    parameters = defaults
    if token:
        parameters["oauth_token"] = token.key
        if token.callback:
            parameters["oauth_callback"] = token.callback
        # 1.0a support for verifier.
        if verifier:
            parameters["oauth_verifier"] = verifier
    elif callback:
        # 1.0a support for callback in the request token request.
        parameters["oauth_callback"] = callback
    return OAuthRequest(http_method, http_url, parameters)
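# Hedged usage sketch (not from the original source): assumes OAuthConsumer
# and OAuthToken are plain key/secret holders and that this builder is
# exposed as a static method on OAuthRequest, as in the classic python-oauth
# library; all key/secret/URL values below are placeholders.
consumer = OAuthConsumer(key="consumer-key", secret="consumer-secret")
token = OAuthToken(key="token-key", secret="token-secret")
request = OAuthRequest.from_consumer_and_token(
    consumer,
    token=token,
    http_method="GET",
    http_url="https://api.example.com/resource")
# oauth_timestamp and oauth_nonce are filled in automatically via
# generate_timestamp() and generate_nonce()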
async def send_speech_config_msg(self):
    # assemble the payload for the speech.config message
    context = {
        'system': {
            'version': '5.4'
        },
        'os': {
            'platform': platform.system(),
            'name': platform.system() + ' ' + platform.version(),
            'version': platform.version()
        },
        'device': {
            'manufacturer': 'SpeechSample',
            'model': 'SpeechSample',
            'version': '1.0.00000'
        }
    }
    payload = {'context': context}

    # assemble the header for the speech.config message
    msg = 'Path: speech.config\r\n'
    msg += 'Content-Type: application/json; charset=utf-8\r\n'
    msg += 'X-Timestamp: ' + utils.generate_timestamp() + '\r\n'

    # append the body of the message
    msg += '\r\n' + json.dumps(payload, indent=2)

    # DEBUG PRINT
    # print('>>', msg)

    await self.ws.send(msg)
async def send_audio_msg(self, audio_file_path):
    # open the binary audio file
    with open(audio_file_path, 'rb') as f_audio:
        num_chunks = 0
        while True:
            # read the audio file in small consecutive chunks
            audio_chunk = f_audio.read(self.chunk_size)
            if not audio_chunk:
                break
            num_chunks += 1

            # assemble the header for the binary audio message
            msg = b'Path: audio\r\n'
            msg += b'Content-Type: audio/x-wav\r\n'
            msg += b'X-RequestId: ' + bytearray(self.request_id, 'ascii') + b'\r\n'
            msg += b'X-Timestamp: ' + bytearray(utils.generate_timestamp(), 'ascii') + b'\r\n'

            # prepend the length of the header in 2-byte big-endian format
            msg = len(msg).to_bytes(2, byteorder='big') + msg

            # append the body of the message
            msg += b'\r\n' + audio_chunk

            # DEBUG PRINT
            # print('>>', msg)
            # sys.stdout.flush()

            try:
                await self.ws.send(msg)
                # DEBUG CONCURRENCY
                # await asyncio.sleep(0.1)
            except websockets.exceptions.ConnectionClosed as e:
                print('Connection closed: {0}'.format(e))
                return
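# Sketch of how a receiver could unpack one of the binary frames assembled
# above (an illustration of the framing only, not part of the original
# client): the first two bytes carry the big-endian header length, the header
# text follows, and a blank line (CRLF) separates it from the raw audio chunk.
def parse_audio_frame(frame: bytes):
    header_len = int.from_bytes(frame[:2], byteorder='big')
    header = frame[2:2 + header_len].decode('ascii')
    audio_chunk = frame[2 + header_len + 2:]  # skip the b'\r\n' separator
    return header, audio_chunk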
def recordTelemetry(self, response_path):
    if response_path not in [
            next(iter(msg.keys())) for msg in self.received_messages
    ]:
        self.received_messages.append(
            {response_path: utils.generate_timestamp()})
    else:
        for i, msg in enumerate(self.received_messages):
            if next(iter(msg.keys())) == response_path:
                if not isinstance(msg[response_path], list):
                    self.received_messages[i][response_path] = [
                        msg[response_path]
                    ]
                self.received_messages[i][response_path].append(
                    utils.generate_timestamp())
                break
def purge(db, runid_and_flowcellid, mail_to):
    """purge bcl data from /mnt/seq/novogene
    """
    rundir = get_bcl_runfolder_for_runid(runid_and_flowcellid)
    if not os.path.exists(rundir):
        LOGGER.critical("Run directory '%s' does not exist.\n", rundir)
        return
    # sanity checks for the sequencing run
    assert os.path.exists(os.path.join(rundir, 'RunInfo.xml')), \
        "No RunInfo.xml found under {}".format(rundir)
    stat_info = os.stat(rundir)
    # check that the uid is novogene (925)
    assert stat_info.st_uid == 925, (
        "The run {} does not belong to the Novogene user".format(rundir))
    try:
        start_time = generate_timestamp()
        res = db.update_one(
            {"run": runid_and_flowcellid},
            {"$set": {
                "raw-delete": {
                    "start_time": start_time,
                    "Status": "STARTED",
                }
            }})
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        # FIXME for production release
        #shutil.rmtree(rundir)
        end_time = generate_timestamp()
        res = db.update_one(
            {"run": runid_and_flowcellid},
            {"$set": {
                "raw-delete.Status": "SUCCESS",
                "raw-delete.end_time": end_time
            }})
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        subject = "bcl deletion: {}".format(runid_and_flowcellid)
        body = "Bcl deletion completed successfully from {}".format(rundir)
        send_mail(subject, body, toaddr=mail_to)
    except OSError:
        LOGGER.critical("Error happened while deleting '%s'", rundir)
        res = db.update_one({"run": runid_and_flowcellid},
                            {"$unset": {"raw-delete": ""}})
        assert res.modified_count == 1, (
            "Modified {} documents instead of 1".format(res.modified_count))
        subject = "Error: bcl deletion {}".format(runid_and_flowcellid)
        body = "Error happened while deleting raw data under {}".format(rundir)
        send_mail(subject, body, toaddr=mail_to)
def __record_telemetry(self, response_path):
    # if a single message of a certain type, store the value directly
    if response_path not in [
            next(iter(msg.keys())) for msg in self.received_messages
    ]:
        self.received_messages.append(
            {response_path: utils.generate_timestamp()})
    # if multiple messages of a certain type, store the values in a list
    else:
        for i, msg in enumerate(self.received_messages):
            if next(iter(msg.keys())) == response_path:
                if not isinstance(msg[response_path], list):
                    self.received_messages[i][response_path] = [
                        msg[response_path]
                    ]
                self.received_messages[i][response_path].append(
                    utils.generate_timestamp())
                break
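# For illustration (assumed message flow, not from the original source):
# after two "speech.hypothesis" responses and one "speech.phrase" response,
# received_messages would look like
#   [{'speech.hypothesis': [<ts1>, <ts2>]}, {'speech.phrase': <ts3>}]
# i.e. a repeated path is promoted from a single timestamp to a list of
# timestamps, while a path seen once keeps its bare timestamp.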
def relative_isoformat_time(last_analysis):
    """Return the approximate number of days (months counted as 30 days)
    between now and `last_analysis`, an ISO-8601 timestamp assumed to be
    in UTC+08:00
    """
    analysis_epoch_time = isoformat_to_epoch_time(last_analysis + "+08:00")
    epoch_time_now = isoformat_to_epoch_time(generate_timestamp() + "+08:00")
    rd = relative_epoch_time(epoch_time_now, analysis_epoch_time)
    relative_days = rd.months * 30 + rd.days
    return relative_days
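# Hedged sketch of the two helpers assumed above (illustrative stand-ins,
# not the original implementations; assumes the input is a standard
# ISO-8601 string with offset): parse such a string into epoch seconds, and
# express the difference between two epoch times as a dateutil relativedelta.
from datetime import datetime, timezone
from dateutil import parser as dateutil_parser
from dateutil.relativedelta import relativedelta

def isoformat_to_epoch_time(iso_time):
    # aware datetime -> POSIX epoch seconds
    return dateutil_parser.parse(iso_time).timestamp()

def relative_epoch_time(epoch_time1, epoch_time2):
    # relativedelta between two epoch times, largest units first
    return relativedelta(
        datetime.fromtimestamp(epoch_time1, tz=timezone.utc),
        datetime.fromtimestamp(epoch_time2, tz=timezone.utc))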
async def connect_to_speech_api(self, language, response_format,
                                recognition_mode):
    self.language = language
    self.response_format = response_format
    self.recognition_mode = recognition_mode

    # determine the endpoint based on the selected recognition mode
    endpoint = self.__get_cur_endpoint()
    if endpoint is None:
        print('Error: invalid recognition mode.')
        return

    # assemble the URL and the headers for the connection request
    url = endpoint + '?language={0}&format={1}'.format(
        self.language, self.response_format)
    headers = {
        'Authorization': 'Bearer ' + self.auth_token,
        'X-ConnectionId': self.connection_id
    }

    # record the Connection metric telemetry data
    self.metrics.append({
        'Name': 'Connection',
        'Id': self.connection_id,
        'Start': utils.generate_timestamp()
    })

    try:
        # request a WebSocket connection to the speech API
        print(endpoint)
        print(headers)
        self.ws = await websockets.client.connect(url, extra_headers=headers)
    except websockets.exceptions.InvalidHandshake as err:
        print('Handshake error: {0}'.format(err))
        return
    # TODO: add Connection failure telemetry for error cases

    # record the Connection metric telemetry data
    self.metrics[-1]['End'] = utils.generate_timestamp()

    # send the speech.config message
    await self.send_speech_config_msg()
async def connectAPI(self):
    endpoint = endpoints_ws[self.recognition_mode]
    url = endpoint
    headers = {
        'Authorization': 'Bearer ' + self.auth_token,
        'X-ConnectionId': self.connection_id
    }
    self.metrics.append({
        'Name': 'Connection',
        'Id': self.connection_id,
        'Start': utils.generate_timestamp()
    })
    try:
        self.ws = await websockets.client.connect(url, extra_headers=headers)
    except websockets.exceptions.InvalidHandshake as err:
        print('Handshake error: {0}'.format(err))
        return
    self.metrics[-1]['End'] = utils.generate_timestamp()
    await self.sendSpeechConfig()
def write(self, dirname, dbid, timestamp=None):
    """Write starter flag file
    """
    if not timestamp:
        timestamp = generate_timestamp()
    self.timestamp = timestamp
    self.dbid = dbid
    self.filename = os.path.join(
        dirname, self.pattern.format(timestamp=self.timestamp))
    assert not os.path.exists(self.filename), (
        "StartFlag {} already exists".format(self.filename))
    with open(self.filename, 'w') as fh:
        fh.write(dbid)
def get_downstream_outdir(requestor, pipeline_name, pipeline_version=None):
    """generate downstream output directory
    """
    if is_devel_version():
        basedir = site_cfg['downstream_outdir_base']['devel']
    else:
        basedir = site_cfg['downstream_outdir_base']['production']
    if pipeline_version:
        pversion = pipeline_version
    else:
        pversion = get_pipeline_version(nospace=True)
    outdir = DOWNSTREAM_OUTDIR_TEMPLATE.format(
        basedir=basedir,
        user=requestor,
        pipelineversion=pversion,
        pipelinename=pipeline_name,
        timestamp=generate_timestamp())
    return outdir
async def sendTelemetry(self, is_first_turn=False):
    payload = {'ReceivedMessages': self.received_messages}
    if is_first_turn:
        payload['Metrics'] = self.metrics
    msg = 'Path: telemetry\r\n'
    msg += 'Content-Type: application/json; charset=utf-8\r\n'
    msg += 'X-RequestId: ' + self.request_id + '\r\n'
    msg += 'X-Timestamp: ' + utils.generate_timestamp() + '\r\n'
    msg += '\r\n' + json.dumps(payload, indent=2)
    try:
        await self.ws.send(msg)
    except websockets.exceptions.ConnectionClosed as e:
        print('Connection closed: {0}'.format(e))
        return
def bundle_and_clean_logs(pipeline_outdir, result_outdir="out/",
                          log_dir="logs/", overwrite=False):
    """bundle log files in pipeline_outdir+result_outdir and
    pipeline_outdir+log_dir to pipeline_outdir+logs.tar.gz and remove

    See http://stackoverflow.com/questions/40602894/access-to-log-files
    for potential alternatives
    """
    for d in [
            pipeline_outdir,
            os.path.join(pipeline_outdir, result_outdir),
            os.path.join(pipeline_outdir, log_dir)
    ]:
        if not os.path.exists(d):
            logger.warning("Missing directory %s. Skipping log bundling.", d)
            return

    bundle = os.path.join(log_dir, "logs.tar.gz")  # relative to pipeline_outdir
    if not overwrite and os.path.exists(os.path.join(pipeline_outdir, bundle)):
        bundle = os.path.join(log_dir,
                              "logs.{}.tar.gz".format(generate_timestamp()))
        assert not os.path.exists(os.path.join(pipeline_outdir, bundle))

    orig_dir = os.getcwd()
    os.chdir(pipeline_outdir)
    # all log files associated with output files
    logfiles = glob.glob(os.path.join(result_outdir, "**/*.log"),
                         recursive=True)
    # (cluster) log directory
    logfiles.extend(glob.glob(os.path.join(log_dir, "*")))
    # paranoid cleaning and some exclusion
    logfiles = [
        f for f in logfiles
        if os.path.isfile(f) and not f.endswith("snakemake.log")
    ]

    with tarfile.open(bundle, "w:gz") as tarfh:
        for f in logfiles:
            tarfh.add(f)
            os.unlink(f)

    os.chdir(orig_dir)
def __init__(
        self,
        script_name,  # used as logging prefix. can be dummy
        pipeline_name,
        pipeline_version,
        submitter,
        site,
        instance_id,
        log_path,  # main logging file
        elm_units):
    """FIXME:add-doc"""
    assert isinstance(elm_units, list)

    elmlogdir = os.getenv('RPD_ELMLOGDIR')
    assert elmlogdir, ("RPD_ELMLOGDIR undefined")

    pipelogdir = os.path.join(elmlogdir, pipeline_name)
    assert os.path.exists(pipelogdir), (
        "pipeline log dir {} doesn't exist".format(pipelogdir))

    # timestamp just a way to make it unique
    logfile = os.path.join(pipelogdir, generate_timestamp() + ".log")
    assert not os.path.exists(logfile)
    self.logfile = logfile

    # only used as logging prefix (not even parsed by ELM)
    self.script_name = script_name

    # json-like values
    #self.fields = OrderedDict()
    self.fields = dict()
    # caller provided
    self.fields['pipeline_name'] = pipeline_name
    self.fields['pipeline_version'] = pipeline_version
    self.fields['site'] = site
    self.fields['instance_id'] = instance_id
    self.fields['submitter'] = submitter
    self.fields['log_path'] = log_path
    # internally computed
    self.fields['status_id'] = None

    self.elm_units = elm_units
def get_bcl2fastq_outdir(runid_and_flowcellid):
    """where to write bcl2fastq output to
    """
    if is_devel_version():
        basedir = site_cfg['bcl2fastq_outdir_base']['devel']
    else:
        basedir = site_cfg['bcl2fastq_outdir_base']['production']
    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)
    outdir = "{basedir}/{mid}/{rid}_{fid}/bcl2fastq_{ts}".format(
        basedir=basedir,
        mid=machineid,
        rid=runid,
        fid=flowcellid,
        ts=generate_timestamp())
    return outdir
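# For illustration only (all values below are made up; the exact
# generate_timestamp() format is defined elsewhere): with basedir "/seq/out",
# machine ID "HS004", run ID "R00139" and flowcell ID "BC6A7HANXX", the
# resulting outdir would look like
#   /seq/out/HS004/R00139_BC6A7HANXX/bcl2fastq_<timestamp>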
def update_run_status(mongo_status_script, run_num, outdir, status, testing):
    """Update run status in the mongoDB
    """
    logger.info("Setting analysis for %s to %s", run_num, status)
    analysis_id = generate_timestamp()
    mongo_update_cmd = [mongo_status_script, "-r", run_num, "-s", status]
    mongo_update_cmd.extend(["-a", analysis_id, "-o", outdir])
    if testing:
        mongo_update_cmd.append("-t")
    try:
        _ = subprocess.check_output(mongo_update_cmd,
                                    stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code %s: %s",
                     e.returncode, ' '.join(mongo_update_cmd))
        logger.fatal("Output: %s", e.output.decode())
        logger.fatal("Exiting")
        sys.exit(1)
    flagfile = os.path.join(outdir, "SEQRUNFAILED")
    logger.info("Creating flag file %s", flagfile)
    with open(flagfile, 'w') as _:
        pass
async def sendAudio(self, audio_file_path):
    with open(audio_file_path, 'rb') as f_audio:
        num_chunks = 0
        while True:
            audio_chunk = f_audio.read(self.chunk_size)
            if not audio_chunk:
                break
            num_chunks += 1
            msg = b'Path: audio\r\n'
            msg += b'Content-Type: audio/x-wav\r\n'
            msg += b'X-RequestId: ' + bytearray(self.request_id, 'ascii') + b'\r\n'
            msg += b'X-Timestamp: ' + bytearray(utils.generate_timestamp(), 'ascii') + b'\r\n'
            msg = len(msg).to_bytes(2, byteorder='big') + msg
            msg += b'\r\n' + audio_chunk
            try:
                await self.ws.send(msg)
            except websockets.exceptions.ConnectionClosed as e:
                print('Connection closed: {0}'.format(e))
                return
async def send_telemetry_msg(self, is_first_turn=False):
    # assemble the payload for the telemetry message
    payload = {'ReceivedMessages': self.received_messages}
    if is_first_turn:
        payload['Metrics'] = self.metrics

    # assemble the header for the telemetry message
    msg = 'Path: telemetry\r\n'
    msg += 'Content-Type: application/json; charset=utf-8\r\n'
    msg += 'X-RequestId: ' + self.request_id + '\r\n'
    msg += 'X-Timestamp: ' + utils.generate_timestamp() + '\r\n'

    # append the body of the message
    msg += '\r\n' + json.dumps(payload, indent=2)

    # DEBUG PRINT
    # print('>>', msg)
    # sys.stdout.flush()

    try:
        await self.ws.send(msg)
    except websockets.exceptions.ConnectionClosed as e:
        print('Connection closed: {0}'.format(e))
        return
def runs_from_db(db, days=75, win=34):
    """Yield runs from the pipeline_run collection whose last analysis
    finished successfully more than `days` days ago, looking at runs
    recorded within the last `win` days
    """
    epoch_present, epoch_back = generate_window(win)
    results = db.find({
        "run": {
            "$regex": "^NG00"
        },
        "raw-delete": {
            "$exists": False
        },
        "timestamp": {
            "$gt": epoch_back,
            "$lt": epoch_present
        }
    })
    LOGGER.info("Found %d runs for last %s days", results.count(), win)
    for record in results:
        LOGGER.debug("record: %s", record)
        if not record.get('run'):
            LOGGER.critical("run is missing for DB-id %s", record['_id'])
            continue
        runid_and_flowcellid = record['run']
        if not 'analysis' in record:
            continue
        last_analysis = record['analysis'][-1]
        status = last_analysis.get("Status")
        end_time = last_analysis.get("end_time")
        if not status or not end_time:
            continue
        analysis_epoch_time = isoformat_to_epoch_time(end_time + "+08:00")
        epoch_time_now = isoformat_to_epoch_time(generate_timestamp() + "+08:00")
        rd = relative_epoch_time(epoch_time_now, analysis_epoch_time)
        relative_days = rd.months * 30 + rd.days
        if status == 'SUCCESS' and relative_days > days:
            yield runid_and_flowcellid
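# Minimal driver sketch (assumption, not from the original source): wire the
# generator to the purge() function shown earlier in this collection; `db` is
# the same collection handle and the mail address is a placeholder.
for runid_and_flowcellid in runs_from_db(db, days=75, win=34):
    purge(db, runid_and_flowcellid, mail_to="ops@example.com")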
async def sendSpeechConfig(self):
    context = {
        'system': {
            'version': '5.4'
        },
        'os': {
            'platform': platform.system(),
            'name': platform.system() + ' ' + platform.version(),
            'version': platform.version()
        },
        'device': {
            'manufacturer': 'SpeechSample',
            'model': 'SpeechSample',
            'version': '1.0.00000'
        }
    }
    payload = {'context': context}
    msg = 'Path: speech.config\r\n'
    msg += 'Content-Type: application/json; charset=utf-8\r\n'
    msg += 'X-Timestamp: ' + utils.generate_timestamp() + '\r\n'
    msg += '\r\n' + json.dumps(payload, indent=2)
    await self.ws.send(msg)
parser.add_argument("-m", "--model", help="path to word2vec/model/timestamp.model") parser.add_argument("-p", "--project", help="path to validation_set.tsv and submissions") args = parser.parse_args() # Load model model = LsiModel.load(args.model, mmap='r') # Load validation set and advance 1 line validation_set = open("%s/validation_set.tsv" % args.project) validation_set.readline() output = open("%s/%s_submission.csv" % (args.project, generate_timestamp()), "w") output.write("id,correctAnswer\n") for line in validation_set: elements = line.split("\t") question_id = elements.pop(0) # Get bag-of-words representation of question and answers doc_vectors = [ model.id2word.doc2bow(element.split()) for element in elements ] question = doc_vectors.pop(0) # Generate list of tuples: # (Cosine similarity, mapped index 0-3 to A-D)
def mark_as_completed():
    """Dropping a flag file marking analysis as complete"""
    analysis_dir = os.getcwd()
    flag_file = os.path.join(analysis_dir, WORKFLOW_COMPLETION_FLAGFILE)
    with open(flag_file, 'a') as fh:
        fh.write("{}\n".format(generate_timestamp()))
def run_inference_engine(self,
                         model_name: str,
                         model_dir: str,
                         to_csv: bool = False,
                         output_dir: str = None,
                         load_from_s3: bool = False,
                         creds: Dict = None) -> pd.DataFrame:
    """Conducts inference using the test set.

    Arguments:
        model_name {str} -- Name of the trained model.
        model_dir {str} -- Path to where the model is stored.

    Keyword Arguments:
        to_csv {bool} -- Save to csv file (default: {False})
        output_dir {str} -- Path to output directory (default: {None})
        load_from_s3 {bool} -- Load trained model from s3 bucket (default: {False})
        creds {Dict} -- Dictionary containing AWS credentials. Requires
            aws_access_key_id, aws_secret_access_key, bucket. (default: {None})
            E.g.
            CREDENTIALS = {}
            CREDENTIALS['aws_access_key_id'] = os.environ.get("aws_access_key_id")
            CREDENTIALS['aws_secret_access_key'] = os.environ.get("aws_secret_access_key")
            CREDENTIALS['bucket'] = os.environ.get("bucket")

    Returns:
        submission_df {pd.DataFrame} -- A predictions dataframe ready for
            submission to the public leaderboard.
    """

    def _conduct_inference() -> defaultdict:
        predictions = defaultdict(list)
        testing_loaders = self._get_all_testing_loaders()
        for loader in testing_loaders:
            for batch, data in enumerate(loader):
                image = self.trainer._load_to_gpu_float(data["image"])
                grapheme, vowel, consonant = self.trainer.model(image)
                for idx, img_id in enumerate(data["image_id"]):
                    predictions["grapheme"].append(
                        grapheme[idx].cpu().detach().numpy())
                    predictions["vowel"].append(
                        vowel[idx].cpu().detach().numpy())
                    predictions["consonant"].append(
                        consonant[idx].cpu().detach().numpy())
                    predictions["image_id"].append(img_id)
        return predictions

    def _get_maximum_probs(preds: defaultdict) -> Dict:
        return {
            "final_grapheme": np.argmax(np.mean(preds["grapheme"], axis=0),
                                        axis=1),
            "final_vowel": np.argmax(np.mean(preds["vowel"], axis=0), axis=1),
            "final_consonant": np.argmax(np.mean(preds["consonant"], axis=0),
                                         axis=1),
            "image_ids": preds["image_id"]
        }

    def _create_submission_df(pred_dict: Dict) -> pd.DataFrame:
        predictions = []
        for idx, image_id in enumerate(pred_dict["image_ids"]):
            predictions.append((f"{image_id}_grapheme_root",
                                pred_dict["final_grapheme"][idx]))
            predictions.append((f"{image_id}_vowel_diacritic",
                                pred_dict["final_vowel"][idx]))
            predictions.append((f"{image_id}_consonant_diacritic",
                                pred_dict["final_consonant"][idx]))
        return pd.DataFrame(predictions, columns=["row_id", "target"])

    final_predictions = defaultdict(list)
    for idx in range(1, self.params["test_loops"]):
        LOGGER.info(f'Conducting inference for fold {idx}')
        model_name_path = f'{model_name}_bengali_fold{idx}.pth'
        model_state_path = f'{model_dir}/{model_name_path}'
        if load_from_s3:
            self.trainer.load_model_from_s3(filename=model_state_path,
                                            key=model_name_path,
                                            creds=creds)
        self.trainer.load_model_locally(model_path=model_state_path)
        self.trainer.model.to(self.trainer.device)
        self.trainer.model.eval()
        predictions = _conduct_inference()
        final_predictions["grapheme"].append(predictions["grapheme"])
        final_predictions["vowel"].append(predictions["vowel"])
        final_predictions["consonant"].append(predictions["consonant"])
        if idx == 1:
            final_predictions["image_id"].extend(predictions["image_id"])

    pred_dictionary = _get_maximum_probs(preds=final_predictions)
    submission_df = _create_submission_df(pred_dict=pred_dictionary)
    if to_csv:
        timestamp = utils.generate_timestamp()
        output_path = f"{output_dir}/submission_{timestamp}"
        LOGGER.info(f'Saving submission dataframe to {output_path}')
        submission_df.to_csv(output_path, index=False)
    return submission_df
"""
Use LSA to extract latent vectors
"""
import argparse
import bz2
import logging

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LsiModel

from utils import generate_timestamp

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)
print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)
# same as folder name. also used for cluster job names
PIPELINE_NAME = "Mapping"

#CONFIG
CONFIG = "/home/userrig/Solexa/bcl2fastq2-v2.17/"
CONFIG += "generateBCL2FASTQ2.17config.sh"
#SAMPLESHEET
SAMPLESHEET = "/home/userrig/Solexa/bcl2fastq2-v2.17/"
SAMPLESHEET += "generateBCL2FASTQ2.17SampleSheet.sh"
#BWA mapping pipeline
BWA = "/home/userrig/pipelines/NewBwaMappingPipelineMem/"
BWA += "generateBwa0.7.5aconfigurationV217V2.sh"
#RNA mapping pipeline
RNA = "/home/userrig/pipelines/NewRNAseqTophatCufflinksPipeline/"
RNA += "generateTophatCufflinksconfigurationV217V2.sh"

#ANALYSIS_ID
analysis_id = generate_timestamp()

# global logger
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('[{asctime}] {levelname:8s} {filename} {message}',
                      style='{'))
logger.addHandler(handler)


def main():
    """main function"""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-1', "--break-after-first", action='store_true',
def main(): """main function""" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( '-r', "--runid", help="Run ID plus flowcell ID", required=True, ) parser.add_argument( '-s', "--status", help="Analysis status", required=True, choices=['STARTED', 'SUCCESS', 'FAILED', 'SEQRUNFAILED', 'NON-BCL']) parser.add_argument('-a', "--analysis-id", help="Analysis id", required=True) parser.add_argument('-o', "--out", help="Analysis output directory") parser.add_argument('-t', "--test-server", action='store_true') parser.add_argument('-n', "--dry-run", action='store_true', help="Dry run") parser.add_argument('-v', '--verbose', action='count', default=0, help="Increase verbosity") parser.add_argument('-q', '--quiet', action='count', default=0, help="Decrease verbosity") args = parser.parse_args() # Repeateable -v and -q for setting logging level. # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/ # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4 # script -vv -> DEBUG # script -v -> INFO # script -> WARNING # script -q -> ERROR # script -qq -> CRITICAL # script -qqq -> no logging at all logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose) if not is_production_user(): logger.warning("Not a production user. Skipping MongoDB update") sys.exit(1) user_name = "userrig" run_number = args.runid connection = mongodb_conn(args.test_server) if connection is None: sys.exit(1) logger.info("Database connection established") db = connection.gisds.runcomplete logger.debug("DB %s", db) logger.info("Status for %s is %s", run_number, args.status) if args.status in ["STARTED", "SEQRUNFAILED"]: try: if not args.dry_run: res = db.update_one({"run": run_number}, { "$push": { "analysis": { "analysis_id": args.analysis_id, "user_name": user_name, "out_dir": args.out, "Status": args.status, } } }) assert res.modified_count == 1, ( "Modified {} documents instead of 1".format( res.modified_count)) except (pymongo.errors.OperationFailure, AssertionError) as e: logger.fatal( "MongoDB update failure while setting run %s analysis_id %s to %s", run_number, args.analysis_id, args.status) sys.exit(1) elif args.status in ["SUCCESS", "FAILED"]: end_time = generate_timestamp() logger.info("Setting timestamp to %s", end_time) try: if not args.dry_run: res = db.update_one( { "run": run_number, 'analysis.analysis_id': args.analysis_id }, { "$set": { "analysis.$": { "analysis_id": args.analysis_id, "end_time": end_time, "user_name": user_name, "out_dir": args.out, "Status": args.status, } } }) assert res.modified_count == 1, ( "Modified {} documents instead of 1".format( res.modified_count)) except (pymongo.errors.OperationFailure, AssertionError) as e: logger.fatal( "MongoDB update failure while setting run %s analysis_id %s to %s", run_number, args.analysis_id, args.status) sys.exit(1) else: raise ValueError(args.status) # close the connection to MongoDB connection.close()
def start_data_transfer(connection, mux, mux_info, site, mail_to):
    """Data transfer from source to destination
    """
    run_number, downstream_id, analysis_id, bcl_path = mux_info
    fastq_src = os.path.join(bcl_path, "out", "Project_" + mux)
    bcl_dir = os.path.basename(bcl_path)
    if is_devel_version():
        fastq_dest = os.path.join(novogene_conf['FASTQ_DEST'][site]['devel'],
                                  mux, run_number, bcl_dir)
        yaml_dest = os.path.join(novogene_conf['FASTQ_DEST'][site]['devel'],
                                 mux, mux + "_multisample.yaml")
    else:
        fastq_dest = os.path.join(novogene_conf['FASTQ_DEST'][site]['production'],
                                  mux, run_number, bcl_dir)
        yaml_dest = os.path.join(novogene_conf['FASTQ_DEST'][site]['production'],
                                 mux, mux + "_multisample.yaml")
    rsync_cmd = 'rsync -va %s %s' % (fastq_src, fastq_dest)
    if not os.path.exists(fastq_dest):
        try:
            os.makedirs(fastq_dest)
            logger.info("data transfer started for %s from %s", mux, run_number)
            st_time = generate_timestamp()
            update_downstream_mux(connection, run_number, analysis_id,
                                  downstream_id, "COPYING_" + st_time)
            _ = subprocess.check_output(rsync_cmd, shell=True,
                                        stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            body = "The following command failed with return code {}: {}".format(
                e.returncode, rsync_cmd)
            subject = "{} from {}: SG10K data transfer ({}) failed".format(
                mux, run_number, site)
            logger.fatal(body)
            logger.fatal("Output: %s", e.output.decode())
            logger.fatal("Exiting")
            # send mail
            send_mail(subject, body, toaddr=mail_to, ccaddr=None)
            # mark the partially rsynced data as ERROR
            update_downstream_mux(connection, run_number, analysis_id,
                                  downstream_id, "ERROR")
            sys.exit(1)
        # update the mongoDB for successful data transfer
        sample_info = get_mux_details(run_number, mux, fastq_dest)
        # touch rsync complete file
        with open(os.path.join(fastq_dest, "rsync_complete.txt"), "w") as f:
            f.write("")
        with open(yaml_dest, 'w') as fh:
            yaml.dump(dict(sample_info), fh, default_flow_style=False)
        job = {}
        job['sample_cfg'] = {}
        for outer_key, outer_value in sample_info.items():
            ctime, _ = generate_window(1)
            job['sample_cfg'].update({outer_key: outer_value})
        job['site'] = site
        job['pipeline_name'] = 'custom/SG10K'
        job['pipeline_version'] = novogene_conf['PIPELINE_VERSION']
        job['ctime'] = ctime
        job['requestor'] = 'userrig'
        if is_devel_version():
            novogene_outdir = os.path.join(
                novogene_conf['NOVOGENE_OUTDIR'][site]['devel'], mux)
        else:
            novogene_outdir = os.path.join(
                novogene_conf['NOVOGENE_OUTDIR'][site]['production'], mux)
        job['out_dir_override'] = novogene_outdir
        logger.info("Data transfer completed successfully for %s from %s",
                    mux, run_number)
        job_id = insert_muxjob(connection, mux, job)
        update_downstream_mux(connection, run_number, analysis_id,
                              downstream_id, job_id)
        subject = "{} from {}: SG10K data transfer ({}) completed".format(
            mux, run_number, site)
        body = "Data transfer successfully completed for {} from {}".format(
            mux, run_number)
        send_mail(subject, body, toaddr=mail_to, ccaddr=None)
        return True
    else:
        logger.critical("Mux %s from %s directory already exists under %s",
                        mux, run_number, fastq_dest)
        return False
""" Generate csv submission for Kaggle contest """ import argparse import logging from gensim.models import Word2Vec from utils import extract_elements, choose_answer, preprocess_for_model, generate_timestamp logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) timestamp = generate_timestamp() parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", help="path to word2vec/model/timestamp.model") parser.add_argument("-p", "--project", help="path to validation_set.tsv and submissions") args = parser.parse_args() # Load model model = Word2Vec.load(args.model, mmap='r') # Load validation set and advance 1 line validation_set = open("%s/validation_set.tsv" % args.project) validation_set.readline() output = open("%s/%s_submission.csv" % (args.project, timestamp), "w")
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model",
                    help="path to word2vec/model/timestamp.model")
parser.add_argument("-p", "--project",
                    help="path to validation_set.tsv and submissions")
args = parser.parse_args()

# Load model
model = LsiModel.load(args.model, mmap='r')

# Load validation set and advance 1 line
validation_set = open("%s/validation_set.tsv" % args.project)
validation_set.readline()

output = open("%s/%s_submission.csv" % (args.project, generate_timestamp()),
              "w")
output.write("id,correctAnswer\n")

for line in validation_set:
    elements = line.split("\t")
    question_id = elements.pop(0)

    # Get bag-of-words representation of question and answers
    doc_vectors = [model.id2word.doc2bow(element.split())
                   for element in elements]
    question = doc_vectors.pop(0)

    # Generate list of tuples:
    # (Cosine similarity, mapped index 0-3 to A-D)
    similarities = [(cossim(model[question], model[answer]), chr(idx + 65))
                    for idx, answer in enumerate(doc_vectors)]
    chosen_answer = max(similarities)[1]