def toks_to_data_dict(self, toks):
    out = {}
    if len(toks) < len(self.columns):
        err_msg = 'Too few columns. Received %s. Expected %s' \
                  % (len(toks), len(self.columns))
        raise BadFormatError(err_msg)
    for col_index, col_def in self.columns.items():
        col_name = col_def['name']
        col_type = col_def['type']
        tok = toks[col_index]
        if tok == '':
            out[col_name] = None
        elif col_type == 'string':
            out[col_name] = tok
        elif col_type == 'int':
            out[col_name] = int(tok)
        elif col_type == 'float':
            out[col_name] = float(tok)
        else:
            out[col_name] = tok
    return out
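# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the converter): the standalone function
# below mirrors the column-typing logic of toks_to_data_dict() above and makes
# the assumed shape of self.columns explicit -- a dict keyed by token index
# whose values carry a 'name' and a 'type'. All names here are hypothetical.
# ---------------------------------------------------------------------------
def _toks_to_row_sketch(toks, columns):
    """Map raw string tokens to typed values keyed by column name."""
    if len(toks) < len(columns):
        raise ValueError('Too few columns. Received %s. Expected %s'
                         % (len(toks), len(columns)))
    casters = {'string': str, 'int': int, 'float': float}
    out = {}
    for col_index, col_def in columns.items():
        tok = toks[col_index]
        # Empty tokens become None; unknown types fall through as raw strings.
        out[col_def['name']] = (None if tok == ''
                                else casters.get(col_def['type'], str)(tok))
    return out

# Example (hypothetical column layout for a tab-delimited input line):
#     demo_columns = {0: {'name': 'chrom', 'type': 'string'},
#                     1: {'name': 'pos', 'type': 'int'},
#                     2: {'name': 'score', 'type': 'float'}}
#     _toks_to_row_sketch('chr1\t12345\t0.87'.split('\t'), demo_columns)
#     # -> {'chrom': 'chr1', 'pos': 12345, 'score': 0.87}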
def run(self):
    """ Convert input file to a .crv file using the primary converter."""
    try:
        self.setup()
        start_time = time.time()
        self.logger.info('Conversion start: %s' %
                         time.asctime(time.localtime(start_time)))
        self.primary_converter.setup(self.f)
        self.f.seek(0)
        read_lnum = 0
        write_lnum = 0
        num_errors = 0
        for l in self.f:
            read_lnum += 1
            try:
                # all_wdicts is a list, since one input line can become
                # multiple output lines
                all_wdicts = self.primary_converter.convert_line(l)
                if all_wdicts is None:
                    continue
            except Exception as e:
                num_errors += 1
                self._log_conversion_error(read_lnum, e)
                continue
            if all_wdicts:
                UIDMap = []
                for wdict in all_wdicts:
                    if wdict['ref_base'] == '' \
                            and wdict['alt_base'] not in ['A', 'T', 'C', 'G']:
                        num_errors += 1
                        e = BadFormatError(
                            'Reference base required for non SNV')
                        self._log_conversion_error(read_lnum, e)
                        continue
                    if self.do_liftover:
                        prelift_wdict = copy.copy(wdict)
                        try:
                            wdict['chrom'], wdict['pos'] = self.liftover(
                                wdict['chrom'], wdict['pos'])
                        except LiftoverFailure as e:
                            num_errors += 1
                            self._log_conversion_error(read_lnum, e)
                            continue
                    unique, UID = self.vtracker.addVar(
                        wdict['chrom'], int(wdict['pos']),
                        wdict['ref_base'], wdict['alt_base'])
                    wdict['uid'] = UID
                    if unique:
                        write_lnum += 1
                        self.crv_writer.write_data(wdict)
                        if self.do_liftover:
                            prelift_wdict['uid'] = UID
                            self.crl_writer.write_data(prelift_wdict)
                    if UID not in UIDMap:
                        # For this input line, only write to the .crm if the
                        # UID has not yet been written to the map file.
                        self.crm_writer.write_data({
                            'original_line': read_lnum,
                            'tags': wdict['tags'],
                            'uid': UID
                        })
                        UIDMap.append(UID)
                    self.crs_writer.write_data(wdict)
        end_time = time.time()
        self.logger.info('Conversion end: %s' %
                         time.asctime(time.localtime(end_time)))
        self.logger.info('Read lines: %d' % read_lnum)
        self.logger.info('Error lines: %d' % num_errors)
        self.logger.info('Wrote lines: %d' % write_lnum)
        runtime = round(end_time - start_time, 3)
        self.logger.info('Conversion runtime: %s' % runtime)
        self._close_files()
    except Exception as e:
        self.__handle_exception(e)
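# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical; not the project's VariantTracker API
# beyond the (unique, UID) return shape used above): run() asks the tracker
# whether each (chrom, pos, ref, alt) has been seen before, writes the variant
# to the .crv only when it is new, and writes the line-to-UID mapping to the
# .crm at most once per UID per input line. The toy tracker below shows that
# unique/UID contract.
# ---------------------------------------------------------------------------
class _ToyVariantTracker:
    """Assign an increasing UID to each distinct (chrom, pos, ref, alt)."""

    def __init__(self):
        self._uids = {}
        self._next_uid = 1

    def addVar(self, chrom, pos, ref, alt):
        key = (chrom, pos, ref, alt)
        if key in self._uids:
            return False, self._uids[key]  # already seen: not unique
        self._uids[key] = self._next_uid
        self._next_uid += 1
        return True, self._uids[key]       # first occurrence: unique

# Example: the second occurrence of a variant reuses UID 1 and is skipped by
# the `if unique:` branch above.
#     tracker = _ToyVariantTracker()
#     tracker.addVar('chr1', 100, 'A', 'T')  # -> (True, 1)
#     tracker.addVar('chr1', 100, 'A', 'T')  # -> (False, 1)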
def run(self):
    """ Convert input file to a .crv file using the primary converter."""
    self.setup()
    start_time = time.time()
    self.status_writer.queue_status_update(
        "status",
        "Started {} ({})".format("Converter",
                                 self.primary_converter.format_name),
    )
    last_status_update_time = time.time()
    multiple_files = len(self.input_paths) > 1
    fileno = 0
    total_lnum = 0
    base_re = re.compile("^[ATGC]+|[-]+$")
    write_lnum = 0
    for fn in self.input_paths:
        if self.pipeinput:
            f = sys.stdin
            fname = STDIN
        else:
            f = self.open_input_file(fn)
            fname = f.name
        fileno += 1
        converter = self.primary_converter.__class__()
        self._set_converter_properties(converter)
        converter.setup(f)
        if not self.pipeinput:
            f.seek(0)
        read_lnum = 0
        num_errors = 0
        if self.pipeinput:
            cur_fname = STDIN
        else:
            cur_fname = os.path.basename(f.name)
        for l in f:
            samp_prefix = cur_fname
            read_lnum += 1
            try:
                # all_wdicts is a list, since one input line can become
                # multiple output lines. False is returned if the converter
                # decides the line is not an input line.
                all_wdicts = converter.convert_line(l)
                if all_wdicts is BaseConverter.IGNORE:
                    continue
                total_lnum += 1
                if all_wdicts:
                    UIDMap = []
                    no_unique_var = 0
                    for wdict_no in range(len(all_wdicts)):
                        wdict = all_wdicts[wdict_no]
                        chrom = wdict["chrom"]
                        pos = wdict["pos"]
                        if chrom is not None:
                            if not chrom.startswith("chr"):
                                chrom = "chr" + chrom
                            wdict["chrom"] = self.chromdict.get(chrom, chrom)
                            if multiple_files:
                                if wdict["sample_id"]:
                                    wdict["sample_id"] = "__".join(
                                        [samp_prefix, wdict["sample_id"]])
                                else:
                                    wdict["sample_id"] = samp_prefix
                            if "ref_base" not in wdict or wdict["ref_base"] == "":
                                wdict["ref_base"] = self.wgsreader.get_bases(
                                    chrom, int(wdict["pos"]))
                            else:
                                ref_base = wdict["ref_base"]
                                if ref_base == "" and wdict["alt_base"] not in [
                                        "A", "T", "C", "G"]:
                                    raise BadFormatError(
                                        "Reference base required for non SNV")
                                elif ref_base is None or ref_base == "":
                                    wdict["ref_base"] = self.wgsreader.get_bases(
                                        chrom, int(pos))
                            prelift_wdict = copy.copy(wdict)
                            if self.do_liftover:
                                (
                                    wdict["chrom"],
                                    wdict["pos"],
                                    wdict["ref_base"],
                                    wdict["alt_base"],
                                ) = self.liftover(
                                    wdict["chrom"],
                                    int(wdict["pos"]),
                                    wdict["ref_base"],
                                    wdict["alt_base"],
                                )
                            if base_re.fullmatch(wdict["ref_base"]) is None:
                                raise BadFormatError("Invalid reference base")
                            if base_re.fullmatch(wdict["alt_base"]) is None:
                                raise BadFormatError("Invalid alternate base")
                            p, r, a = (int(wdict["pos"]), wdict["ref_base"],
                                       wdict["alt_base"])
                            new_pos, new_ref, new_alt = \
                                self.standardize_pos_ref_alt("+", p, r, a)
                            wdict["pos"] = new_pos
                            wdict["ref_base"] = new_ref
                            wdict["alt_base"] = new_alt
                            unique, UID = self.vtracker.addVar(
                                wdict["chrom"], new_pos, new_ref, new_alt)
                            wdict["uid"] = UID
                            if wdict["ref_base"] == wdict["alt_base"]:
                                raise NoVariantError()
                            if unique:
                                write_lnum += 1
                                self.crv_writer.write_data(wdict)
                                # if self.do_liftover:
                                #     if wdict["pos"] != prelift_wdict["pos"] or wdict["ref_base"] != prelift_wdict["ref_base"] or wdict["alt_base"] != prelift_wdict["alt_base"]:
                                prelift_wdict["uid"] = UID
                                self.crl_writer.write_data(prelift_wdict)
                                # addl_operation errors shouldn't prevent the
                                # variant from being written.
                                try:
                                    converter.addl_operation_for_unique_variant(
                                        wdict, no_unique_var)
                                except Exception as e:
                                    self._log_conversion_error(read_lnum, l, e)
                                no_unique_var += 1
                            if UID not in UIDMap:
                                # For this input line, only write to the .crm
                                # if the UID has not yet been written to the
                                # map file.
                                self.crm_writer.write_data({
                                    "original_line": read_lnum,
                                    "tags": wdict["tags"],
                                    "uid": UID,
                                    "fileno": self.input_path_dict2[fname],
                                })
                                UIDMap.append(UID)
                            self.crs_writer.write_data(wdict)
                else:
                    raise ExpectedException(
                        "No valid alternate allele was found in any samples.")
            except Exception as e:
                num_errors += 1
                self._log_conversion_error(read_lnum, l, e)
                continue
        f.close()
        cur_time = time.time()
        if total_lnum % 10000 == 0 or cur_time - last_status_update_time > 3:
            self.status_writer.queue_status_update(
                "status",
                "Running {} ({}): line {}".format("Converter", cur_fname,
                                                  read_lnum),
            )
            last_status_update_time = cur_time
        self.logger.info("error lines: %d" % num_errors)
    self._close_files()
    self.end()
    if self.status_writer is not None:
        self.status_writer.queue_status_update("num_input_var", total_lnum)
        self.status_writer.queue_status_update("num_unique_var", write_lnum)
        self.status_writer.queue_status_update("num_error_input", num_errors)
    end_time = time.time()
    self.logger.info("finished: %s" % time.asctime(time.localtime(end_time)))
    runtime = round(end_time - start_time, 3)
    self.logger.info("num input lines: {}".format(total_lnum))
    self.logger.info("runtime: %s" % runtime)
    self.status_writer.queue_status_update(
        "status",
        "Finished {} ({})".format("Converter",
                                  self.primary_converter.format_name),
    )
    return total_lnum, self.primary_converter.format_name
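# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical; the project's standardize_pos_ref_alt()
# may differ in details): a common way to normalize ref/alt alleles before
# de-duplication is to trim bases shared by both -- first from the right, then
# from the left, advancing the position for each left-trimmed base -- and to
# represent an empty allele as '-' (the form accepted by base_re above).
# ---------------------------------------------------------------------------
def _trim_variant_sketch(pos, ref, alt):
    """Return (pos, ref, alt) with shared leading/trailing bases removed."""
    # Trim the common suffix.
    while ref and alt and ref[-1] == alt[-1]:
        ref, alt = ref[:-1], alt[:-1]
    # Trim the common prefix, moving the position forward.
    while ref and alt and ref[0] == alt[0]:
        ref, alt = ref[1:], alt[1:]
        pos += 1
    return pos, ref if ref else '-', alt if alt else '-'

# Example: a deletion written as pos=100, ref='CTT', alt='CT' normalizes to
#     _trim_variant_sketch(100, 'CTT', 'CT')  # -> (101, 'T', '-')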
def run(self):
    """ Convert input file to a .crv file using the primary converter."""
    self.setup()
    start_time = time.time()
    self.status_writer.queue_status_update(
        'status',
        'Started {} ({})'.format('Converter',
                                 self.primary_converter.format_name))
    last_status_update_time = time.time()
    multiple_files = len(self.input_files) > 1
    fileno = 0
    total_lnum = 0
    for f in self.input_files:
        fileno += 1
        self.primary_converter.setup(f)
        f.seek(0)
        read_lnum = 0
        write_lnum = 0
        num_errors = 0
        for l in f:
            cur_fname = os.path.basename(f.name)
            samp_prefix = cur_fname
            read_lnum += 1
            try:
                # all_wdicts is a list, since one input line can become
                # multiple output lines. False is returned if converter
                # decides line is not an input line.
                all_wdicts = self.primary_converter.convert_line(l)
                if all_wdicts is BaseConverter.IGNORE:
                    continue
                total_lnum += 1
            except Exception as e:
                num_errors += 1
                self._log_conversion_error(read_lnum, l, e)
                continue
            if all_wdicts:
                UIDMap = []
                no_unique_var = 0
                for wdict_no in range(len(all_wdicts)):
                    wdict = all_wdicts[wdict_no]
                    chrom = wdict['chrom']
                    if chrom is not None:
                        if not chrom.startswith('chr'):
                            chrom = 'chr' + chrom
                        wdict['chrom'] = self.chromdict.get(chrom, chrom)
                    if multiple_files:
                        if wdict['sample_id']:
                            wdict['sample_id'] = '__'.join(
                                [samp_prefix, wdict['sample_id']])
                        else:
                            wdict['sample_id'] = samp_prefix
                    if wdict['ref_base'] == '' \
                            and wdict['alt_base'] not in ['A', 'T', 'C', 'G']:
                        num_errors += 1
                        e = BadFormatError(
                            'Reference base required for non SNV')
                        self._log_conversion_error(read_lnum, l, e)
                        continue
                    if self.do_liftover:
                        prelift_wdict = copy.copy(wdict)
                        try:
                            wdict['chrom'], wdict['pos'] = self.liftover(
                                wdict['chrom'], wdict['pos'])
                        except LiftoverFailure as e:
                            num_errors += 1
                            self._log_conversion_error(read_lnum, l, e)
                            continue
                    p, r, a = (int(wdict['pos']), wdict['ref_base'],
                               wdict['alt_base'])
                    new_pos, new_ref, new_alt = self.standardize_pos_ref_alt(
                        '+', p, r, a)
                    unique, UID = self.vtracker.addVar(
                        wdict['chrom'], new_pos, new_ref, new_alt)
                    wdict['uid'] = UID
                    if unique:
                        write_lnum += 1
                        self.crv_writer.write_data(wdict)
                        if self.do_liftover:
                            prelift_wdict['uid'] = UID
                            self.crl_writer.write_data(prelift_wdict)
                        self.primary_converter.addl_operation_for_unique_variant(
                            wdict, no_unique_var)
                        no_unique_var += 1
                    if UID not in UIDMap:
                        # For this input line, only write to the .crm if the
                        # UID has not yet been written to the map file.
                        self.crm_writer.write_data({
                            'original_line': read_lnum,
                            'tags': wdict['tags'],
                            'uid': UID,
                            'fileno': self.input_path_dict2[f.name]
                        })
                        UIDMap.append(UID)
                    self.crs_writer.write_data(wdict)
            else:
                e = ExpectedException('No conversion result')
                self._log_conversion_error(read_lnum, l, e)
            cur_time = time.time()
            if total_lnum % 10000 == 0 \
                    or cur_time - last_status_update_time > 3:
                self.status_writer.queue_status_update(
                    'status',
                    'Running {} ({}): line {}'.format('Converter', cur_fname,
                                                      read_lnum))
                last_status_update_time = cur_time
        self.logger.info('error lines: %d' % num_errors)
    self._close_files()
    self.end()
    if self.status_writer is not None:
        self.status_writer.queue_status_update('num_input_var', total_lnum)
        self.status_writer.queue_status_update('num_unique_var', write_lnum)
        self.status_writer.queue_status_update('num_error_input', num_errors)
    end_time = time.time()
    self.logger.info('finished: %s' %
                     time.asctime(time.localtime(end_time)))
    runtime = round(end_time - start_time, 3)
    self.logger.info('num input lines: {}'.format(total_lnum))
    self.logger.info('runtime: %s' % runtime)
    self.status_writer.queue_status_update(
        'status',
        'Finished {} ({})'.format('Converter',
                                  self.primary_converter.format_name))
    return total_lnum, self.primary_converter.format_name
def run(self):
    """ Convert input file to a .crv file using the primary converter."""
    self.setup()
    start_time = time.time()
    multiple_files = len(self.input_files) > 1
    for f in self.input_files:
        self.primary_converter.setup(f)
        f.seek(0)
        read_lnum = 0
        write_lnum = 0
        num_errors = 0
        for l in f:
            cur_fname = os.path.basename(f.name)
            samp_prefix = '.'.join(cur_fname.split('.')[:-1])
            read_lnum += 1
            try:
                # all_wdicts is a list, since one input line can become
                # multiple output lines
                all_wdicts = self.primary_converter.convert_line(l)
                if all_wdicts is None:
                    continue
            except Exception as e:
                num_errors += 1
                self._log_conversion_error(read_lnum, l, e)
                continue
            if all_wdicts:
                UIDMap = []
                for wdict in all_wdicts:
                    chrom = wdict['chrom']
                    if not chrom.startswith('chr'):
                        chrom = 'chr' + chrom
                    wdict['chrom'] = self.chromdict.get(chrom, chrom)
                    if multiple_files:
                        wdict['sample_id'] = '_'.join(
                            [samp_prefix, wdict['sample_id']])
                    if wdict['ref_base'] == '' \
                            and wdict['alt_base'] not in ['A', 'T', 'C', 'G']:
                        num_errors += 1
                        e = BadFormatError(
                            'Reference base required for non SNV')
                        self._log_conversion_error(read_lnum, l, e)
                        continue
                    if self.do_liftover:
                        prelift_wdict = copy.copy(wdict)
                        try:
                            wdict['chrom'], wdict['pos'] = self.liftover(
                                wdict['chrom'], wdict['pos'])
                        except LiftoverFailure as e:
                            num_errors += 1
                            self._log_conversion_error(read_lnum, l, e)
                            continue
                    unique, UID = self.vtracker.addVar(
                        wdict['chrom'], int(wdict['pos']),
                        wdict['ref_base'], wdict['alt_base'])
                    wdict['uid'] = UID
                    if unique:
                        write_lnum += 1
                        self.crv_writer.write_data(wdict)
                        if self.do_liftover:
                            prelift_wdict['uid'] = UID
                            self.crl_writer.write_data(prelift_wdict)
                    if UID not in UIDMap:
                        # For this input line, only write to the .crm if the
                        # UID has not yet been written to the map file.
                        self.crm_writer.write_data({
                            'original_line': read_lnum,
                            'tags': wdict['tags'],
                            'uid': UID
                        })
                        UIDMap.append(UID)
                    self.crs_writer.write_data(wdict)
    self.logger.info('error lines: %d' % num_errors)
    self._close_files()
    if self.status_writer is not None:
        self.status_writer.queue_status_update('num_input_var', read_lnum)
        self.status_writer.queue_status_update('num_unique_var', write_lnum)
        self.status_writer.queue_status_update('num_error_input', num_errors)
    end_time = time.time()
    self.logger.info('finished: %s' %
                     time.asctime(time.localtime(end_time)))
    runtime = round(end_time - start_time, 3)
    self.logger.info('num input lines: {}'.format(read_lnum))
    self.logger.info('runtime: %s' % runtime)