def open_input_file(self, input_path):
    """Open ``input_path`` for text reading and return the file object.

    The text encoding is obtained from ``detect_encoding``. Paths ending in
    ``.gz`` are opened through ``gzip`` in text mode; every other path is
    opened with the built-in ``open``. The caller owns the returned handle
    and is responsible for closing it.
    """
    detected = detect_encoding(input_path)
    if not input_path.endswith(".gz"):
        return open(input_path, encoding=detected)
    return gzip.open(input_path, mode="rt", encoding=detected)
def _loop_definition(self):
    """Yield the leading ``#``-prefixed lines of the file at ``self.path``.

    Each line is stripped of surrounding whitespace before it is examined.
    Iteration stops at the first stripped line that does not start with
    ``#`` (this includes blank lines).

    The file is opened with the encoding reported by ``detect_encoding`` and
    is managed by a ``with`` block, so the handle is released even when the
    consumer stops iterating early or an exception propagates.

    Yields:
        str: a stripped line beginning with ``#``.
    """
    encoding = detect_encoding(self.path)
    with open(self.path, encoding=encoding) as f:
        for line in f:
            line = line.strip()
            if not line.startswith('#'):
                break
            yield line
def first_input_file(self):
    """Return a readable text handle for the first input.

    When ``self.pipeinput`` does not compare equal to ``False`` the handle is
    ``sys.stdin``. Otherwise ``self.input_paths[0]`` is opened using the
    encoding from ``detect_encoding``, through ``gzip`` when the path ends
    in ``.gz``.
    """
    # Deliberately an equality comparison against False rather than a
    # truthiness test, so a value such as None still selects stdin.
    if self.pipeinput != False:
        return sys.stdin
    first_path = self.input_paths[0]
    detected = detect_encoding(first_path)
    opener = gzip.open if first_path.endswith(".gz") else open
    return opener(first_path, mode="rt", encoding=detected)
def __init__(self, path, seekpos=None, chunksize=None):
    """Initialise the reader state for ``path``.

    Args:
        path: forwarded to the parent initialiser.
        seekpos: stored unchanged on ``self.seekpos``.
        chunksize: stored unchanged on ``self.chunksize``.

    The annotator metadata fields start out empty and are presumably
    filled in by ``self._setup_definition()``, which is called last.
    """
    super().__init__(path)
    self.seekpos, self.chunksize = seekpos, chunksize
    # self.path is expected to have been set by the parent initialiser.
    self.encoding = detect_encoding(self.path)
    self.annotator_name = self.annotator_displayname = self.annotator_version = ''
    # Separate list objects: these must not alias each other.
    self.no_aggregate_cols, self.index_columns = [], []
    self.report_substitution = None
    self._setup_definition()
def _loop_data(self):
    """Yield ``(line_number, text)`` for every non-comment line of ``self.path``.

    The file is read in binary mode and each raw line is decoded with the
    encoding reported by ``detect_encoding``; trailing ``\\r`` / ``\\n``
    characters are removed. Lines whose decoded text starts with ``#`` are
    skipped but still counted, so the yielded numbers are 1-based physical
    line numbers in the file.

    The handle is managed by a ``with`` block so it is closed even when the
    consumer abandons the generator early or decoding raises.

    Yields:
        tuple[int, str]: the 1-based line number and the decoded line.
    """
    encoding = detect_encoding(self.path)
    with open(self.path, 'rb') as f:
        for lnum, raw in enumerate(f, start=1):
            line = raw.decode(encoding).rstrip('\r\n')
            if line.startswith('#'):
                continue
            yield lnum, line
def setup(self): """ Do necesarry pre-run tasks """ if self.ready_to_convert: return # Open file handle to input path for input_path in self.input_paths: encoding = detect_encoding(input_path) if input_path.endswith('.gz'): f = gzip.open(input_path, mode='rt', encoding=encoding) else: f = open(input_path, encoding=encoding) self.input_files.append(f) # Read in the available converters self._initialize_converters() # Select the converter that matches the input format self._select_primary_converter() # Open the output files self._open_output_files() self.ready_to_convert = True
def setup (self): """ Do necesarry pre-run tasks """ if self.ready_to_convert: return # Open file handle to input path for input_path in self.input_paths: encoding = detect_encoding(input_path) self.input_files.append(open(input_path, encoding=encoding)) # Read in the available converters self._initialize_converters() # Select the converter that matches the input format self._select_primary_converter() # A correct .crv file is not processed. #todo handle this for multiple inputs. have to convert them so they can be merged # if self.input_format == 'crv' and \ # self.input_paths[0].split('.')[-1] == 'crv': # self.logger.info('Input file is already a crv file. Exiting converter.') # exit(0) # Open the output files self._open_output_files() self.ready_to_convert = True
def write_preface(self, level):
    """Start the output for ``level`` and gather sample names.

    Records ``level`` on ``self.level`` and closes any previously opened
    ``self.wf``. For every level other than ``'variant'`` nothing else
    happens. For ``'variant'``:

    * ``self.filename`` is opened for writing as UTF-8 and the two standard
      preface lines are passed to ``self.write_preface_lines``.
    * When ``self.input_format`` is ``'vcf'`` and ``self.args.inputfiles``
      is set, each input file is scanned: ``##`` header lines (other than
      ``##fileformat=``) are copied to the output once each, sample names
      are taken from the ``#CHROM`` line (prefixed with the file's base
      name when there are several inputs), and every non-``#`` line is
      stored in ``self.vcflines[input_index][line_number]``.
    * For any other input format the distinct sample ids are read through
      ``self.cursor2``; a missing id, or an empty result, is represented by
      ``'NOSAMPLEID'``.

    Args:
        level: name of the report level being started.
    """
    self.level = level
    if self.wf is not None:
        self.wf.close()
    if level != 'variant':
        return
    self.wf = open(self.filename, 'w', encoding='utf-8', newline='')
    lines = [
        '#fileformat=VCFv4.2',
        '#OpenCRAVATFileDate=' + datetime.datetime.now().strftime('%Y%m%d'),
    ]
    self.write_preface_lines(lines)
    self.vcflines = {}
    self.input_path_dict = {}
    if self.input_format == 'vcf':
        if self.args.inputfiles is not None:
            if isinstance(self.args.inputfiles, str):
                self.args.inputfiles = [self.args.inputfiles]
            for i, inputfile in enumerate(self.args.inputfiles):
                self.input_path_dict[inputfile] = i
            # A set gives constant-time "already written?" checks.
            written_headers = set()
            self.samples = []
            num_inputfiles = len(self.args.inputfiles)
            for inputfile in self.args.inputfiles:
                inputfile_prefix = os.path.basename(inputfile).split('.')[0]
                input_path_no = self.input_path_dict[inputfile]
                encoding = detect_encoding(inputfile)
                if inputfile.endswith('.gz'):
                    import gzip
                    f = gzip.open(inputfile, 'rt', encoding=encoding)
                else:
                    # Use the detected encoding here as well; it used to be
                    # applied to gzip inputs only.
                    f = open(inputfile, encoding=encoding)
                self.vcflines[input_path_no] = {}
                # Context manager closes the input even if parsing fails.
                with f:
                    for lineno, line in enumerate(f, start=1):
                        if line.startswith('##fileformat='):
                            continue
                        if line.startswith('##'):
                            if line not in written_headers:
                                self.wf.write(line)
                                written_headers.add(line)
                        elif line.startswith('#CHROM'):
                            # Strip only line terminators so that a final
                            # line without a newline keeps its last char.
                            toks = line.rstrip('\n').rstrip('\r').split('\t')
                            if len(toks) >= 10:
                                if num_inputfiles == 1:
                                    self.samples.extend(toks[9:])
                                else:
                                    self.samples.extend(
                                        inputfile_prefix + '_' + v
                                        for v in toks[9:]
                                    )
                        elif not line.startswith('#'):
                            self.vcflines[input_path_no][lineno] = line.rstrip(
                                '\n').rstrip('\r')
    else:
        self.cursor2.execute(
            'select distinct(base__sample_id) from sample')
        self.samples = []
        rows = self.cursor2.fetchall()
        if not rows:
            self.samples.append('NOSAMPLEID')
        else:
            for row in rows:
                v = row[0]
                if v is None:
                    v = 'NOSAMPLEID'
                self.samples.append(v)