def readfile(self, fname):
    """Read a partition file (csv or yaml) into this object.

    Args:
        fname: path of the partition file; the suffix selects the reader
            ('.csv' goes through self.readlines(), '.yaml' through
            utils.read_yaml_output() with this object passed as <cpath>).

    Raises:
        Exception: if <fname> is None, the file is empty, a csv file has no
            'partition' header, or the suffix is neither '.csv' nor '.yaml'.
    """
    if fname is None:
        raise Exception('can\'t read NoneType partition file')
    if os.stat(fname).st_size == 0:
        raise Exception('partition file %s has size zero' % fname)

    suffix = utils.getsuffix(fname)
    if suffix == '.csv':
        with open(fname, 'r') as infile:
            reader = csv.DictReader(infile)
            if 'partition' not in reader.fieldnames:
                raise Exception('\'partition\' not among headers in %s, maybe this isn\'t a partition file? (if you\'re running \'view-output\' on a deprecated csv output file, you may need to run \'view-annotations\' instead, to tell it that this is an annotation file rather than a partition file)' % fname)
            lines = list(reader)  # read every row while the file is still open
        self.readlines(lines, process_csv=True)
    elif suffix == '.yaml':
        utils.read_yaml_output(fname, cpath=self)
    else:
        # the original formatted this with an undefined name (outfname), which turned the intended error into a NameError
        raise Exception('unhandled annotation file suffix %s' % fname)
def readfile(self, fname):
    """Read a partition file (csv or yaml) into this object.

    Args:
        fname: path of the partition file; the suffix selects the reader
            ('.csv' goes through self.readlines(), '.yaml' through
            utils.read_yaml_output() with this object passed as <cpath>).

    Raises:
        Exception: if <fname> is None, the file is empty, a csv file has no
            'partition' header, or the suffix is neither '.csv' nor '.yaml'.
    """
    if fname is None:
        raise Exception('can\'t read NoneType partition file')
    if os.stat(fname).st_size == 0:
        raise Exception('partition file %s has size zero' % fname)

    suffix = utils.getsuffix(fname)
    if suffix == '.csv':
        with open(fname, 'r') as infile:
            reader = csv.DictReader(infile)
            if 'partition' not in reader.fieldnames:
                raise Exception(
                    '\'partition\' not among headers in %s, maybe this isn\'t a partition file?'
                    % fname)
            lines = list(reader)  # read every row while the file is still open
        self.readlines(lines, process_csv=True)
    elif suffix == '.yaml':
        utils.read_yaml_output(fname, cpath=self)
    else:
        # the original formatted this with an undefined name (outfname), which turned the intended error into a NameError
        raise Exception('unhandled annotation file suffix %s' % fname)
def read_sequence_file(infname, is_data, n_max_queries=-1, args=None, simglfo=None, quiet=False, more_input_info=None):
    """Read input sequences (and, for simulation, their true annotations) from <infname>.

    The reader is chosen from the file suffix: suffixes in <delimit_info> are
    read with csv.DictReader, '.fa'/'.fasta'/'.fastx' with utils.read_fastx(),
    and '.yaml' with utils.read_yaml_output().

    Args:
        infname: path of the input file.
        is_data: if False, each line is also treated as simulation truth and
            stored in <reco_info> (it must then contain 'v_gene').
        n_max_queries: if positive, stop after adding this many queries.
        args: optional options object; the attributes read here are istartstop,
            name_column, seq_column, abbreviate, queries, reco_ids,
            seed_unique_id and input_metafname.
        simglfo: germline info passed to utils.add_implicit_info() for each
            simulated line; replaced by the glfo from the yaml file when
            reading simulation from '.yaml'.
        quiet: suppress the --n-max-queries message.
        more_input_info: optional dict of extra queries merged into the result
            after reading.

    Returns:
        (input_info, reco_info, yaml_glfo): <input_info> is an OrderedDict
        keyed by uid, <reco_info> is None for data and otherwise an
        OrderedDict of deep-copied lines, and <yaml_glfo> is None unless the
        input was a '.yaml' file.

    Raises:
        Exception: on an unhandled suffix, a missing sequence column, a
            multi-sequence line, a repeated uid, characters outside
            utils.alphabet, missing simulation info, or if no sequences
            were read.
    """
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    # NOTE print calls below are written as single-argument parenthesized calls, which behave the same as python 2 print statements and also parse under python 3
    yaml_glfo = None
    seqfile = None  # only set for delimited text input; closed in the <finally> below
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        # NOTE don't use the istartstop kw arg here, since it interferes with the istartstop treatment in the loop below
        # NOTE also can't filter on args.queries here if we're also abbreviating (translating) uids
        reader = utils.read_fastx(infname, name_key='unique_ids', seq_key='input_seqs', add_info=False, sanitize=True, n_max_queries=n_max_queries,
                                  queries=(args.queries if (args is not None and not args.abbreviate) else None))
    elif suffix == '.yaml':
        # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        yaml_glfo, reader, _ = utils.read_yaml_output(infname, n_max_queries=n_max_queries, synth_single_seqs=True, dont_add_implicit_info=True)
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    try:
        for line in reader:
            iline += 1
            if args is not None:
                if args.istartstop is not None:
                    if iline < args.istartstop[0]:
                        continue
                    if iline >= args.istartstop[1]:
                        break
                if args.name_column is not None:
                    line['unique_ids'] = line[args.name_column]
                    del line[args.name_column]
                if args.seq_column is not None:
                    line['input_seqs'] = line[args.seq_column]
                    if args.seq_column != 'seqs':  # backwards compatibility edge case: a column called 'seqs' is left in place
                        del line[args.seq_column]
            if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
                print(' %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning')))
                iname = 0
            if iname is not None:
                line['unique_ids'] = '%09d' % iname
                iname += 1
            if 'input_seqs' not in line and 'seq' not in line:
                raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
            if suffix != '.yaml':
                utils.process_input_line(line)
            if len(line['unique_ids']) > 1:
                raise Exception('can\'t yet handle multi-seq csv input files')
            uid = line['unique_ids'][0]
            if uid in input_info:  # already seen this uid, so append the first unused numerical suffix
                new_uid = uid
                iid = 2
                while new_uid in input_info:
                    new_uid = uid + '-' + str(iid)
                    iid += 1
                print(' %s uid %s already read from input file %s, so replacing with new uid %s' % (utils.color('yellow', 'warning'), uid, infname, new_uid))
                uid = new_uid
            inseq = line['input_seqs'][0]

            # it would be nice to check here for forbidden characters in <uid> (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them

            if args is not None:
                if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                    uid, potential_names, used_names = utils.choose_new_uid(potential_names, used_names)
                if args.queries is not None and uid not in args.queries:
                    continue
                if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                    continue
                if args.seed_unique_id is not None and uid == args.seed_unique_id:
                    found_seed = True

            if uid in input_info:
                raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

            unexpected_chars = set([ch for ch in inseq if ch not in utils.alphabet])
            if unexpected_chars:
                raise Exception('unexpected character%s %s (not among %s) in input sequence with id %s:\n %s' % (utils.plural(len(unexpected_chars)), ', '.join([('\'%s\'' % ch) for ch in unexpected_chars]), utils.nukes + utils.ambiguous_bases, uid, inseq))

            # da business
            input_info[uid] = {'unique_ids': [uid, ], 'seqs': [inseq, ]}

            if not is_data:
                if 'v_gene' not in line:
                    raise Exception('simulation info not found in %s' % infname)
                reco_info[uid] = copy.deepcopy(line)
                if simglfo is not None:
                    utils.add_implicit_info(simglfo, reco_info[uid])
                for line_key in utils.input_metafile_keys.values():
                    # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below)
                    # (meta info here means more or less any input info besides name and sequence; the distinction matters because we want to support fastas, which shouldn't handle anything else)
                    if line_key in reco_info[uid]:
                        input_info[uid][line_key] = copy.deepcopy(reco_info[uid][line_key])  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

            n_queries_added += 1
            if n_max_queries > 0 and n_queries_added >= n_max_queries:
                if not quiet:  # <quiet> currently only affects this one message
                    print(' --n-max-queries: stopped after reading %d queries from input file' % len(input_info))
                break
    finally:
        if seqfile is not None:  # nothing below reads from <reader>, so the file can be released as soon as the loop is done (or fails)
            seqfile.close()

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        shared_uids = set(more_input_info) & set(input_info)
        if shared_uids:
            # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
            print(' %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (utils.color('red', 'note:'), len(shared_uids), ' '.join(shared_uids)))
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname, input_info.values(), debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data, iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo