def tabfile_feeder(datafile, header=1, sep='\t',
                   includefn=None,
                   assert_column_no=None):
    '''A generator yielding each row of a delimited text file as a list.

    datafile:         passed as-is to open_anyfile() to obtain the input
                      file object.
    header:           number of leading rows to skip before yielding data.
    sep:              column delimiter handed to csv.reader.
    includefn:        optional callable taking the row (a list of column
                      values); rows for which it returns a false value are
                      skipped. When omitted, every row is yielded.
    assert_column_no: when set to a non-zero number, every data row must
                      have exactly that many columns. The check runs before
                      includefn is consulted.

    Raises ValueError when a row fails the column-count check. For any
    ValueError raised while reading, the current row number is printed
    before the exception is re-raised.
    '''
    with open_anyfile(datafile) as in_f:
        reader = csv.reader(in_f, delimiter=sep)
        lineno = 0
        try:
            for _ in range(header):
                try:
                    # builtin next() works on both Python 2 and Python 3
                    # iterators, unlike the Python 2-only reader.next()
                    next(reader)
                except StopIteration:
                    # fewer rows than header lines: nothing to yield.
                    # Returning here avoids leaking StopIteration out of
                    # the generator body.
                    return
                lineno += 1
            for ld in reader:
                # count every row read (even filtered ones) so that the
                # number printed on error points at the offending row
                lineno += 1
                if assert_column_no and len(ld) != assert_column_no:
                    err = "Unexpected column number:" \
                          " got {}, should be {}".format(len(ld), assert_column_no)
                    raise ValueError(err)
                if not includefn or includefn(ld):
                    # rows are yielded exactly as produced by csv.reader
                    yield ld
        except ValueError:
            print("Error at line number:", lineno)
            raise
def get_genome_in_bit(chr_fa_folder):
    '''
    Read one fasta file per chromosome from chr_fa_folder, encode each
    sequence into a bitarray and return a dictionary keyed by chromosome
    name ('1' .. '22', 'X', 'Y', 'MT').

    chr_fa_folder is the folder holding the gzipped fasta files, named
        chr<i>.fa.gz  (e.g. chr1.fa.gz)
    The fasta files can be downloaded from the NCBI FTP site:
    ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/

    Progress and elapsed time are printed for each file and for the
    whole run.
    '''
    genome_bits = {}
    chromosomes = list(map(str, range(1, 23))) + ['X', 'Y', 'MT']
    started_all = time.time()
    for chrom in chromosomes:
        started_chrom = time.time()
        # an alternative naming scheme seen previously:
        #   'hs_ref_GRCh37.p5_chr{}.fa.gz'
        base_name = 'chr{}.fa.gz'.format(chrom)
        print("Loading {}...".format(base_name), end='')
        fasta_path = os.path.join(chr_fa_folder, base_name)
        with open_anyfile(fasta_path) as fasta_f:
            fasta_f.readline()   # the first line is the header, not sequence
            encoded = bitarray()
            for raw_line in fasta_f:
                # encode one sequence line (without its newline) and
                # append it to the running chromosome bitarray
                encoded += nuc_to_bit(raw_line.rstrip('\n'))
            genome_bits[chrom] = encoded
        print("done.[{}]".format(timesofar(started_chrom)))
    print('=' * 20)
    print("Finished. [{}]".format(timesofar(started_all)))
    return genome_bits
def tabfile_feeder(datafile, header=1, sep='\t',
                   includefn=None,
                   assert_column_no=None):
    '''A generator yielding each row of a delimited text file as a list.

    datafile:         passed as-is to open_anyfile() to obtain the input
                      file object.
    header:           number of leading rows to skip before yielding data.
    sep:              column delimiter handed to csv.reader.
    includefn:        optional callable taking the row (a list of column
                      values); rows for which it returns a false value are
                      skipped. When omitted, every row is yielded.
    assert_column_no: when set to a non-zero number, every data row must
                      have exactly that many columns. The check runs before
                      includefn is consulted.

    Raises ValueError when a row fails the column-count check. For any
    ValueError raised while reading, the current row number is printed
    before the exception is re-raised.
    '''
    with open_anyfile(datafile) as in_f:
        reader = csv.reader(in_f, delimiter=sep)
        lineno = 0
        try:
            for _ in range(header):
                try:
                    # builtin next() works on both Python 2 and Python 3
                    # iterators, unlike the Python 2-only reader.next()
                    next(reader)
                except StopIteration:
                    # fewer rows than header lines: nothing to yield.
                    # Returning here avoids leaking StopIteration out of
                    # the generator body.
                    return
                lineno += 1
            for ld in reader:
                # count every row read (even filtered ones) so that the
                # number printed on error points at the offending row
                lineno += 1
                if assert_column_no and len(ld) != assert_column_no:
                    err = "Unexpected column number:" \
                          " got {}, should be {}".format(len(ld), assert_column_no)
                    raise ValueError(err)
                if not includefn or includefn(ld):
                    # rows are yielded exactly as produced by csv.reader
                    yield ld
        except ValueError:
            print("Error at line number:", lineno)
            raise
def get_genome_in_bit(chr_fa_folder):
    '''
    Read one fasta file per chromosome from chr_fa_folder, encode each
    sequence into a bitarray and return a dictionary keyed by chromosome
    name ('1' .. '22', 'X', 'Y', 'MT').

    chr_fa_folder is the folder holding the gzipped fasta files, named
        chr<i>.fa.gz  (e.g. chr1.fa.gz)
    The fasta files can be downloaded from the NCBI FTP site:
    ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/

    Progress and elapsed time are printed for each file and for the
    whole run.
    '''
    genome_bits = {}
    chromosomes = list(map(str, range(1, 23))) + ['X', 'Y', 'MT']
    started_all = time.time()
    for chrom in chromosomes:
        started_chrom = time.time()
        # an alternative naming scheme seen previously:
        #   'hs_ref_GRCh37.p5_chr{}.fa.gz'
        base_name = 'chr{}.fa.gz'.format(chrom)
        print("Loading {}...".format(base_name), end='')
        fasta_path = os.path.join(chr_fa_folder, base_name)
        with open_anyfile(fasta_path) as fasta_f:
            fasta_f.readline()   # the first line is the header, not sequence
            encoded = bitarray()
            for raw_line in fasta_f:
                # encode one sequence line (without its newline) and
                # append it to the running chromosome bitarray
                encoded += nuc_to_bit(raw_line.rstrip('\n'))
            genome_bits[chrom] = encoded
        print("done.[{}]".format(timesofar(started_chrom)))
    print('='*20)
    print("Finished. [{}]".format(timesofar(started_all)))
    return genome_bits
def rec_handler(infile, block_end='\n', skip=0, include_block_end=False, as_list=False):
    '''A generator to return a record (block of text) at once from the infile.

    Records are runs of consecutive lines separated by one or more lines
    equal to block_end (an empty line by default).

    infile:            passed as-is to open_anyfile() to obtain the input
                       file object.
    block_end:         the exact line content that separates records.
    skip:              number of leading lines to discard before parsing.
    include_block_end: if True, block_end is appended to every record
                       returned.
    as_list:           if True, each record is returned as a list of lines;
                       otherwise the lines are joined into one string.
    '''
    with open_anyfile(infile) as in_f:
        for _ in range(skip or 0):
            in_f.readline()
        # groupby alternates between runs of separator lines (key True)
        # and runs of record lines (key False); only the latter are records
        for is_separator, group in itertools.groupby(in_f, lambda line: line == block_end):
            if is_separator:
                continue
            # Always bind the record iterable. Previously it was only bound
            # when include_block_end was True, so the default call failed
            # with an unbound local variable.
            if include_block_end:
                record = itertools.chain(group, (block_end,))
            else:
                record = group
            yield (list(record) if as_list else ''.join(record))
def rec_handler(infile, block_end='\n', skip=0, include_block_end=False, as_list=False):
    '''A generator to return a record (block of text) at once from the infile.

    Records are runs of consecutive lines separated by one or more lines
    equal to block_end (an empty line by default).

    infile:            passed as-is to open_anyfile() to obtain the input
                       file object.
    block_end:         the exact line content that separates records.
    skip:              number of leading lines to discard before parsing.
    include_block_end: if True, block_end is appended to every record
                       returned.
    as_list:           if True, each record is returned as a list of lines;
                       otherwise the lines are joined into one string.
    '''
    with open_anyfile(infile) as in_f:
        for _ in range(skip or 0):
            in_f.readline()
        # groupby alternates between runs of separator lines (key True)
        # and runs of record lines (key False); only the latter are records
        for is_separator, group in itertools.groupby(in_f, lambda line: line == block_end):
            if is_separator:
                continue
            # Always bind the record iterable. Previously it was only bound
            # when include_block_end was True, so the default call failed
            # with an unbound local variable.
            if include_block_end:
                record = itertools.chain(group, (block_end,))
            else:
                record = group
            yield (list(record) if as_list else ''.join(record))