def iterate_chunks(fw):
    """Yield header lines and sequence fragments from a ``>``-delimited stream.

    The stream is consumed through ``fw.read`` in blocks of 2**20 characters.
    Two kinds of items are yielded, in file order:

    * header lines: each match of ``>`` followed by text up to the first
      newline, yielded stripped (so the item still starts with ``>``);
    * sequence fragments: the text lying between headers, yielded after
      being passed through ``remove_whitespaces``.

    Text between two headers that spans several blocks is yielded as
    several consecutive fragments (one per block), and a fragment may be
    empty.  A trailing ``>...`` piece that never receives its newline is
    yielded as a fragment at end of input.

    Args:
        fw: object exposing ``read(size)`` that returns text and an empty
            value at end of input.

    Yields:
        str: header lines and sequence fragments as described above.
    """
    cmtreg = re.compile(r'>[^>]*?\n')
    # Unterminated ">..." text held back from the previous block so that a
    # header split across a block boundary is matched as a whole.
    carry = ""
    while True:
        chunk = fw.read(2**20)
        if not chunk:
            break
        if carry:
            chunk = carry + chunk
            carry = ""
        # Offsets are relative to the current block, so the scan position
        # must restart at 0 for every block.
        headpos = 0
        for mat in cmtreg.finditer(chunk):
            if mat.start() != headpos:
                yield remove_whitespaces(chunk[headpos:mat.start()])
            yield mat.group().strip()
            headpos = mat.end()
        tail = chunk[headpos:]
        # No complete header remains in the tail; a ">" here may be the start
        # of a header whose newline is in the next block, so defer it.
        cut = tail.find(">")
        if cut != -1:
            carry = tail[cut:]
            tail = tail[:cut]
        yield remove_whitespaces(tail)
    if carry:
        # End of input reached without a terminating newline: emit the
        # held-back text the same way unmatched text is emitted elsewhere.
        yield remove_whitespaces(carry)
def read(infile):
    """Load a substitution matrix from a whitespace-separated text file.

    ``infile`` is opened with ``streaming.FileWrapper``; if that raises
    ``IOError`` the lower-cased name is resolved through ``locate_submat``
    and the resulting path is opened instead.

    The first line that is neither blank nor starts with ``#`` is the column
    header; it is passed through ``remove_whitespaces`` to form ``jstring``.
    Every following non-blank line must split into ``len(jstring) + 1``
    fields: a row label followed by integer scores.

    Args:
        infile: file name, or a name that ``locate_submat`` can resolve
            (presumably a bundled matrix name -- confirm against
            ``locate_submat``).

    Returns:
        tuple: ``(istring, jstring, submatr)`` where ``istring`` is the
        concatenation of the row labels, ``jstring`` the column header and
        ``submatr`` a NumPy integer array built from the score rows.

    Raises:
        ValueError: if no header line is found, if a row has an unexpected
            number of fields, or if a score cannot be converted by ``int``.
    """
    try:
        wrapper = streaming.FileWrapper(infile, "r")
    except IOError:
        wrapper = streaming.FileWrapper(locate_submat(infile.lower()), "r")

    row_labels = []
    score_rows = []
    with wrapper:
        # Header scan: skip blank lines and '#' comments, stop at the first
        # real line.
        jstring = None
        for raw in wrapper.file:
            stripped = raw.strip()
            if not stripped or stripped.startswith("#"):
                continue
            jstring = remove_whitespaces(stripped)
            break
        if jstring is None:
            raise ValueError("this sub matrix file is broken")

        # One label column plus one score per header character.
        expected_fields = len(jstring) + 1

        # Continue on the same file iterator, right after the header line.
        for raw in wrapper.file:
            fields = raw.split()
            if not fields:
                continue
            if len(fields) != expected_fields:
                raise ValueError("this sub matrix file is broken")
            row_labels.append(fields[0])
            score_rows.append([int(value) for value in fields[1:]])

    return "".join(row_labels), jstring, np.array(score_rows, dtype=int)