def get_g4s_as_bed(self, seq, seq_id='unknown', use_bed12=True):
    '''
    Query a sequence for G4s using G4Regex and yield one bed record per match.

    Every pattern in ``self._regex['+']`` and then ``self._regex['-']`` is
    searched against ``seq`` with overlapping matches enabled.

    seq:       the sequence to search.
    seq_id:    identifier handed to the record formatter; pass one to get
               fully formatted bed records.
    use_bed12: when true each match is formatted with ``self._format_bed12``
               (keeps predicted loop/tetrad positional information),
               otherwise with ``self._format_bed6``.

    Yields whatever the selected formatter returns for each match.

    The regex cache is purged when the generator finishes. This is done in
    a ``finally`` clause so that it also happens when the caller stops
    iterating early (generator closed / garbage collected) or when a
    formatter raises, not only when the generator is exhausted.
    '''
    try:
        for strand in '+-':
            for r in self._regex[strand]:
                # NOTE(review): the flags are unpacked positionally after
                # ``seq``, so the first element lands in ``flags`` and any
                # further elements in the following positional parameters
                # of ``regex.finditer`` -- confirm ``_regex_flags`` holds
                # at most one (already OR-ed) flag value.
                for m in regex.finditer(r, seq, *self._regex_flags,
                                        overlapped=True):
                    if use_bed12:
                        yield self._format_bed12(m, seq_id, strand)
                    else:
                        yield self._format_bed6(m, seq_id, strand)
    finally:
        # clear re cache to save memory
        regex.purge()
def purge():
    """Purge caches.

    Calls ``_purge_cache()`` first and then ``_regex.purge()``. Both names
    are defined outside this block; takes no arguments and returns ``None``.
    """
    _purge_cache()
    _regex.purge()
def consumer(ErrorBundle, currCount, TotalErrors):
    """Try to align one OCR error with its intended text in the clean corpus.

    Parameters:
        ErrorBundle: a pair ``(OCRError, nextOCRError)``; it is unpacked
            below. The attributes read from these objects are ``Error``,
            ``Context`` (indexed with 0 = left, 1 = right), ``Position``
            and ``ID``.
        currCount, TotalErrors: forwarded to ``printProgress`` only.

    Strategies, tried in this order:
        1. If ``OCRError`` is already in ``splitErrors_found`` it is removed
           from that collection and ``None`` is returned.
        2. If ``nextOCRError`` is not the ``ErrorContext.LAST_ERROR`` marker
           and its position is within ``len(Error) + 2`` of this error, the
           two are treated as one split word: first a fuzzy word-list lookup
           ('SPLITDICT'), then a fuzzy search over clean-corpus slices
           ('SPLIT').
        3. Otherwise the error is glued to the neighbouring context word on
           the right ('SPLITDICT-R') and then on the left ('SPLITDICT-L')
           and looked up in the word list.
        4. Unless disabled by a failed split search, a search with the
           pattern from ``constructErrorRegex`` over clean-corpus slices
           ('REG').
        5. If nothing matched, the error is appended to
           'printedData/Enchant/unmatched.txt' and ``None`` is returned.

    Returns:
        The ``OCRalignment(...)`` object built by the first successful
        strategy, or ``None``.

    Side effects: rebinds the globals ``_MostProbablePos`` and
    ``_CurrentCorpus``, appends to ``_CleanDirtyDiffs``, adds to / removes
    from ``splitErrors_found``, may purge the regex cache and run the garbage
    collector, and appends to files under 'printedData/Enchant/'.

    NOTE(review): ``kwargs`` is not a parameter of this function; it has to
    be provided by an enclosing or module-level scope -- confirm.
    NOTE(review): this block was re-indented from a whitespace-collapsed
    source. The attachment of the ``else`` branches and the nesting of the
    trailing "unmatched" logging follow the most plausible reading and should
    be confirmed against the original file.
    """
    printProgress(TotalErrors, currCount)
    # Indices into an error's Context pair.
    RIGHTCONTEXT = 1
    LEFTCONTEXT = 0
    global _MostProbablePos, _CurrentCorpus
    # Load the clean corpus on first use.
    if not _CurrentCorpus:
        _CurrentCorpus = parallel_corpora.getCorpora('clean')
    # Concatenate every capture of the named group into one string.
    getFullMatch = lambda matchObject, name: "".join(
        matchObject.captures(name))
    OCRError, nextOCRError = ErrorBundle
    if _MostProbablePos is None:
        _MostProbablePos = CD_avg() + OCRError.Position
    # This error was already recorded as part of a split error (see the
    # splitErrors_found.add(nextOCRError) calls below): skip it.
    # presumably splitErrors_found is a manager proxy (it exposes
    # _getvalue()) -- verify against the caller.
    if OCRError in splitErrors_found._getvalue():
        _MostProbablePos = CD_avg() + nextOCRError.Position
        splitErrors_found.remove(OCRError)
        return None
    Error, OCRContext = OCRError.Error, OCRError.Context
    # Locate both context strings verbatim in the clean corpus; when both are
    # found, in order and fewer than 88 characters apart, their midpoint is
    # used as the search anchor instead of _MostProbablePos.
    Left = _CurrentCorpus.find(OCRContext[0].lstrip())
    Right = _CurrentCorpus.find(OCRContext[1].rstrip())
    SEARCHPOS = None
    if Left != -1 and Right != -1:
        if 0 < (Right - Left) < 88:
            SEARCHPOS = int((Left + Right) / 2)
    DO_REG_SEARCH = True
    # Next error is close enough to be the second half of the same word.
    if nextOCRError.ID != ErrorContext.LAST_ERROR and \
            abs( OCRError.Position - nextOCRError.Position ) <= len( Error ) + 2:
        # Rebuild the text as it appears in the OCR output: this error, the
        # right-context text up to the first character of the next error,
        # then the next error.
        start = OCRContext[RIGHTCONTEXT].find(nextOCRError.Error[0])
        ErrorSR = Error + OCRContext[
            RIGHTCONTEXT][:start] + nextOCRError.Error
        # Fuzzy whole-line lookup of the two halves joined together.
        # NOTE(review): Error / nextOCRError.Error are inserted into the
        # pattern without regex.escape (unlike the _splitR/_splitL patterns
        # further down) -- confirm this is intended.
        dictMatch = regex.search(
            r"(?P<match>^" + Error + nextOCRError.Error + r"$){1i<=1}",
            kwargs.get('WordList'), regex.MULTILINE)
        if dictMatch:
            # Remember the second half so its own consumer call skips it.
            with kwargs.get("splitErrors_lock"):
                splitErrors_found.add(nextOCRError)
            X = OCRalignment(dictMatch.group('match'), ErrorSR,
                             (OCRContext[0], nextOCRError.Context[1]),
                             ("", ""), OCRError.ID, 'SPLITDICT')
            _MostProbablePos = CD_avg() + nextOCRError.Position
            if _DEBUG:
                with kwargs.get("stream_lock"):
                    with open('printedData/Enchant/matched.txt', 'a') as Aout:
                        Aout.write(str(X) + '\n')
            return X
        else:
            # Same anchoring as above, but using the right context of the
            # next error and a 98-character window.
            LeftS = _CurrentCorpus.find(OCRContext[0].lstrip())
            RightS = _CurrentCorpus.find(nextOCRError.Context[1].rstrip())
            SPLITSEARCHPOS = None
            if LeftS != -1 and RightS != -1:
                if 0 < (RightS - LeftS) < 98:
                    SPLITSEARCHPOS = int((LeftS + RightS) / 2)
            rxSplitErrPattern = constructSplitErrorRegex(
                Error, OCRContext, nextOCRError)
            split_search_itr = 0
            split_search_max = 250
            splitMatches = []
            countUpdate = False
            # Search successive corpus slices around the anchor position.
            while split_search_itr < split_search_max:
                corporaSliceObj = parallel_corpora.getCorporaSlice(
                    "clean", (SPLITSEARCHPOS if SPLITSEARCHPOS is not None else
                              int(_MostProbablePos)), split_search_itr)
                # An empty slice after the first few iterations ends the
                # search.
                if not corporaSliceObj['slice'] and split_search_itr > 4:
                    break
                matchObjSplitErr = rxSplitErrPattern.search(
                    corporaSliceObj['slice'])
                split_search_itr += 1
                if not matchObjSplitErr:
                    # On every 13th iteration without a match, purge the
                    # regex cache and run the garbage collector.
                    if not split_search_itr % 13:
                        regex.purge()
                        gc.collect()
                else:
                    matchObjSplitErr.detach_string()
                    splitMatches.append(
                        (matchObjSplitErr, corporaSliceObj))
                    # Summed fuzzy_counts below 4: accept immediately.
                    if sum(matchObjSplitErr.fuzzy_counts, 0) < 4:
                        break
                    # After the first match, shrink the loop bound once:
                    # stop now when the anchor came from the contexts,
                    # otherwise allow 14 more iterations.
                    if not countUpdate:
                        if SPLITSEARCHPOS is not None:
                            split_search_max = split_search_itr
                        else:
                            split_search_max = split_search_itr + 14
                        countUpdate = True
            if splitMatches:
                # Keep the candidate with the smallest summed fuzzy_counts
                # (the first one on ties).
                editDists = list(
                    map(lambda arg: sum(arg[0].fuzzy_counts, 0), splitMatches))
                correctMatch, SliceObj = splitMatches[editDists.index(
                    min(editDists))]
                intendedWrdSR = getFullMatch(correctMatch, 'errorMatch')
                # 25 characters before the first 'errorMatch' span and 21
                # characters after the last one, taken from the slice.
                clean_context = (SliceObj[ 'slice' ][ correctMatch.spans( 'errorMatch' )[ 0 ][ 0 ] - 25: \
                                                      correctMatch.spans( 'errorMatch' )[ 0 ][ 0 ] ],
                                 SliceObj[ 'slice' ][ correctMatch.spans( 'errorMatch' )[ -1 ][ 1 ]: \
                                                      correctMatch.spans( 'errorMatch' )[ -1 ][ 1 ] + 21 ])
                # Midpoint of the whole match, converted by
                # _relativeToActual using the slice's 'data' entry.
                CurrMatchPos = _relativeToActual(
                    int(.5 * sum(correctMatch.span())), SliceObj['data'])
                # Record the difference between the matched position and
                # the OCR error position.
                offset = CurrMatchPos - OCRError.Position
                _CleanDirtyDiffs.append(offset)
                _MostProbablePos = CD_avg() + nextOCRError.Position
                with kwargs.get("splitErrors_lock"):
                    splitErrors_found.add(nextOCRError)
                X = OCRalignment(intendedWrdSR, ErrorSR,
                                 (OCRContext[0], nextOCRError.Context[1]),
                                 clean_context, OCRError.ID, 'SPLIT')
                if _DEBUG:
                    with kwargs.get("stream_lock"):
                        with open('printedData/Enchant/matched.txt', 'a') as Aout:
                            Aout.write(str(X) + '\n')
                return X
            else:
                # NOTE(review): a failed split search also disables the
                # regular search below, so the error goes straight to the
                # "unmatched" log -- confirm this is intended.
                DO_REG_SEARCH = False
    else:
        # Last whitespace-separated token of the left context and first
        # token of the right context (None when a context has no tokens).
        splitL = OCRContext[0].split()
        if splitL:
            splitL = splitL[-1]
        else:
            splitL = None
        splitR = OCRContext[1].split()
        if splitR:
            splitR = splitR[0]
        else:
            splitR = None
        # `x not in string.punctuation` is a substring test on the
        # punctuation string.
        if splitR and splitR not in string.punctuation:
            # Drop one trailing punctuation character, then look up
            # Error + right token as a whole line of the word list.
            _splitR = splitR if splitR[
                -1] not in string.punctuation else splitR[:-1]
            _splitR = r"(?P<match>^" + Error + regex.escape(
                _splitR) + r"$)"
            # NOTE(review): _splitR is a non-empty pattern string at this
            # point, so the `if _splitR else None` guard always takes the
            # search branch.
            dictMatchR = (regex.search(_splitR, kwargs.get('WordList'),
                                       regex.MULTILINE) if _splitR else None)
            if dictMatchR:
                # Keep a separating space in the OCR-side text when the
                # right context starts with one.
                if OCRContext[RIGHTCONTEXT][0] == ' ':
                    SEP = ' '
                else:
                    SEP = ''
                X = OCRalignment(dictMatchR.group('match'),
                                 Error + SEP + splitR,
                                 (OCRContext[0], OCRContext[1]), ('', ''),
                                 OCRError.ID, 'SPLITDICT-R')
                _MostProbablePos = CD_avg() + nextOCRError.Position
                if _DEBUG:
                    with kwargs.get("stream_lock"):
                        with open('printedData/Enchant/matched.txt', 'a') as Aout:
                            Aout.write(str(X) + '\n')
                return X
        if splitL and splitL not in string.punctuation:
            # Mirror image: drop one leading punctuation character and look
            # up left token + Error.
            _splitL = splitL if splitL[
                0] not in string.punctuation else splitL[1:]
            _splitL = r"(?P<match>^" + regex.escape(
                _splitL) + Error + r"$)"
            dictMatchL = (regex.search(_splitL, kwargs.get('WordList'),
                                       regex.MULTILINE) if _splitL else None)
            if dictMatchL:
                if OCRContext[LEFTCONTEXT][-1] == ' ':
                    SEP = ' '
                else:
                    SEP = ''
                X = OCRalignment(dictMatchL.group('match'),
                                 splitL + SEP + Error,
                                 (OCRContext[0], OCRContext[1]), ('', ''),
                                 OCRError.ID, 'SPLITDICT-L')
                _MostProbablePos = CD_avg() + nextOCRError.Position
                if _DEBUG:
                    with kwargs.get("stream_lock"):
                        with open('printedData/Enchant/matched.txt', 'a') as Aout:
                            Aout.write(str(X) + '\n')
                return X
    if DO_REG_SEARCH:
        # Regular search: same slice-by-slice loop as the split search
        # above, with the single-error pattern and the SEARCHPOS anchor.
        rx = constructErrorRegex(Error, OCRContext)
        iteration = 0
        max_iteration = 250
        matches = []
        countUpdate2 = False
        while iteration < max_iteration:
            corporaSliceObj = parallel_corpora.getCorporaSlice(
                "clean", (SEARCHPOS if SEARCHPOS is not None else
                          int(_MostProbablePos)), iteration)
            if not corporaSliceObj['slice'] and iteration > 4:
                break
            matchObj = rx.search(corporaSliceObj['slice'])
            iteration += 1
            if not matchObj:
                # On every 13th iteration without a match, purge the regex
                # cache and run the garbage collector.
                if not iteration % 13:
                    regex.purge()
                    gc.collect()
            else:
                matchObj.detach_string()
                matches.append((matchObj, corporaSliceObj))
                # Summed fuzzy_counts below 4: accept immediately.
                if sum(matchObj.fuzzy_counts, 0) < 4:
                    break
                # After the first match, shrink the loop bound once.
                if not countUpdate2:
                    if SEARCHPOS is not None:
                        max_iteration = iteration
                    else:
                        max_iteration = iteration + 14
                    countUpdate2 = True
        if matches:
            # Keep the candidate with the smallest summed fuzzy_counts.
            editDists = list(
                map(lambda arg: sum(arg[0].fuzzy_counts, 0), matches))
            correctMatch, SliceObj = matches[editDists.index(
                min(editDists))]
            intendedWord = getFullMatch(correctMatch, 'errorMatch')
            clean_context = (SliceObj[ 'slice' ][ correctMatch.spans( 'errorMatch' )[ 0 ][ 0 ] - 25: \
                                                  correctMatch.spans( 'errorMatch' )[ 0 ][ 0 ] ],
                             SliceObj[ 'slice' ][ correctMatch.spans( 'errorMatch' )[ -1 ][ 1 ]: \
                                                  correctMatch.spans( 'errorMatch' )[ -1 ][ 1 ] + 21 ])
            CurrMatchPos = _relativeToActual(
                int(.5 * sum(correctMatch.span())), SliceObj['data'])
            offset = CurrMatchPos - OCRError.Position
            _CleanDirtyDiffs.append(offset)
            _MostProbablePos = CD_avg() + nextOCRError.Position
            X = OCRalignment(intendedWord, Error, OCRContext, clean_context,
                             OCRError.ID, 'REG')
            if _DEBUG:
                with kwargs.get("stream_lock"):
                    with open('printedData/Enchant/matched.txt', 'a') as Aout:
                        Aout.write(str(X) + '\n')
            return X
    # No strategy produced an alignment: log the error together with the
    # position estimate that was used, then move the estimate on to the
    # next error.
    with kwargs.get("stream2_lock"):
        with open('printedData/Enchant/unmatched.txt', 'a') as Uout:
            Uout.write(str(OCRError) + "," + str(_MostProbablePos) + '\n')
    _MostProbablePos = CD_avg() + nextOCRError.Position
    return None