def split_match(fragments, start=0, end=-1):
    """
    Recursively aligns a list of transcript fragments and yields one result
    per fragment, in the original fragment order.

    Candidates are tried in descending order of
    ``(1 - positional weight) * transcript length``. The first candidate whose
    ``search.find_best`` score exceeds ``(n - 1) / (2 * n)`` is annotated in
    place and becomes the pivot: the fragments before it are matched within
    ``start..match_start`` and the fragments after it within
    ``match_end..end``.

    :param fragments: Sequence of mappings that each provide a "transcript" entry
    :param start: Lower search bound forwarded to ``search.find_best``
    :param end: Upper search bound forwarded to ``search.find_best``
    :return: Yields the annotated fragment for every matched fragment and
             None for every fragment of a sub-list in which no candidate
             passed the threshold
    """
    n = len(fragments)
    if n < 1:
        return

    if n == 1:
        candidates = [(0, fragments[0])]
    else:
        # Keep the original index next to each fragment and score the pairs;
        # per the original author's note this favours long statements near
        # the center of the list.
        scored = [
            (entry, (1 - weight) * len(entry[1]["transcript"]))
            for entry, weight in enweight(enumerate(fragments))
        ]
        # Highest score first, then drop the scores again.
        scored = sorted(scored, key=lambda pair: pair[1], reverse=True)
        candidates = [pair[0] for pair in scored]

    # The acceptance bar only depends on the number of fragments.
    threshold = (n - 1) / (2 * n)

    for index, fragment in candidates:
        best = search.find_best(fragment["transcript"], start=start, end=end)
        match_start, match_end, sws_score, match_substitutions = best
        if sws_score > threshold:
            fragment["match-start"] = match_start
            fragment["match-end"] = match_end
            fragment["sws"] = sws_score
            fragment["substitutions"] = match_substitutions
            # Fragments left of the pivot may only match before it ...
            yield from split_match(fragments[0:index], start=start, end=match_start)
            yield fragment
            # ... and fragments right of the pivot only after it.
            yield from split_match(fragments[index + 1:], start=match_end, end=end)
            return

    # No candidate was good enough: emit one placeholder per fragment.
    for _ in candidates:
        yield None
def weighted_ngrams(s, size, direction=0):
    """
    Decomposes a string into all of its N-grams (left to right, every
    appearance) and pairs each one with a positional weight.
    The positional weight progresses quadratically.

    :param s: String to decompose
    :param size: N-gram size
    :param direction: Order of assigning positional weights to N-grams:
        direction < 0: Weight of first N-gram is 1.0 and of last one 0.0
        direction > 0: Weight of first N-gram is 0.0 and of last one 1.0
        direction == 0: Weight of center N-gram(s) near or equal 0, weight of first and last N-gram 1.0
    :return: Produces (string, float) tuples representing the N-gram along with its assigned positional weight value
    """
    # Split first, then let enweight attach the positional weights.
    grams = ngrams(s, size)
    return enweight(grams, direction=direction)
def split_match(fragments, start=0, end=-1):
    """
    Aligns transcript fragments by divide and conquer, yielding exactly one
    item per input fragment in the original order.

    The fragments are ranked by ``(1 - positional weight) * transcript length``
    and tried best-first. As soon as one of them scores above
    ``(n - 1) / (2 * n)`` in ``search.find_best`` it is annotated in place and
    used to split the problem: everything before it is matched within
    ``start..match_start``, everything after it within ``match_end..end``.

    :param fragments: Sequence of mappings that each provide a "transcript" entry
    :param start: Lower search bound forwarded to ``search.find_best``
    :param end: Upper search bound forwarded to ``search.find_best``
    :return: Yields annotated fragments for matches, and None for each
             fragment of a sub-list where no candidate passed the threshold
    """
    n = len(fragments)
    if n < 1:
        return

    if n == 1:
        ranked = [(0, fragments[0])]
    else:
        # Each entry is ((original index, fragment), score); per the original
        # author's note the score is high for long statements near the
        # center of the list.
        scored = [
            (item[0], (1 - item[1]) * len(item[0][1]["transcript"]))
            for item in enweight(enumerate(fragments))
        ]
        # Stable sort, best score first.
        scored.sort(key=lambda item: item[1], reverse=True)
        # Only the (index, fragment) pairs are needed from here on.
        ranked = [item[0] for item in scored]

    min_score = (n - 1) / (2 * n)

    for position, candidate in ranked:
        found = search.find_best(candidate["transcript"], start=start, end=end)
        match_start, match_end, sws_score, match_substitutions = found
        if sws_score > min_score:
            candidate["match-start"] = match_start
            candidate["match-end"] = match_end
            candidate["sws"] = sws_score
            candidate["substitutions"] = match_substitutions
            # Recurse into the part before the accepted fragment ...
            yield from split_match(fragments[0:position], start=start, end=match_start)
            yield candidate
            # ... and into the part after it.
            yield from split_match(fragments[position + 1:], start=match_end, end=end)
            return

    # Nothing matched well enough: one None placeholder per fragment.
    for _ in ranked:
        yield None
def split_match(fragments, start=0, end=-1):
    """
    Recursively aligns a list of transcript fragments and yields one result
    per fragment, in the original fragment order.

    Candidates are tried in descending order of
    ``(1 - positional weight) * transcript length``. The first candidate whose
    ``search.find_best`` score exceeds ``(n - 1) / (2 * n)`` is annotated in
    place and becomes the pivot: the fragments before it are matched within
    ``start..match_start`` and the fragments after it within
    ``match_end..end``.

    The generator ends with a plain ``return``. Raising ``StopIteration``
    inside a generator body is turned into ``RuntimeError`` by PEP 479
    (default behaviour since Python 3.7).

    :param fragments: Sequence of mappings that each provide a 'transcript' entry
    :param start: Lower search bound forwarded to ``search.find_best``
    :param end: Upper search bound forwarded to ``search.find_best``
    :return: Yields the annotated fragment for every matched fragment and
             None for every fragment of a sub-list in which no candidate
             passed the threshold
    """
    n = len(fragments)
    if n < 1:
        # Empty input: finish the generator without yielding anything.
        return
    elif n == 1:
        weighted_fragments = [(0, fragments[0])]
    else:
        # so we later know the original index of each fragment
        weighted_fragments = enumerate(fragments)
        # assigns high values to long statements near the center of the list
        weighted_fragments = enweight(weighted_fragments)
        weighted_fragments = map(
            lambda fw: (fw[0], (1 - fw[1]) * len(fw[0][1]['transcript'])),
            weighted_fragments)
        # fragments with highest weights first
        weighted_fragments = sorted(weighted_fragments, key=lambda fw: fw[1], reverse=True)
        # strip weights
        weighted_fragments = [fw[0] for fw in weighted_fragments]

    for index, fragment in weighted_fragments:
        match = search.find_best(fragment['transcript'], start=start, end=end)
        match_start, match_end, sws_score, match_substitutions = match
        if sws_score > (n - 1) / (2 * n):
            fragment['match-start'] = match_start
            fragment['match-end'] = match_end
            fragment['sws'] = sws_score
            fragment['substitutions'] = match_substitutions
            # Fragments left of the pivot may only match before it ...
            yield from split_match(fragments[0:index], start=start, end=match_start)
            yield fragment
            # ... and fragments right of the pivot only after it.
            yield from split_match(fragments[index + 1:], start=match_end, end=end)
            # Done: a pivot was found, so skip the None placeholders below.
            return

    # No candidate was good enough: emit one placeholder per fragment.
    for _ in weighted_fragments:
        yield None