def align(seq1, seq2):
    """Encode two raw sequences against one shared vocabulary.

    Despite the name, no alignment is computed here: both inputs are wrapped
    in ``Sequence`` objects and encoded with the same new ``Vocabulary``.

    Returns:
        A ``(seq1_encoded, seq2_encoded, vocabulary)`` tuple.
    """
    first = Sequence(seq1)
    second = Sequence(seq2)
    vocabulary = Vocabulary()
    # seq1 is encoded before seq2, as in the original statement order.
    first_encoded = vocabulary.encodeSequence(first)
    second_encoded = vocabulary.encodeSequence(second)
    return first_encoded, second_encoded, vocabulary
def align(self, seq1, seq2):
    """Globally align seq2 (detected conditions) with seq1 (truth conditions).

    Both sequences are encoded with a shared vocabulary and aligned with a
    ``GlobalSequenceAligner`` (match 2, mismatch -1, gap -2). Only the first
    alignment returned by the aligner is reported and used.

    Args:
        seq1: the truth conditions.
        seq2: the automatically detected conditions.

    Returns:
        list: the elements of the second (detected) row of the first
        alignment, including any gap markers the aligner inserted.

    Raises:
        AssertionError: if that alignment's percent identity is below 97.0.
        ValueError: if the aligner returns no alignment at all.
    """
    print("len(truth_conditions) = {}, len(detected_conditions) = {}".format(len(seq1), len(seq2)))
    from alignment.sequence import Sequence
    from alignment.vocabulary import Vocabulary
    from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

    # Create sequences to be aligned.
    a = Sequence(seq1)
    b = Sequence(seq2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # Report and return the first alignment only.
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print(alignment)
        print('Alignment score: {}'.format(alignment.score))
        print('Percent identity: {}'.format(alignment.percentIdentity()))
        assert alignment.percentIdentity() >= 97.0
        return list(alignment.second)

    # Previously this path failed with an unbound local variable.
    raise ValueError("aligner returned no alignments")
def test_of_signs() -> None:
    """``NamedSequence.of_signs`` exposes the name as ``str`` and the encoded signs."""
    vocab = Vocabulary()
    numeric_name = 1234

    # `signs` and `sequence` are fixtures defined outside this function.
    result = NamedSequence.of_signs(numeric_name, signs, vocab)

    expected_name = str(numeric_name)
    assert result.name == expected_name
    assert result.sequence == vocab.encodeSequence(sequence)
def get_labels(self):
    """Label each slot in the sausage ('O' = correct, 'X' = incorrect)."""
    if self.correct():
        # Everything is correct, so no alignment is needed.
        return ['O'] * self.num_slots()

    # Align the reference with the best hypothesis.
    reference = Sequence(self.ref())
    hypothesis = Sequence(self.best_hyp())
    vocab = Vocabulary()
    ref_encoded = vocab.encodeSequence(reference)
    hyp_encoded = vocab.encodeSequence(hypothesis)
    aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, candidates = aligner.align(ref_encoded, hyp_encoded, backtrace=True)
    best = vocab.decodeSequenceAlignment(candidates[0])

    def slot_is_correct(ref_token, hyp_token):
        # A gap on the reference side paired with an explicit '*DELETE*'
        # hypothesis also counts as correct.
        return ref_token == hyp_token or (ref_token == '-' and hyp_token == '*DELETE*')

    return ['O' if slot_is_correct(r, h) else 'X'
            for r, h in zip(best.first, best.second)]
def test_of_fragment() -> None:
    """``NamedSequence.of_fragment`` is named after the fragment number and holds its encoded signs."""
    vocab = Vocabulary()
    # `signs` and `sequence` are fixtures defined outside this function.
    built_fragment = FragmentFactory.build(signs=signs)

    result = NamedSequence.of_fragment(built_fragment, vocab)

    expected_name = str(built_fragment.number)
    assert result.name == expected_name
    assert result.sequence == vocab.encodeSequence(sequence)
def seqToAlign(a, b, matchScore=3, mismatchScore=-1, gapScore=-2):
    '''
    Helper for finding local alignments between two lists of words.

    args:
        a: indexable whose first item is the list of words (only a[0] is used)
        b: indexable whose first item is the list of words (only b[0] is used)
        matchScore: num, score for matching elements
        mismatchScore: num, score for differing elements
        gapScore: num, score passed to the aligner for gaps
    Returns:
        list of (score, first, second) tuples, one per alignment returned by
        the local aligner; first and second are lists of aligned elements
    '''
    first_words = a[0]
    second_words = b[0]

    # Create a vocabulary and encode the sequences.
    vocabulary = Vocabulary()
    first_encoded = vocabulary.encodeSequence(Sequence(first_words))
    second_encoded = vocabulary.encodeSequence(Sequence(second_words))

    # Create a scoring and align the sequences using local aligner.
    aligner = LocalSequenceAligner(SimpleScoring(matchScore, mismatchScore), gapScore)
    _, encodeds = aligner.align(first_encoded, second_encoded, backtrace=True)

    results = []
    for encoded in encodeds:
        decoded = vocabulary.decodeSequenceAlignment(encoded)
        results.append((decoded.score, list(decoded.first), list(decoded.second)))
    return results
def align(s1, s2):
    """Return the words of s1 that a global alignment pairs with an equal word of s2.

    Both strings are split on whitespace, encoded with a shared vocabulary and
    aligned with ``GlobalSequenceAligner`` (match 2, mismatch -1, gap -2).
    Only the first alignment returned is inspected.
    """
    # Create sequences to be aligned.
    first = Sequence(s1.split())
    second = Sequence(s2.split())

    # Create a vocabulary and encode the sequences.
    vocabulary = Vocabulary()
    first_encoded = vocabulary.encodeSequence(first)
    second_encoded = vocabulary.encodeSequence(second)

    # Create a scoring and align the sequences using global aligner.
    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, encodeds = aligner.align(first_encoded, second_encoded, backtrace=True)
    best = encodeds[0]
    # The decoded alignment is not used below; the call is kept so the
    # function performs exactly the same work as before.
    vocabulary.decodeSequenceAlignment(best)

    matched_words = []
    gaps_in_first = 0
    for position, (left, right) in enumerate(best):
        if left == right:
            # Subtract the gaps seen so far to map the alignment column back
            # to an index in the first sequence.
            matched_words.append(first[position - gaps_in_first])
            continue
        if left == 0:
            # presumably code 0 is the aligner's gap marker; verify against the library
            gaps_in_first += 1
    return matched_words
def get_labels(self):
    """Return one label per aligned slot: 'O' when correct, 'X' when incorrect."""
    # Fast path: the whole sausage is already correct.
    if self.correct():
        return ['O'] * self.num_slots()

    # Encode the reference and the best hypothesis with a shared vocabulary.
    reference = Sequence(self.ref())
    hypothesis = Sequence(self.best_hyp())
    vocabulary = Vocabulary()
    encoded_reference = vocabulary.encodeSequence(reference)
    encoded_hypothesis = vocabulary.encodeSequence(hypothesis)

    # Strict global alignment; only the first returned alignment is used.
    scorer = SimpleScoring(2, -1)
    strict_aligner = StrictGlobalSequenceAligner(scorer, -2)
    _score, encodeds = strict_aligner.align(
        encoded_reference, encoded_hypothesis, backtrace=True)
    alignment = vocabulary.decodeSequenceAlignment(encodeds[0])

    labels = []
    for ref_token, hyp_token in zip(alignment.first, alignment.second):
        if ref_token == hyp_token:
            labels.append('O')
        elif ref_token == '-' and hyp_token == '*DELETE*':
            # A reference gap matched by an explicit deletion is also correct.
            labels.append('O')
        else:
            labels.append('X')
    return labels
def align_ref_long(hyp, ref):
    '''
    Aligns a ref to a sausage-aligned hyp using the alignment library.

    The reference and hypothesis are encoded with a shared vocabulary and
    aligned with a strict global aligner (match 2, mismatch -1, gap -2); only
    the first returned alignment is used.

    Returns a list holding the first-row (hyp-side) token of every alignment
    column whose first-row token is not the gap marker '-', padded at the end
    with ``delete_token`` until it is as long as ``hyp``.
    '''
    # The reference is encoded before the hypothesis, as in the original.
    v = Vocabulary()
    rEncoded = v.encodeSequence(Sequence(ref))
    hEncoded = v.encodeSequence(Sequence(hyp))

    # Create a scoring and align the sequences using global aligner.
    aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
    score, encodeds = aligner.align(hEncoded, rEncoded, backtrace=True)
    alignment = v.decodeSequenceAlignment(encodeds[0])

    # hyp was passed first, so token[0] is the hyp side of each column.
    # The former `if token == '-'` branch could never run because gap tokens
    # are filtered out here; it was removed without changing the result.
    # NOTE(review): given the function's purpose, token[1] (the ref side) may
    # have been intended, with '-' mapped to delete_token - confirm with the
    # callers before changing.
    ref_align = [token[0] for token in alignment if token[0] != '-']

    missing = len(hyp) - len(ref_align)
    if missing > 0:
        ref_align.extend([delete_token] * missing)
    return ref_align
def align(trace1, trace2):
    """Globally align two traces and return both aligned rows as lists.

    The traces are encoded with a shared vocabulary and aligned with
    ``GlobalSequenceAligner`` (match 2, mismatch -1, gap -2). The string form
    of the last alignment returned is cut in half and each half is converted
    with ``align_to_list``.

    Returns:
        tuple: ``(s1, s2)``, the results of ``align_to_list`` for the first
        and second half of the rendered alignment.

    Raises:
        ValueError: if the aligner returns no alignment.
    """
    # Create sequences to be aligned.
    a = Sequence(trace1)
    b = Sequence(trace2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # The last alignment wins, as before.
    rendered = None
    for encoded in encodeds:
        rendered = str(v.decodeSequenceAlignment(encoded))
    if rendered is None:
        raise ValueError("aligner returned no alignments")

    # Convert aligned sequences into lists, escaping multiple characters.
    seq_size = len(rendered)
    # Floor division keeps the midpoint an int on Python 3 as well; plain `/`
    # would hand a float position to align_to_list.
    half_size = seq_size // 2

    # First half
    s1 = align_to_list(rendered, 0, half_size, 4)
    # Second half
    s2 = align_to_list(rendered, half_size, seq_size, 4)

    return s1, s2
def align(sequence1, sequence2):
    """Globally align two word sequences and return the first decoded alignment.

    Literal '-' words are replaced by '<DASH />' before aligning because the
    aligner uses the dash as its gap element.
    """
    def escape_dashes(words):
        return ['<DASH />' if word == '-' else word for word in words]

    # Create sequences to be aligned.
    first = Sequence(escape_dashes(sequence1))
    second = Sequence(escape_dashes(sequence2))

    # Create a vocabulary and encode the sequences.
    vocabulary = Vocabulary()
    first_encoded = vocabulary.encodeSequence(first)
    second_encoded = vocabulary.encodeSequence(second)

    # Create a scoring and align the sequences using global aligner.
    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, encodeds = aligner.align(first_encoded, second_encoded, backtrace=True)

    # Decode the first alignment and return it.
    return vocabulary.decodeSequenceAlignment(encodeds[0])
def score_align(x, y):
    """Return ``1 - best_percent_identity / 100`` for a global alignment of x and y.

    The best percent identity is the largest value over all alignments the
    aligner returns (0.0 when it returns none).
    """
    first = Sequence(x)
    second = Sequence(y)
    vocabulary = Vocabulary()
    first_encoded = vocabulary.encodeSequence(first)
    second_encoded = vocabulary.encodeSequence(second)

    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, encodeds = aligner.align(first_encoded, second_encoded, backtrace=True)

    identities = [vocabulary.decodeSequenceAlignment(e).percentIdentity()
                  for e in encodeds]
    # Seeding with 0.0 reproduces the original running maximum.
    best_identity = max([0.0] + identities)
    return 1 - best_identity/100.0
def align_sequences(seq_a, seq_b):
    """Strict-globally align two sequences and return the first encoded alignment.

    The returned alignment is still encoded; no decoding happens here.
    """
    def escape(items):
        # '-' must be escaped because the alignment library uses it as a gap
        # marker.
        return ['\\-' if item == '-' else item for item in items]

    escaped_a = escape(seq_a)
    escaped_b = escape(seq_b)

    vocabulary = Vocabulary()
    first = vocabulary.encodeSequence(Sequence(escaped_a))
    second = vocabulary.encodeSequence(Sequence(escaped_b))

    scorer = SimpleScoring(matchScore=3, mismatchScore=-1)
    strict_aligner = StrictGlobalSequenceAligner(scorer, gapScore=-2)
    alignments = strict_aligner.align(first, second, backtrace=True)[1]
    return alignments[0]
def score_align(x, y):
    """Compute a dissimilarity for x and y from their global alignment.

    Returns one minus the highest percent identity (as a fraction) among the
    alignments returned by the aligner; the highest identity starts at 0.0.
    """
    seq_x = Sequence(x)
    seq_y = Sequence(y)
    vocab = Vocabulary()
    encoded_x = vocab.encodeSequence(seq_x)
    encoded_y = vocab.encodeSequence(seq_y)

    scorer = SimpleScoring(2, -1)
    global_aligner = GlobalSequenceAligner(scorer, -2)
    _score, encodeds = global_aligner.align(encoded_x, encoded_y, backtrace=True)

    highest = 0.0
    for encoded in encodeds:
        identity = vocab.decodeSequenceAlignment(encoded).percentIdentity()
        if identity > highest:
            highest = identity
    return 1 - highest / 100.0
def getAlignment(timit, utterance):
    """Globally align the flattened ``timit`` items with ``utterance``.

    ``timit`` is an iterable of iterables; its inner items are concatenated
    into one list before aligning (match 2, mismatch -1, gap -2).

    Returns the decoded alignment produced by the last iteration over the
    aligner's results.
    """
    flattened = [ph for li in timit for ph in li]

    vocabulary = Vocabulary()
    timit_encoded = vocabulary.encodeSequence(Sequence(flattened))
    utterance_encoded = vocabulary.encodeSequence(Sequence(utterance))

    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, encodeds = aligner.align(timit_encoded, utterance_encoded, backtrace=True)

    # NOTE(review): the original layout was lost; this assumes the `return`
    # followed the loop (last alignment wins) rather than sitting inside it.
    for encoded in encodeds:
        alignment = vocabulary.decodeSequenceAlignment(encoded)
    return alignment
def align(self, word, error):
    """Align ``word`` with ``error`` and return the rows of the best alignment.

    Uses ``self.aligner`` on both inputs encoded with a fresh vocabulary and
    picks the decoded alignment with the highest score (the first one wins on
    ties).

    Returns:
        tuple: ``(first, second)`` rows of the chosen alignment.
    """
    vocab = Vocabulary()
    word_encoded = vocab.encodeSequence(Sequence(word))
    error_encoded = vocab.encodeSequence(Sequence(error))
    _, encodings = self.aligner.align(word_encoded, error_encoded, backtrace=True)

    # Choose the highest-score alignment.
    best_alignment = None
    best_score = -sys.maxsize
    for encoding in encodings:
        candidate = vocab.decodeSequenceAlignment(encoding)
        if candidate.score > best_score:
            best_alignment, best_score = candidate, candidate.score
    return best_alignment.first, best_alignment.second
def test_align_pair() -> None:
    """Aligning two sequences built from the same signs yields one alignment scored ``match``."""
    vocab = Vocabulary()
    first = NamedSequence.of_signs("name1", "ABZ001", vocab)
    second = NamedSequence.of_signs("name2", "ABZ001", vocab)

    outcome = align_pair(first, second, vocab)

    # `match` is defined outside this function.
    assert outcome.score == match
    assert outcome.a == first
    assert outcome.b == second
    assert len(outcome.alignments) == 1
def test_alignment_result() -> None:
    """``AlignmentResult`` exposes the values it was constructed with."""
    vocab = Vocabulary()
    first = NamedSequence.of_signs("name1", "ABZ001", vocab)
    second = NamedSequence.of_signs("name2", "ABZ002", vocab)
    expected_score = 10
    expected_alignments = []

    outcome = AlignmentResult(expected_score, first, second, expected_alignments)

    assert outcome.score == expected_score
    assert outcome.a == first
    assert outcome.b == second
    assert outcome.alignments == expected_alignments
def getMatchesAlign(queries, ref, tr, cluster_centroids, cluster_sizes, optimal=False, threshold=0.97, verbose=0):
    """
    Calls getMatchAlign for each sequence in queries.

    Every sequence of ``queries`` and ``ref`` (read through their ``Names``
    attribute and ``getSeq`` method) is encoded with one shared vocabulary.
    A single scoring (match 2, mismatch -1) and global aligner (gap -2) are
    created and handed to getMatchAlign together with the remaining
    arguments, which are passed through unchanged.

    This is SLOW!

    returns a dict of {seq_name: result of getMatchAlign, ...}
    (originally documented as a (match_ID, similarity) pair; confirm against
    getMatchAlign)
    """
    # Create a vocabulary and encode the sequences.
    vocabulary = Vocabulary()
    queries_aln = {}
    for seq_name in queries.Names:
        queries_aln[seq_name] = vocabulary.encodeSequence(Sequence(queries.getSeq(seq_name)))
    ref_aln = {}
    for seq_name in ref.Names:
        ref_aln[seq_name] = vocabulary.encodeSequence(Sequence(ref.getSeq(seq_name)))

    # Create a scoring and aligner
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)

    # results dict
    res = {}
    for seq_name in queries.Names:
        if verbose > 0:
            # One pre-joined string prints identically on Python 2 and 3.
            print("Searching seq " + seq_name + ':')
        res[seq_name] = getMatchAlign(queries_aln[seq_name], ref_aln, tr,
                                      cluster_centroids, cluster_sizes,
                                      scoring, aligner, vocabulary,
                                      optimal=optimal, threshold=threshold,
                                      verbose=verbose)
    return res
def test_align() -> None:
    """``align`` returns results whose scores are exactly 16 and then 0 for the given pairs."""
    vocabulary = Vocabulary()
    sequence_1 = NamedSequence.of_signs("name1", "ABZ001", vocabulary)
    sequence_2 = NamedSequence.of_signs("name2", "ABZ001", vocabulary)
    sequence_3 = NamedSequence.of_signs("name3", "ABZ002", vocabulary)
    pairs = [(sequence_1, sequence_3), (sequence_1, sequence_2)]

    result = align(pairs, vocabulary)

    expected = contains_exactly(
        has_properties({"score": 16}),
        has_properties({"score": 0}),
    )
    assert_that(result, expected)
def align_pair(
    first: NamedSequence,
    second: NamedSequence,
    vocabulary: Vocabulary,
) -> AlignmentResult:
    """Align the encoded sequences of two named sequences.

    Scoring comes from ``EblScoring`` built on ``vocabulary``. The result
    bundles the aligner's score, both inputs and every returned alignment
    decoded through ``vocabulary``.
    """
    aligner = GlobalSequenceAligner(EblScoring(vocabulary), True)
    score, encoded_alignments = aligner.align(
        first.sequence, second.sequence, backtrace=True
    )
    decoded_alignments = []
    for encoded in encoded_alignments:
        decoded_alignments.append(vocabulary.decodeSequenceAlignment(encoded))
    return AlignmentResult(score, first, second, decoded_alignments)
def recommendation(name, movies):
    """Order movie names by their global-alignment score against the key words.

    Args:
        name: a string of key words separated by white space
        movies: a list of movie names to choose from

    Returns:
        A list containing every entry of ``movies``, ordered by ascending
        alignment score (ties keep their input order).

    NOTE(review): earlier documentation promised the "top ten nearest"
    matches, but the code neither truncates the list nor puts the highest
    score first - confirm what callers expect before changing it.
    """
    # Create sequences to be aligned.
    key_sequence = Sequence(name.split())
    movie_sequences = [Sequence(movie.split()) for movie in movies]

    # Create a vocabulary and encode the sequences.
    vocabulary = Vocabulary()
    key_encoded = vocabulary.encodeSequence(key_sequence)
    movies_encoded = [vocabulary.encodeSequence(seq) for seq in movie_sequences]

    # Create a scoring and align the sequences using global aligner.
    aligner = GlobalSequenceAligner(SimpleScoring(1, 0), -2)
    scores = []
    for movie_encoded in movies_encoded:
        scores.append(aligner.align(key_encoded, movie_encoded, backtrace=False))

    # Rank movie indices by score, lowest first.
    ranked_indices = sorted(range(len(scores)), key=scores.__getitem__)
    return [movies[index] for index in ranked_indices]
def match_word_sorted(code1, code2):
    """return the max scored alignment between the two input codes

    Both codes are split on single spaces. The words occurring in both are
    ordered once by their first position in ``code1`` and once by their first
    position in ``code2``; the two orderings are then globally aligned.

    Returns the highest score among the returned alignments, or 0 when no
    alignment scores above 0.
    """
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    common_words = set(list1) & set(list2)
    # Splitting on " " yields "" for repeated spaces; discard() removes it
    # without the bare try/except the code used to need.
    common_words.discard("")

    # First-occurrence index of every shared word in each list.
    words_to_index = {word: (list1.index(word), list2.index(word))
                      for word in common_words}
    sorted1 = sorted(words_to_index, key=lambda word: words_to_index[word][0])
    sorted2 = sorted(words_to_index, key=lambda word: words_to_index[word][1])

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)

    max_score = 0
    for encoded in encoders:
        alignment = v.decodeSequenceAlignment(encoded)
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
def align_fragment_and_chapter(
    fragment: Fragment, chapter: Chapter
) -> List[AlignmentResult]:
    """Align a fragment with every chapter manuscript that has clear signs.

    The fragment is paired with one named sequence per entry of
    ``chapter.signs`` accepted by ``has_clear_signs``; each is named after the
    siglum of the manuscript at the same index. All pairs share a vocabulary
    and are passed to ``align``.
    """
    vocabulary = Vocabulary()
    fragment_sequence = NamedSequence.of_fragment(fragment, vocabulary)

    pairs = []
    for index, signs in enumerate(chapter.signs):
        if not has_clear_signs(signs):
            continue
        siglum = chapter.manuscripts[index].siglum
        manuscript_sequence = NamedSequence.of_signs(siglum, signs, vocabulary)
        pairs.append((fragment_sequence, manuscript_sequence))

    return align(pairs, vocabulary)
def match_word_sorted(code1, code2):
    """return the max scored alignment between the two input codes

    Both codes are split on single spaces. For every word occurring in either
    code, the pairs produced by ``index_word_pairs`` are collected per code,
    sorted by their second item, and the two sorted lists are globally
    aligned.

    Returns the highest score among the returned alignments, or 0 when no
    alignment scores above 0.
    """
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    # Union of both vocabularies (not only the shared words).
    all_words = set(list1) | set(list2)
    # Splitting on " " yields "" for repeated spaces; discard() removes it
    # without the bare try/except the code used to need.
    all_words.discard("")

    words1 = []
    words2 = []
    # Iterating in sorted order keeps the result independent of set ordering
    # should two pairs ever share the same sort key.
    for word in sorted(all_words):
        words1 += index_word_pairs(word, list1)
        words2 += index_word_pairs(word, list2)
    # presumably t[1] is the word's position in its list; verify against
    # index_word_pairs
    sorted1 = sorted(words1, key=lambda t: t[1])
    sorted2 = sorted(words2, key=lambda t: t[1])

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)

    max_score = 0
    for encoded in encoders:
        alignment = v.decodeSequenceAlignment(encoded)
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
from alignment.profile import Profile
from alignment.profilealigner import SoftScoring, GlobalProfileAligner

# Example: align two sentences word by word and turn each resulting alignment
# into a sequence profile. Output uses single-argument print(...) calls so it
# is identical on Python 2 and Python 3.

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
print('Sequence A: {}'.format(a))
print('Sequence B: {}'.format(b))
print('')

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
print('Encoded A: {}'.format(aEncoded))
print('Encoded B: {}'.format(bEncoded))
print('')

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, alignments = aligner.align(aEncoded, bEncoded, backtrace=True)

# Create sequence profiles out of alignments.
# The loop variable no longer reuses the name `a`, which on Python 2 would
# overwrite the sequence defined above.
profiles = [Profile.fromSequenceAlignment(alignment) for alignment in alignments]
for encoded in profiles:
    profile = v.decodeProfile(encoded)
def test_utterance_transcriptions(self):
    """Decode every utterance and report where the result differs from the transcript.

    Steps:
      1. In a process pool, run ``compile_utterance_train_graphs_func`` and
         then ``test_utterances_func`` once per corpus job.
      2. For each job, map the integer output in ``aligned.<job>.int`` back to
         words through ``self.dictionary.reversed_word_mapping`` and write
         ``aligned.<job>`` into the trainer's align directory.
      3. Compare each decoded utterance with its reference text from
         ``text.<job>`` and collect inserted and deleted words.
      4. If anything differs, write ``transcription_problems.csv`` to the
         corpus output directory, then print a summary message.
    """
    print('Checking utterance transcriptions...')
    split_directory = self.corpus.split_directory()
    model_directory = self.trainer.align_directory
    with mp.Pool(processes=self.corpus.num_jobs) as pool:
        jobs = [(self, x) for x in range(self.corpus.num_jobs)]
        results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
        # get() waits for each job; the returned values themselves are unused.
        for p in results:
            p.get()
        print('Utterance FSTs compiled!')
        print('Decoding utterances (this will take some time)...')
        results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
        for p in results:
            p.get()
        print('Finished decoding utterances!')

    word_mapping = self.dictionary.reversed_word_mapping
    v = Vocabulary()
    # Scoring and aligner do not depend on the utterance, so build them once.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    errors = {}

    for job in range(self.corpus.num_jobs):
        text_path = os.path.join(split_directory, 'text.{}'.format(job))
        texts = load_scp(text_path)
        aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
        with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
            for utt, line in sorted(aligned_int.items()):
                text = [word_mapping[int(t)] for t in line]
                outf.write('{} {}\n'.format(utt, ' '.join(text)))
                ref_text = texts[utt]
                if len(text) < len(ref_text) - 7:
                    # The decoded text is far shorter than the reference:
                    # use a plain membership comparison instead of aligning.
                    insertions = [x for x in text if x not in ref_text]
                    deletions = [x for x in ref_text if x not in text]
                else:
                    aligned_seq = Sequence(text)
                    ref_seq = Sequence(ref_text)

                    alignedEncoded = v.encodeSequence(aligned_seq)
                    refEncoded = v.encodeSequence(ref_seq)
                    score, encodeds = aligner.align(refEncoded, alignedEncoded, backtrace=True)
                    insertions = []
                    deletions = []
                    # Differences are accumulated over every returned alignment.
                    for encoded in encodeds:
                        alignment = v.decodeSequenceAlignment(encoded)
                        for i, f in enumerate(alignment.first):
                            s = alignment.second[i]
                            if f == '-':
                                # Gap in the reference row: word only in the decoded text.
                                insertions.append(s)
                            if s == '-':
                                # Gap in the decoded row: reference word is missing.
                                deletions.append(f)
                if insertions or deletions:
                    errors[utt] = (insertions, deletions, ref_text, text)

    if not errors:
        message = 'There were no utterances with transcription issues.'
    else:
        out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
        with open(out_path, 'w') as problemf:
            problemf.write('Utterance,Insertions,Deletions,Reference,Decoded\n')
            # Most problematic utterances first. x[1] is
            # (insertions, deletions, ref_text, text); the key previously used
            # indices 1 and 2, i.e. deletions plus reference length.
            for utt, (insertions, deletions, ref_text, text) in sorted(
                    errors.items(),
                    key=lambda x: -1 * (len(x[1][0]) + len(x[1][1]))):
                problemf.write('{},{},{},{},{}\n'.format(utt, ', '.join(insertions), ', '.join(deletions),
                                                         ' '.join(ref_text), ' '.join(text)))
        message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)

    print(self.transcription_analysis_template.format(message))
def of_signs(name, signs: str, vocabulary: Vocabulary) -> "NamedSequence":
    """Create a ``NamedSequence`` called ``name`` from a string of signs.

    The signs are converted with ``make_sequence`` and then encoded through
    ``vocabulary``.
    """
    sign_sequence = make_sequence(signs)
    encoded = vocabulary.encodeSequence(sign_sequence)
    return NamedSequence(name, encoded)
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
from alignment.profile import Profile
from alignment.profilealigner import SoftScoring, GlobalProfileAligner

# Example: align two sentences word by word, turn each resulting alignment
# into a sequence profile and print the decoded profiles. Output uses
# single-argument print(...) calls so it is identical on Python 2 and 3.

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
print('Sequence A: {}'.format(a))
print('Sequence B: {}'.format(b))
print('')

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
print('Encoded A: {}'.format(aEncoded))
print('Encoded B: {}'.format(bEncoded))
print('')

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, alignments = aligner.align(aEncoded, bEncoded, backtrace=True)

# Create sequence profiles out of alignments.
# The loop variable no longer reuses the name `a`, which on Python 2 would
# overwrite the sequence defined above.
profiles = [Profile.fromSequenceAlignment(alignment) for alignment in alignments]
for encoded in profiles:
    profile = v.decodeProfile(encoded)
    print(profile)
########## SIMPLEST #######
import regex

# Fuzzy search for the whole word "amazing" followed by whitespace, allowing
# fewer than two errors ({e<2}). The match object is not used.
regex.search(r'\b(amazing){e<2}\s', 'is life amazing lie ao a')

from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('amazing'.split())
b = Sequence('what a amazing disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
# Single-argument print(...) calls give the same output on Python 2 and 3.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score: {}'.format(alignment.score))
    print('Percent identity: {}'.format(alignment.percentIdentity()))
    print('')
def __init__(self, vocabulary: Vocabulary):
    """Store the vocabulary and the codes of two special signs.

    ``line_break`` and ``x`` hold the vocabulary codes of ``LINE_BREAK`` and
    ``UNCLEAR_OR_UNKNOWN_SIGN`` respectively.
    """
    encode = vocabulary.encode
    self.vocabulary = vocabulary
    # LINE_BREAK is encoded before UNCLEAR_OR_UNKNOWN_SIGN; the order of the
    # two calls is kept unchanged.
    self.line_break = encode(LINE_BREAK)
    self.x = encode(UNCLEAR_OR_UNKNOWN_SIGN)
def ScorePhonemes(self, source=None, target=None):
    """Compare the phonemes of a source and target sentence and determine
    which of the target items were correctly transcribed.

    Args:
        source: per-sentence lists of (phoneme, word number, word) tuples;
            falls back to ``self.source_phonemes`` when empty or omitted.
        target: same structure for the target sentences; falls back to
            ``self.target_phonemes`` when empty or omitted.

    Sets:
        self.hits_phonemes (nested list): list of bools corresponding to the
            accuracy of each phoneme in the target list for each sentence
        self.source_matched (nested list): per sentence, the source phonemes
            laid out against the target positions ('-' for every position
            when the source sentence is empty)

    Note:
        This scoring method has no word accuracy awareness. Phonemes from
        correctly input words may wind up labeled wrong.
        Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
    """
    # None replaces the former mutable [] defaults; an explicitly passed empty
    # list still falls back to the stored phonemes exactly as before.
    if not source:
        source = self.source_phonemes
    if not target:
        target = self.target_phonemes

    self.source_matched = []
    hits = []
    for x, ttup in enumerate(target):
        tphon, twordnum, tword = zip(*ttup)
        stup = source[x]
        if not stup:
            # Nothing was transcribed for this sentence: every target
            # phoneme is a miss.
            hitlist = [False] * len(tphon)
            bPhonOut = ['-'] * len(tphon)
        else:
            sphon, swordnum, sword = zip(*stup)

            # Create sequences to be aligned.
            a = Sequence(tphon)
            b = Sequence(sphon)

            # Create a vocabulary and encode the sequences.
            v = Vocabulary()
            aEncoded = v.encodeSequence(a)
            bEncoded = v.encodeSequence(b)

            # Create a scoring and align the sequences using global aligner.
            scoring = SimpleScoring(2, -1)
            aligner = GlobalSequenceAligner(scoring, -2)
            score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
            encoded = encodeds[0]

            # Score based only on hits vs misses, insertions are ignored
            # NOTE(review): the indexing below looks like it expects
            # array-like encoded alignments - confirm against the installed
            # alignment version.
            notInsert = encoded[:][0] != 0
            nonInsertMatched = encoded[notInsert][:]

            # Find the alignment in the target sequence
            aSeq = nonInsertMatched[:][0]
            bSeq = nonInsertMatched[:][1]

            # Label all items not aligned to the target as false
            hitlist = []
            y = 0
            for y in range(0, len(aEncoded) - len(aSeq) + 1):
                aChunk = aEncoded[y:y + len(aSeq)]
                if sum(aChunk - aSeq) == 0:
                    break
            hitlist.extend([False] * (y))
            hitlist.extend(list(aSeq - bSeq == 0))
            hitlist.extend([False] * (len(aEncoded) - y - len(aSeq)))

            # Export the target aligned phonemes of the source sequence
            bPhons = np.zeros(len(aEncoded), int)
            bPhons[y:y + len(bSeq)] = bSeq
            bPhonOut = np.array(v.elements())[bPhons].tolist()

        hits.append(hitlist)
        self.source_matched.append(bPhonOut)
    self.hits_phonemes = hits
def ScoreWords(self):
    """Aligns the words of the source sentence to match the target sentence
    to determine hit vs missed words.

    Reads ``self.target`` and ``self.source`` (parallel lists of sentences)
    and, per sentence pair, aligns the whitespace-split words with a global
    aligner (match 5, mismatch -1, gap -1).

    Sets:
        self.hits (nested list): per sentence, one bool per target word
        self.wscore (array): per sentence, the percentage of hit words
        self.source_matchWords (nested list): per sentence, the source words
            laid out against the target positions

    Note:
        Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
    """
    target = self.target
    source = self.source
    self.source_matchWords = []
    hits = []
    wscore = np.empty(0)
    for tnum, tsent in enumerate(target):
        ssent = source[tnum]

        # Create sequences to be aligned.
        a = Sequence(tsent.split())
        b = Sequence(ssent.split())

        # Create a vocabulary and encode the sequences.
        v = Vocabulary()
        aEncoded = v.encodeSequence(a)
        bEncoded = v.encodeSequence(b)

        # Create a scoring and align the sequences using global aligner.
        scoring = SimpleScoring(5, -1)
        aligner = GlobalSequenceAligner(scoring, -1)
        score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
        encoded = encodeds[0]

        # Score based only on hits vs misses, insertions are ignored
        # NOTE(review): the indexing below looks like it expects array-like
        # encoded alignments - confirm against the installed alignment version.
        notInsert = encoded[:][0] != 0
        nonInsertMatched = encoded[notInsert][:]

        # Find the alignment in the target sequence
        aSeq = nonInsertMatched[:][0]
        bSeq = nonInsertMatched[:][1]

        # Label all items not aligned to the target as false
        hitlist = []
        x = 0
        for x in range(0, len(aEncoded) - len(aSeq) + 1):
            aChunk = aEncoded[x:x + len(aSeq)]
            if sum(aChunk - aSeq) == 0:
                break
        hitlist.extend([False] * (x))
        hitlist.extend(list(aSeq - bSeq == 0))
        hitlist.extend([False] * (len(aEncoded) - x - len(aSeq)))

        # Export the target aligned words of the source sequence
        bWords = np.zeros(len(aEncoded), int)
        bWords[x:x + len(bSeq)] = bSeq
        bWordOut = np.array(v.elements())[bWords].tolist()

        hits.append(hitlist)
        iwscore = sum(hitlist) * 100 / float(len(hitlist))
        wscore = np.hstack([wscore, iwscore])
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(bWordOut)
        self.source_matchWords.append(bWordOut)
    self.hits = hits
    self.wscore = wscore
def __iter__(self):
    """Iterate over the stored elements, converting each one with ``int``."""
    elements = self.elements
    return (int(element) for element in elements)


# Tests -----------------------------------------------------------------------

if __name__ == '__main__':
    # Small demo: build two word sequences, encode them with one vocabulary,
    # align them globally and decode both rows of every alignment.
    s1 = Sequence('what a beautiful day'.split())
    s2 = Sequence('what a disappointingly bad day'.split())
    print('s1', s1)
    print('s2', s2)
    print('')

    from alignment.vocabulary import Vocabulary
    v = Vocabulary()
    e1 = v.encodeSequence(s1)
    e2 = v.encodeSequence(s2)
    print('v', v)
    print('e1', e1)
    print('e2', e2)
    print('')

    from alignment.sequencealigner import SimpleScoring
    from alignment.sequencealigner import GlobalSequenceAligner
    s = SimpleScoring(2, -1)
    a = GlobalSequenceAligner(s, -2)
    score, alignments = a.align(e1, e2, backtrace=True)
    for alignment in alignments:
        as1 = v.decodeSequence(alignment.first)
        as2 = v.decodeSequence(alignment.second)
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Example: globally align two sentences word by word and print every
# alignment the aligner returns, with its score and percent identity.

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
# Single-argument print(...) calls give the same output on Python 2 and 3.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score: {}'.format(alignment.score))
    print('Percent identity: {}'.format(alignment.percentIdentity()))
    print('')
# Distances selected by `triu_inds`, and their median.
all_path_dists = pdist[triu_inds]
med = np.median(all_path_dists)

# %% [markdown]
# ##

# from skbio.sequence import Sequence
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary

# Wrap every path in a Sequence and encode them all with one vocabulary.
seqs = []
for p in paths:
    s = Sequence(p)
    seqs.append(s)

v = Vocabulary()
encoded_seqs = list(map(v.encodeSequence, seqs))


class SimpleScoring:
    """Callable scoring: one fixed score for equal elements, another for unequal ones."""

    def __init__(self, matchScore, mismatchScore):
        self.matchScore = matchScore
        self.mismatchScore = mismatchScore

    def __call__(self, firstElement, secondElement):
        """Return ``matchScore`` when both elements are equal, else ``mismatchScore``."""
        same = firstElement == secondElement
        return self.matchScore if same else self.mismatchScore
sequence_family = np.array(sequence_family) name = [] spilt_pos = [] for i in range(len(sequence_family)): if sequence_family[i][0][0] == '[': name.append(sequence_family[i][0][1:-1]) spilt_pos.append(i) sequence = [] for i in spilt_pos: ss = sequence_family[i + 1][0] for ii in range(i + 2, i + 9): ss = ss + sequence_family[ii][0] sequence.append(ss) #%% v = Vocabulary() sequence_encoded = [] for i in range(len(sequence)): sequence_encoded.append( v.encodeSequence(Sequence(split_sequence(sequence[i])))) scoring = SimpleScoring(2, -1) aligner = GlobalSequenceAligner(scoring, -2) Matrix = np.zeros(9 * 9).reshape(9, 9) for i in range(len(sequence_encoded)): for j in range(i + 1, len(sequence_encoded)): score, encodeds = aligner.align(sequence_encoded[i], sequence_encoded[j], backtrace=True) for encoded in encodeds:
def _pad_alignment_edge(words, own, other, leading):
    """Re-attach words missing from one end of an aligned row.

    ``words`` is scanned from its start (``leading=True``) or from its end
    until a word equal to the current edge element of ``own`` is found. The
    words passed over before that point are attached to that end of
    ``own.elements`` and the same number of '-' gap markers to
    ``other.elements``. If no word equals the edge element, both rows are left
    untouched.
    """
    anchor = own.elements[0] if leading else own.elements[-1]
    skipped = []
    for word in (words if leading else reversed(words)):
        if word != anchor:
            skipped.append(word)
            continue
        gaps = ['-'] * len(skipped)
        if leading:
            own.elements = skipped + own.elements
            other.elements = gaps + other.elements
        else:
            # Words were collected back to front; restore their text order.
            skipped.reverse()
            own.elements = own.elements + skipped
            other.elements = other.elements + gaps
        return


def text_to_text_alignment_and_score(text_ref, text_pred):
    """
    Find a word to word alignment between two texts, considering the first is
    the reference and the second the predicted.

    Both texts are lower-cased. The reference alone is additionally passed
    through the translation table built from "." and "," by
    ``to_translation_map``. When the aligner yields a non-empty first
    alignment, words left off either end of either row are re-attached
    opposite '-' gap markers.

    :param text_ref: text reference
    :param text_pred: predicted text
    :return: ``(alignment, rec, pre)`` where ``rec`` is the alignment score
        times 100 divided by the number of reference words and ``pre`` the
        same divided by the number of predicted words; ``([], 0, 0)`` when
        there is no usable alignment
    """
    text_ref = text_ref.lower()
    text_pred = text_pred.lower()
    iterable = [".", ","]
    # convert the reference text in order not to contain "," and "." (junk characters)
    translation_map = str.maketrans(to_translation_map(iterable))
    text_ref = text_ref.translate(translation_map)
    ref_words = text_ref.split()
    pred_words = text_pred.split()

    # Create sequences to be aligned.
    a = Sequence(text_ref.split())
    b = Sequence(text_pred.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    a_enc = v.encodeSequence(a)
    b_enc = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, 0)
    f, score, encodeds = aligner.align(a_enc, b_enc, text_ref.split(), text_pred.split(), backtrace=True)

    print(encodeds)
    # get the first alignment if it exists; an empty result list no longer
    # raises IndexError
    if len(encodeds) > 0 and len(encodeds[0]) > 0:
        alignment = v.decodeSequenceAlignment(encodeds[0])
        print(alignment)

        # fix first and last missing words of asr text
        _pad_alignment_edge(pred_words, alignment.second, alignment.first, leading=True)
        _pad_alignment_edge(pred_words, alignment.second, alignment.first, leading=False)

        # fix first and last missing words of reference text
        _pad_alignment_edge(ref_words, alignment.first, alignment.second, leading=True)
        # This step previously rebuilt its word list from the gap list
        # (`[word] + list_asr`), dropping trailing reference words already
        # collected; the shared helper keeps all of them.
        _pad_alignment_edge(ref_words, alignment.first, alignment.second, leading=False)

        print(alignment)
        rec = alignment.score * 100 / len(ref_words)
        pre = alignment.score * 100 / len(pred_words)
    else:
        alignment = []
        rec, pre = 0, 0
    return alignment, rec, pre
# Fetch the second and third lyrics rows (lyrics1 is fetched earlier).
cur.execute(selectStatement2)
lyrics2 = cur.fetchone()
cur.execute(selectStatement3)
lyrics3 = cur.fetchone()

scoring = SimpleScoring(2, -2)
aligner = LocalSequenceAligner(scoring, -2)

# Split the first column of each row into words.
a = Sequence(lyrics1[0].split(" "))
b = Sequence(lyrics2[0].split(" "))
c = Sequence(lyrics3[0].split(" "))

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
cEncoded = v.encodeSequence(c)

print("RUN DMC VS BIGGIE SMALLS")

# Align the first and third lyrics using the local aligner.
score, encodeds = aligner.align(aEncoded, cEncoded, backtrace=True)

# Print the first alignment, if there is one.
# NOTE(review): the original had a dangling `if` here (a syntax error); it is
# assumed to have been a guard against an empty result - confirm.
if encodeds:
    alignment = v.decodeSequenceAlignment(encodeds[0])
    # Single-argument print(...) calls give the same output on Python 2 and 3.
    print(alignment)
    print('Alignment score: {}'.format(alignment.score))
    print('Percent identity: {}'.format(alignment.percentIdentity()))