Пример #1
0
def align(seq1, seq2):
  """Encode two raw sequences against one shared vocabulary.

  Despite the name, no alignment is computed here: both inputs are wrapped
  in ``Sequence`` objects and encoded with the same fresh ``Vocabulary``.

  Returns:
    A tuple ``(encoded_seq1, encoded_seq2, vocabulary)``.
  """
  wrapped_first, wrapped_second = Sequence(seq1), Sequence(seq2)
  vocabulary = Vocabulary()
  encoded_first = vocabulary.encodeSequence(wrapped_first)
  encoded_second = vocabulary.encodeSequence(wrapped_second)
  return encoded_first, encoded_second, vocabulary
Пример #2
0
	def align(self, seq1, seq2):
		"""
		Align seq2 (automatically detected conditions) with seq1 (truth conditions).

		Both inputs are encoded with one shared vocabulary and globally aligned
		(match +2, mismatch -1, gap -2). Only the first alignment returned by the
		aligner is used; it is printed together with its score and percent
		identity.

		Returns:
			list: the second row of that alignment (seq2 as aligned against seq1).

		Raises:
			ValueError: if the aligner returns no alignment at all.
			AssertionError: if the percent identity is below 97.0.
		"""
		print("len(truth_conditions) = {}, len(detected_conditions) = {}".format(len(seq1), len(seq2)))
		from alignment.sequence import Sequence
		from alignment.vocabulary import Vocabulary
		from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

		# Create a vocabulary and encode the sequences to be aligned.
		v = Vocabulary()
		aEncoded = v.encodeSequence(Sequence(seq1))
		bEncoded = v.encodeSequence(Sequence(seq2))

		# Create a scoring and align the sequences using global aligner.
		scoring = SimpleScoring(2, -1)
		aligner = GlobalSequenceAligner(scoring, -2)
		_, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

		# Only the first optimal alignment is inspected. Fail with a clear
		# message instead of an UnboundLocalError when there is none.
		for encoded in encodeds:
			break
		else:
			raise ValueError("aligner returned no alignment")

		alignment = v.decodeSequenceAlignment(encoded)
		identity = alignment.percentIdentity()
		# str.format keeps the printed text identical on Python 2 and Python 3.
		print(alignment)
		print('Alignment score: {}'.format(alignment.score))
		print('Percent identity: {}'.format(identity))

		# Raised explicitly so the check is not stripped when running with -O.
		if not identity >= 97.0:
			raise AssertionError(
				"percent identity {} is below the required 97.0".format(identity))
		return list(alignment.second)
Пример #3
0
def test_of_signs() -> None:
    """``NamedSequence.of_signs`` exposes the name as a string and the encoded sequence."""
    numeric_name = 1234
    vocab = Vocabulary()
    result = NamedSequence.of_signs(numeric_name, signs, vocab)

    expected_name = str(numeric_name)
    assert result.name == expected_name
    assert result.sequence == vocab.encodeSequence(sequence)
Пример #4
0
    def get_labels(self):
        """Label each slot in the sausage ('O' = correct, 'X' = incorrect).

        When ``self.correct()`` is true every slot is labelled 'O'. Otherwise
        the reference is aligned with the best hypothesis (strict global
        aligner, match +2, mismatch -1, gap -2) and the first alignment is
        labelled column by column.
        """
        if self.correct():
            return ['O'] * self.num_slots()

        ref_seq = Sequence(self.ref())
        hyp_seq = Sequence(self.best_hyp())
        vocab = Vocabulary()
        ref_codes = vocab.encodeSequence(ref_seq)
        hyp_codes = vocab.encodeSequence(hyp_seq)
        aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
        _, candidates = aligner.align(ref_codes, hyp_codes, backtrace=True)
        best = vocab.decodeSequenceAlignment(candidates[0])

        def is_correct(ref_tok, hyp_tok):
            # Correct on an exact match, or when a '-' on the reference side
            # lines up with the '*DELETE*' placeholder on the hypothesis side.
            if ref_tok == hyp_tok:
                return True
            return ref_tok == '-' and hyp_tok == '*DELETE*'

        return ['O' if is_correct(ref_tok, hyp_tok) else 'X'
                for ref_tok, hyp_tok in zip(best.first, best.second)]
Пример #5
0
def test_of_fragment() -> None:
    """``NamedSequence.of_fragment`` is named after the fragment number and carries the encoded sequence."""
    vocab = Vocabulary()
    built_fragment = FragmentFactory.build(signs=signs)
    result = NamedSequence.of_fragment(built_fragment, vocab)

    expected_name = str(built_fragment.number)
    assert result.name == expected_name
    assert result.sequence == vocab.encodeSequence(sequence)
def seqToAlign(a, b, matchScore=3, mismatchScore=-1, gapScore=-2):
    '''
    Locally align the word lists stored in ``a[0]`` and ``b[0]``.

    args:
        a: indexable whose first item is the first list of words
        b: indexable whose first item is the second list of words
        matchScore: score given to matching elements
        mismatchScore: score given to mismatching elements
        gapScore: gap score handed to the local aligner
    Returns:
        list of ``(score, first, second)`` tuples, one per alignment returned
        by the aligner; ``first`` and ``second`` are plain lists
    '''
    first_words, second_words = a[0], b[0]

    # Wrap both word lists and encode them with one shared vocabulary.
    first_seq = Sequence(first_words)
    second_seq = Sequence(second_words)
    vocab = Vocabulary()
    first_codes = vocab.encodeSequence(first_seq)
    second_codes = vocab.encodeSequence(second_seq)

    # Local alignment with the caller-supplied scores.
    local_aligner = LocalSequenceAligner(
        SimpleScoring(matchScore, mismatchScore), gapScore)
    _, encoded_alignments = local_aligner.align(
        first_codes, second_codes, backtrace=True)

    decoded_alignments = [vocab.decodeSequenceAlignment(encoded)
                          for encoded in encoded_alignments]
    results = []
    for decoded in decoded_alignments:
        results.append((decoded.score, list(decoded.first), list(decoded.second)))
    return results
Пример #7
0
def align(s1, s2):
    """Return the words of ``s1`` that line up with an identical word of ``s2``.

    Both strings are split on whitespace, encoded with a shared vocabulary
    and globally aligned (match +2, mismatch -1, gap -2); only the first
    alignment is examined.
    """
    first = Sequence(s1.split())
    second = Sequence(s2.split())

    vocab = Vocabulary()
    first_codes = vocab.encodeSequence(first)
    second_codes = vocab.encodeSequence(second)

    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, candidates = aligner.align(first_codes, second_codes, backtrace=True)
    top = candidates[0]
    # The decoded alignment is not used below; the call is kept as-is.
    vocab.decodeSequenceAlignment(top)

    matched = []
    gap_columns = 0  # columns so far whose first-side code was 0 (presumably the gap code)
    for column, pair in enumerate(top):
        first_code, second_code = pair
        if first_code == second_code:
            # Subtracting the gap columns maps the column back into ``first``.
            matched.append(first[column - gap_columns])
            continue
        if first_code == 0:
            gap_columns += 1

    return matched
Пример #8
0
    def get_labels(self):
        """Return one label per sausage slot: 'O' for correct, 'X' for incorrect.

        If ``self.correct()`` holds, all ``self.num_slots()`` slots are 'O'.
        Otherwise the labels come from the first strict global alignment of
        the reference against the best hypothesis.
        """
        if self.correct():
            all_ok = ['O'] * self.num_slots()
            return all_ok

        # Build and encode the two sequences with a shared vocabulary.
        reference = Sequence(self.ref())
        hypothesis = Sequence(self.best_hyp())
        vocabulary = Vocabulary()
        reference_encoded = vocabulary.encodeSequence(reference)
        hypothesis_encoded = vocabulary.encodeSequence(hypothesis)

        strict_aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
        _, encoded_alignments = strict_aligner.align(
            reference_encoded, hypothesis_encoded, backtrace=True)
        decoded = vocabulary.decodeSequenceAlignment(encoded_alignments[0])

        # A column counts as correct on an exact match, or when '-' on the
        # reference side meets the '*DELETE*' placeholder.
        labels = []
        for ref_tok, hyp_tok in zip(decoded.first, decoded.second):
            verdict = 'O' if (ref_tok == hyp_tok
                              or (ref_tok == '-' and hyp_tok == '*DELETE*')) else 'X'
            labels.append(verdict)
        return labels
Пример #9
0
def align(s1, s2):
    """Collect the words of ``s1`` that the first global alignment pairs with an equal word of ``s2``."""
    # Tokenise on whitespace and wrap for the aligner.
    left_seq, right_seq = Sequence(s1.split()), Sequence(s2.split())

    # One vocabulary for both sides so equal words get equal codes.
    v = Vocabulary()
    left_codes = v.encodeSequence(left_seq)
    right_codes = v.encodeSequence(right_seq)

    # Global alignment: match +2, mismatch -1, gap -2.
    scoring = SimpleScoring(2, -1)
    score, encodeds = GlobalSequenceAligner(scoring, -2).align(
        left_codes, right_codes, backtrace=True)
    chosen = encodeds[0]
    # Decoded form is unused below; the call is retained from the original.
    v.decodeSequenceAlignment(chosen)

    correct_words = []
    skipped = 0
    for position, (left, right) in enumerate(chosen):
        if left != right:
            # Code 0 on the left presumably marks a gap: it consumes a column
            # without consuming a word of the left sequence.
            if left == 0:
                skipped += 1
            continue
        correct_words.append(left_seq[position - skipped])

    return correct_words
Пример #10
0
def align_ref_long(hyp, ref):
    ''' Align a ref to a sausage-aligned hyp using the alignment library.

        Returns a list built from the first alignment returned by the strict
        global aligner, padded with ``delete_token`` (a name defined outside
        this function) according to ``len(hyp)``.

        NOTE(review): the aligner is called as ``align(hEncoded, rEncoded)``,
        so ``token[0]`` below presumably comes from the hypothesis side rather
        than the reference -- confirm this is intended.
    '''
    
    # align ref to hyp
    sr = Sequence(ref)
    sh = Sequence(hyp)
    
    # Shared vocabulary so identical tokens get identical codes.
    v = Vocabulary()
    rEncoded = v.encodeSequence(sr)
    hEncoded = v.encodeSequence(sh)
    
    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = StrictGlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(hEncoded, rEncoded, backtrace=True)
    

    # Decode only the first alignment and keep the first element of every
    # column whose first element is not '-'.
    alignment = v.decodeSequenceAlignment(encodeds[0])
    ref_align_raw = [token[0] for token in alignment if token[0] != '-']
    
    ref_align  = []
    for token in ref_align_raw:
        # NOTE(review): '-' tokens were already filtered out above, so this
        # branch cannot be taken as written.
        if token == '-':
            ref_align.append(delete_token)
        else:
            ref_align.append(token)
            
    # Pad with delete_token once per element by which hyp is longer than the
    # filtered token list (no padding when the difference is <= 0).
    for i in range(len(hyp) - len(ref_align_raw)):
        ref_align.append(delete_token)
        
    return ref_align
def align(trace1, trace2):
    """Globally align two traces and return both aligned rows as lists.

    The traces are encoded with a shared vocabulary and aligned with a global
    aligner (match +2, mismatch -1, gap -2). The string form of the last
    alignment returned is cut in half and each half is converted with
    ``align_to_list``.

    Returns:
        tuple: ``(s1, s2)``, the converted first and second half.

    Raises:
        ValueError: if the aligner returns no alignment.
    """
    # Create a vocabulary and encode the sequences to be aligned.
    v = Vocabulary()
    aEncoded = v.encodeSequence(Sequence(trace1))
    bEncoded = v.encodeSequence(Sequence(trace2))

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    _, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # As before, the last alignment wins. A dedicated local name avoids
    # shadowing this function, and an empty result fails with a clear error.
    aligned_text = None
    for encoded in encodeds:
        aligned_text = str(v.decodeSequenceAlignment(encoded))
    if aligned_text is None:
        raise ValueError("aligner returned no alignment")

    # Floor division keeps the split point an int on Python 3 as well
    # (plain '/' would produce a float there).
    seq_size = len(aligned_text)
    half_size = seq_size // 2

    # Convert each half of the rendered alignment into a list.
    s1 = align_to_list(aligned_text, 0, half_size, 4)
    s2 = align_to_list(aligned_text, half_size, seq_size, 4)

    return s1, s2
Пример #12
0
def align(sequence1, sequence2):
    """Globally align two word sequences and return the first alignment.

    A literal '-' word is replaced by '<DASH />' beforehand, because the
    aligner uses the dash as its gap element.
    """
    def escape_dashes(words):
        return ['<DASH />' if token == '-' else token for token in words]

    escaped_first = escape_dashes(sequence1)
    escaped_second = escape_dashes(sequence2)

    # Wrap and encode both sides with one shared vocabulary.
    first = Sequence(escaped_first)
    second = Sequence(escaped_second)
    vocab = Vocabulary()
    first_codes = vocab.encodeSequence(first)
    second_codes = vocab.encodeSequence(second)

    # Global alignment: match +2, mismatch -1, gap -2.
    global_aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, candidates = global_aligner.align(first_codes, second_codes, backtrace=True)

    return vocab.decodeSequenceAlignment(candidates[0])
Пример #13
0
def score_align(x, y):
    """Return ``1 - best_percent_identity / 100`` over all global alignments of x and y.

    The best percent identity starts at 0.0, so the result is 1 when the
    aligner returns no alignment.
    """
    first, second = Sequence(x), Sequence(y)
    vocab = Vocabulary()
    first_codes = vocab.encodeSequence(first)
    second_codes = vocab.encodeSequence(second)
    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, encoded_alignments = aligner.align(first_codes, second_codes, backtrace=True)
    identities = [vocab.decodeSequenceAlignment(encoded).percentIdentity()
                  for encoded in encoded_alignments]
    best_identity = max([0.0] + identities)
    return 1 - best_identity/100.0
Пример #14
0
def align_sequences(seq_a, seq_b):
    """Return the first encoded strict-global alignment of two sequences.

    '-' elements are escaped to '\\-' first because the alignment library
    uses '-' as its gap marker. Scoring: match +3, mismatch -1, gap -2.
    """
    def escape(tokens):
        escaped = []
        for token in tokens:
            escaped.append('\\-' if token == '-' else token)
        return escaped

    escaped_a, escaped_b = escape(seq_a), escape(seq_b)

    vocab = Vocabulary()
    codes_a = vocab.encodeSequence(Sequence(escaped_a))
    codes_b = vocab.encodeSequence(Sequence(escaped_b))

    strict_aligner = StrictGlobalSequenceAligner(
        SimpleScoring(matchScore=3, mismatchScore=-1), gapScore=-2)
    _, encoded_alignments = strict_aligner.align(codes_a, codes_b, backtrace=True)
    first_alignment = encoded_alignments[0]
    return first_alignment
 def score_align(x, y):
     """Distance between x and y: 1 minus the highest percent identity (as a fraction) among their global alignments."""
     seq_x = Sequence(x)
     seq_y = Sequence(y)
     vocabulary = Vocabulary()
     codes_x = vocabulary.encodeSequence(seq_x)
     codes_y = vocabulary.encodeSequence(seq_y)
     global_aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
     result = global_aligner.align(codes_x, codes_y, backtrace=True)
     _, candidates = result
     best = 0.0
     for candidate in candidates:
         identity = vocabulary.decodeSequenceAlignment(candidate).percentIdentity()
         # Keep the running best; only a strictly larger value replaces it.
         if identity > best:
             best = identity
     return 1 - best / 100.0
Пример #16
0
def getAlignment(timit, utterance):
    """Align the flattened ``timit`` items against ``utterance``.

    ``timit`` is an iterable of iterables; its inner items are concatenated
    in order. Returns the first decoded global alignment (match +2,
    mismatch -1, gap -2), or ``None`` when the aligner returns none.
    """
    flattened = [ph for li in timit for ph in li]
    first_seq = Sequence(flattened)
    second_seq = Sequence(utterance)
    vocab = Vocabulary()
    first_codes = vocab.encodeSequence(first_seq)
    second_codes = vocab.encodeSequence(second_seq)
    global_aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, candidates = global_aligner.align(first_codes, second_codes, backtrace=True)
    iterator = iter(candidates)
    try:
        first_candidate = next(iterator)
    except StopIteration:
        return None
    return vocab.decodeSequenceAlignment(first_candidate)
Пример #17
0
    def align(self, word, error):
        """Align ``word`` with ``error`` using ``self.aligner``.

        Returns the ``(first, second)`` rows of the highest-scoring decoded
        alignment; on ties the earliest one wins.
        """
        vocabulary = Vocabulary()
        word_codes = vocabulary.encodeSequence(Sequence(word))
        error_codes = vocabulary.encodeSequence(Sequence(error))
        _, encodings = self.aligner.align(word_codes, error_codes, backtrace=True)

        decoded = [vocabulary.decodeSequenceAlignment(encoding)
                   for encoding in encodings]
        # Only alignments scoring above -sys.maxsize are eligible, as before.
        eligible = [candidate for candidate in decoded
                    if candidate.score > -sys.maxsize]
        # max() returns the first maximal item, matching a strict '>' scan.
        winner = max(eligible, key=lambda candidate: candidate.score) if eligible else None

        return winner.first, winner.second
def test_align_pair() -> None:
    """Aligning two identical sign sequences yields the ``match`` score and exactly one alignment."""
    vocab = Vocabulary()
    left = NamedSequence.of_signs("name1", "ABZ001", vocab)
    right = NamedSequence.of_signs("name2", "ABZ001", vocab)

    outcome = align_pair(left, right, vocab)

    assert outcome.score == match
    assert outcome.a == left
    assert outcome.b == right
    alignment_count = len(outcome.alignments)
    assert alignment_count == 1
def test_alignment_result() -> None:
    """``AlignmentResult`` exposes the four constructor arguments unchanged."""
    vocab = Vocabulary()
    left = NamedSequence.of_signs("name1", "ABZ001", vocab)
    right = NamedSequence.of_signs("name2", "ABZ002", vocab)
    expected_score = 10
    expected_alignments = []

    outcome = AlignmentResult(expected_score, left, right, expected_alignments)

    assert outcome.score == expected_score
    assert outcome.a == left
    assert outcome.b == right
    assert outcome.alignments == expected_alignments
Пример #20
0
def getMatchesAlign(queries, ref, tr, cluster_centroids,
		cluster_sizes, optimal=False, threshold=0.97, verbose=0):
	""" Calls getMatchAlign for each sequence in queries.

		queries and ref are Alignment objects; only their ``Names`` attribute
		and ``getSeq(name)`` method are used here. All remaining arguments are
		passed through to getMatchAlign unchanged.

		This is SLOW!

		returns a dict of {seq_name: <getMatchAlign result>, ...}, documented
		by the original author as {seq_name:(match_ID, similarity), ...}
	"""
	vocabulary = Vocabulary()

	def encode_all(aln):
		# Encode every named sequence of ``aln`` with the shared vocabulary.
		encoded = {}
		for seq_name in aln.Names:
			encoded[seq_name] = vocabulary.encodeSequence(Sequence(aln.getSeq(seq_name)))
		return encoded

	# Queries are encoded before the references, as before, so vocabulary
	# codes are handed out in the same order.
	queries_aln = encode_all(queries)
	ref_aln = encode_all(ref)

	# Create a scoring and aligner
	scoring = SimpleScoring(2, -1)
	aligner = GlobalSequenceAligner(scoring, -2)

	# results dict
	res = {}
	for seq_name in queries.Names:
		if verbose > 0:
			# Single pre-built string: prints the same text on Python 2 and 3.
			print("Searching seq " + seq_name + ':')
		res[seq_name] = getMatchAlign(queries_aln[seq_name], ref_aln, tr,
				cluster_centroids, cluster_sizes,
				scoring, aligner, vocabulary,
				optimal=optimal, threshold=threshold, verbose=verbose)

	return res
def test_align() -> None:
    """``align`` returns results scoring 16 then 0 for these two pairs."""
    vocab = Vocabulary()
    base = NamedSequence.of_signs("name1", "ABZ001", vocab)
    same_signs = NamedSequence.of_signs("name2", "ABZ001", vocab)
    other_signs = NamedSequence.of_signs("name3", "ABZ002", vocab)
    pairs = [(base, other_signs), (base, same_signs)]

    outcome = align(pairs, vocab)

    expected = contains_exactly(
        has_properties({"score": 16}),
        has_properties({"score": 0}),
    )
    assert_that(outcome, expected)
Пример #22
0
def align_pair(
    first: NamedSequence,
    second: NamedSequence,
    vocabulary: Vocabulary,
) -> AlignmentResult:
    """Globally align two named sequences with ``EblScoring``.

    Returns an ``AlignmentResult`` holding the score, both inputs and every
    alignment decoded through ``vocabulary``.
    """
    aligner = GlobalSequenceAligner(EblScoring(vocabulary), True)
    score, encoded_alignments = aligner.align(
        first.sequence, second.sequence, backtrace=True
    )
    decoded_alignments = []
    for encoded in encoded_alignments:
        decoded_alignments.append(vocabulary.decodeSequenceAlignment(encoded))
    return AlignmentResult(score, first, second, decoded_alignments)
Пример #23
0
def recommendation(name, movies):
    """Rank movie names by their global-alignment score against a query.

    Args:
        name: a string of key words separated by white space
        movies: a list of movie names to choose from

    Returns:
        Every entry of ``movies``, ordered by ascending alignment score
        (match +1, mismatch 0, gap -2), so the best-scoring names come last.

    NOTE(review): earlier documentation promised the "top ten" / "top five"
    matches; this implementation neither truncates nor reverses the ranking.
    Confirm which behaviour callers expect.
    """
    # Word-level sequences for the query and for every candidate title.
    query = Sequence(name.split())
    candidates = [Sequence(title.split()) for title in movies]

    # One vocabulary for everything so equal words share a code.
    vocab = Vocabulary()
    query_codes = vocab.encodeSequence(query)
    candidate_codes = [vocab.encodeSequence(candidate) for candidate in candidates]

    aligner = GlobalSequenceAligner(SimpleScoring(1, 0), -2)
    scores = []
    for codes in candidate_codes:
        scores.append(aligner.align(query_codes, codes, backtrace=False))

    # Stable sort of the indices by score, lowest first.
    ranking = sorted(range(len(scores)), key=scores.__getitem__)
    return [movies[index] for index in ranking]
def match_word_sorted(code1, code2):
    """Return the max scored alignment between the two input codes.

    Both codes are split on single spaces. The words they have in common
    (the empty string excluded) are ordered once by first occurrence in
    ``code1`` and once by first occurrence in ``code2``, and the two
    orderings are globally aligned using MATCH_SCORE, MISMATCH_SCORE and
    GAP_SCORE.

    Returns:
        The highest alignment score found, never less than 0.
    """
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    common_words = set(list1) & set(list2)
    # discard() is a no-op when "" is absent; no exception swallowing needed.
    common_words.discard("")

    def first_positions(words):
        # Map each word to the index of its first occurrence (one pass,
        # instead of a list.index() scan per word).
        positions = {}
        for index, word in enumerate(words):
            positions.setdefault(word, index)
        return positions

    positions1 = first_positions(list1)
    positions2 = first_positions(list2)
    # Plain lists, so Sequence gets the same kind of input on Python 2 and 3.
    sorted1 = sorted(common_words, key=positions1.__getitem__)
    sorted2 = sorted(common_words, key=positions2.__getitem__)

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    _, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)

    # The 0 floor mirrors the original initial value of the running maximum.
    scores = [v.decodeSequenceAlignment(encoded).score for encoded in encoders]
    return max([0] + scores)
def align_fragment_and_chapter(
    fragment: Fragment, chapter: Chapter
) -> List[AlignmentResult]:
    """Align a fragment against every manuscript of a chapter that has clear signs.

    The fragment and each qualifying manuscript (named by its siglum) are
    encoded with one shared vocabulary and handed to ``align`` as pairs.
    """
    vocabulary = Vocabulary()
    fragment_sequence = NamedSequence.of_fragment(fragment, vocabulary)

    pairs = []
    for position, manuscript_signs in enumerate(chapter.signs):
        if not has_clear_signs(manuscript_signs):
            continue
        siglum = chapter.manuscripts[position].siglum
        manuscript_sequence = NamedSequence.of_signs(
            siglum, manuscript_signs, vocabulary
        )
        pairs.append((fragment_sequence, manuscript_sequence))

    return align(pairs, vocabulary)
Пример #26
0
def match_word_sorted(code1, code2):
    """Return the max scored alignment between the two input codes.

    Both codes are split on single spaces. For every distinct word of either
    code (the empty string excluded) the results of ``index_word_pairs`` are
    collected per code, sorted by their second element and globally aligned
    using MATCH_SCORE, MISMATCH_SCORE and GAP_SCORE.

    Returns:
        The highest alignment score found, never less than 0.
    """
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    # Union (not intersection) of the words of both codes.
    all_words = set(list1) | set(list2)
    # discard() is a no-op when "" is absent; no exception swallowing needed.
    all_words.discard("")

    words1 = []
    words2 = []
    for word in all_words:
        # index_word_pairs is defined elsewhere; presumably it yields
        # (word, index) pairs -- the sort below keys on element 1.
        words1 += index_word_pairs(word, list1)
        words2 += index_word_pairs(word, list2)
    sorted1 = sorted(words1, key=lambda t: t[1])
    sorted2 = sorted(words2, key=lambda t: t[1])

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    _, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)

    # The 0 floor mirrors the original initial value of the running maximum.
    scores = [v.decodeSequenceAlignment(encoded).score for encoded in encoders]
    return max([0] + scores)
Пример #27
0
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
from alignment.profile import Profile
from alignment.profilealigner import SoftScoring, GlobalProfileAligner


# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
# str.format keeps the printed text identical on Python 2 and Python 3.
print('Sequence A: {}'.format(a))
print('Sequence B: {}'.format(b))
print('')

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
print('Encoded A: {}'.format(aEncoded))
print('Encoded B: {}'.format(bEncoded))
print('')

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, alignments = aligner.align(aEncoded, bEncoded, backtrace=True)

# Create sequence profiles out of alignments. The loop variable is not named
# `a`, so sequence `a` is not overwritten on Python 2 (where comprehension
# variables leak into the enclosing scope).
profiles = [Profile.fromSequenceAlignment(alignment) for alignment in alignments]
for encoded in profiles:
    profile = v.decodeProfile(encoded)
Пример #28
0
    def test_utterance_transcriptions(self):
        """Decode every utterance and report those that differ from their reference text.

        Steps, as visible in this method:
          1. Run ``compile_utterance_train_graphs_func`` and then
             ``test_utterances_func`` once per job in an ``mp.Pool``.
          2. For each job, read ``aligned.{job}.int`` from the trainer's align
             directory, map its integer ids back to words and write them to
             ``aligned.{job}``.
          3. Compare each decoded text with its reference text and collect
             insertions and deletions.
          4. If any utterance has insertions or deletions, write
             ``transcription_problems.csv`` to the corpus output directory.

        Prints progress messages and a final summary; returns nothing.
        """
        print('Checking utterance transcriptions...')

        split_directory = self.corpus.split_directory()
        model_directory = self.trainer.align_directory
        with mp.Pool(processes=self.corpus.num_jobs) as pool:
            # One argument tuple per job: (self, job index).
            jobs = [(self, x)
                    for x in range(self.corpus.num_jobs)]
            results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
            # .get() blocks until each job finishes; the collected values are not used.
            output = [p.get() for p in results]
            print('Utterance FSTs compiled!')
            print('Decoding utterances (this will take some time)...')
            results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Finished decoding utterances!')

        word_mapping = self.dictionary.reversed_word_mapping
        # A single vocabulary is shared by every utterance comparison below.
        v = Vocabulary()
        # utt -> (insertions, deletions, ref_text, text)
        errors = {}

        for job in range(self.corpus.num_jobs):
            text_path = os.path.join(split_directory, 'text.{}'.format(job))
            texts = load_scp(text_path)
            aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
            with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
                for utt, line in sorted(aligned_int.items()):
                    # Translate the decoded integer ids back into words.
                    text = []
                    for t in line:
                        text.append(word_mapping[int(t)])
                    outf.write('{} {}\n'.format(utt, ' '.join(text)))
                    ref_text = texts[utt]
                    # When the decoded text is more than 7 items shorter than
                    # the reference, no alignment is attempted and plain
                    # membership differences are used instead.
                    if len(text) < len(ref_text) - 7:
                        insertions = [x for x in text if x not in ref_text]
                        deletions = [x for x in ref_text if x not in text]
                    else:
                        aligned_seq = Sequence(text)
                        ref_seq = Sequence(ref_text)

                        alignedEncoded = v.encodeSequence(aligned_seq)
                        refEncoded = v.encodeSequence(ref_seq)
                        scoring = SimpleScoring(2, -1)
                        a = GlobalSequenceAligner(scoring, -2)
                        # The reference is passed first, so `alignment.first`
                        # below presumably is the reference side.
                        score, encodeds = a.align(refEncoded, alignedEncoded, backtrace=True)
                        insertions = []
                        deletions = []
                        # Differences are accumulated over every alignment
                        # returned, not just the first one.
                        for encoded in encodeds:
                            alignment = v.decodeSequenceAlignment(encoded)
                            for i, f in enumerate(alignment.first):
                                s = alignment.second[i]
                                # '-' in the first row: the second row has an extra item.
                                if f == '-':
                                    insertions.append(s)
                                # '-' in the second row: an item of the first row is missing.
                                if s == '-':
                                    deletions.append(f)
                    if insertions or deletions:
                        errors[utt] = (insertions, deletions, ref_text, text)
        if not errors:
            message = 'There were no utterances with transcription issues.'
        else:
            out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
            with open(out_path, 'w') as problemf:
                problemf.write('Utterance,Insertions,Deletions,Reference,Decoded\n')
                # Sorted descending by len(deletions) + len(ref_text).
                # NOTE(review): indices [1] and [2] select deletions and
                # ref_text; if "insertions + deletions" was intended the
                # indices would be [0] and [1] -- confirm.
                for utt, (insertions, deletions, ref_text, text) in sorted(errors.items(),
                                                                           key=lambda x: -1 * (
                                                                                   len(x[1][1]) + len(x[1][2]))):
                    # NOTE(review): fields are joined with ', ' and written
                    # without quoting, so multi-item fields add extra columns.
                    problemf.write('{},{},{},{},{}\n'.format(utt, ', '.join(insertions), ', '.join(deletions),
                                                             ' '.join(ref_text), ' '.join(text)))
            message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)

        print(self.transcription_analysis_template.format(message))
 def of_signs(name, signs: str, vocabulary: Vocabulary) -> "NamedSequence":
     """Build a ``NamedSequence`` from a name and a sign string encoded with ``vocabulary``."""
     sign_sequence = make_sequence(signs)
     encoded_signs = vocabulary.encodeSequence(sign_sequence)
     return NamedSequence(name, encoded_signs)
Пример #30
0
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
from alignment.profile import Profile
from alignment.profilealigner import SoftScoring, GlobalProfileAligner

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
# str.format keeps the printed text identical on Python 2 and Python 3.
print('Sequence A: {}'.format(a))
print('Sequence B: {}'.format(b))
print('')

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
print('Encoded A: {}'.format(aEncoded))
print('Encoded B: {}'.format(bEncoded))
print('')

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, alignments = aligner.align(aEncoded, bEncoded, backtrace=True)

# Create sequence profiles out of alignments and print each decoded profile.
# The loop variable is not named `a`, so sequence `a` is not overwritten on
# Python 2 (where comprehension variables leak into the enclosing scope).
profiles = [Profile.fromSequenceAlignment(alignment) for alignment in alignments]
for encoded in profiles:
    profile = v.decodeProfile(encoded)
    print(profile)
Пример #31
0
########## SIMPLEST #######
import regex
# Search the sample sentence for the word "amazing" followed by whitespace.
# `{e<2}` is presumably the third-party `regex` package's fuzzy-matching
# syntax (fewer than 2 errors allowed) -- confirm against its documentation.
# The match object is discarded, so in a plain script this line has no
# visible effect.
regex.search(r'\b(amazing){e<2}\s', 'is life amazing lie ao a')


from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('amazing'.split())
b = Sequence('what a amazing disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
# str.format keeps the printed text identical on Python 2 and Python 3.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score: {}'.format(alignment.score))
    print('Percent identity: {}'.format(alignment.percentIdentity()))
    print('')
Пример #32
0
 def __init__(self, vocabulary: Vocabulary):
     """Keep the vocabulary and pre-encode the two special signs.

     ``LINE_BREAK`` is encoded before ``UNCLEAR_OR_UNKNOWN_SIGN``; the
     resulting codes are stored as ``line_break`` and ``x``.
     """
     encode = vocabulary.encode
     self.vocabulary = vocabulary
     line_break_code = encode(LINE_BREAK)
     self.line_break = line_break_code
     unclear_code = encode(UNCLEAR_OR_UNKNOWN_SIGN)
     self.x = unclear_code
Пример #33
0
    def ScorePhonemes(self, source=[], target=[]):
        """Compare the phonemes of a source and target sentence and determine 
        which of the target items were correctly transcribed.

        Args:
            source: per-sentence lists of (phoneme, word number, word) tuples;
                falls back to ``self.source_phonemes`` when empty.
            target: same layout; falls back to ``self.target_phonemes`` when
                empty.

        Results are stored on the instance, nothing is returned:
            self.hits_phonemes (nested list): one list per target sentence with
                one hit/miss flag per target phoneme.
            self.source_matched (nested list): one list per target sentence
                with the source phonemes placed at target positions.

        Note:
        This scoring method has no word accuracy awareness. Phonemes from correctly input
        words may wind up as labeled wrong ( i.e. target:"with the" source: "with a" alignement: )
        Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)        

        NOTE(review): the defaults are mutable lists. They are only read here,
        never mutated, so they are harmless as written.
        """
        if not source:
            source = self.source_phonemes
        if not target:
            target = self.target_phonemes

        self.source_matched = []
        hits = []
        for x, ttup in enumerate(target):
            # Split the target tuples into parallel phoneme / word-number / word tuples.
            tphon, twordnum, tword = zip(*ttup)
            stup = source[x]
            if not stup:
                # Nothing was entered for this sentence: every phoneme is a miss.
                hitlist = [False] * len(tphon)
                bPhonOut = ['-'] * len(tphon)
            else:
                sphon, swordnum, sword = zip(*stup)
                # Create sequences to be aligned.
                a = Sequence(tphon)
                b = Sequence(sphon)

                # Create a vocabulary and encode the sequences.
                v = Vocabulary()
                aEncoded = v.encodeSequence(a)
                bEncoded = v.encodeSequence(b)

                # Create a scoring and align the sequences using global aligner.
                scoring = SimpleScoring(2, -1)
                aligner = GlobalSequenceAligner(scoring, -2)
                score, encodeds = aligner.align(aEncoded,
                                                bEncoded,
                                                backtrace=True)
                # Only the first alignment returned is scored.
                encoded = encodeds[0]

                #Score based only on hits vs misses, insertions are ignored
                # NOTE(review): the indexing below presumably relies on
                # array-style (numpy-like) semantics of the encoded alignment;
                # on a plain Python sequence `x[:][0]` is simply the first
                # item. Verify against the installed alignment version.
                notInsert = encoded[:][0] != 0
                nonInsertMatched = encoded[notInsert][:]

                #Find the alignment in the target sequence
                aSeq = nonInsertMatched[:][0]
                bSeq = nonInsertMatched[:][1]

                #Label all items not aligned to the target as false
                hitlist = []
                y = 0
                # Slide a window over the encoded target until it equals aSeq;
                # y ends up as the window offset (or the last offset tried).
                for y in range(0, len(aEncoded) - len(aSeq) + 1):
                    aChunk = aEncoded[y:y + len(aSeq)]
                    #print aChunk
                    # NOTE(review): a zero *sum* of differences is used as the
                    # equality test; differences that cancel out would also pass.
                    if sum(aChunk - aSeq) == 0:
                        break
                hitlist.extend([False] * (y))
                hitlist.extend(list(aSeq - bSeq == 0))
                hitlist.extend([False] * (len(aEncoded) - y - len(aSeq)))
                #Export the target aligned phonemes of the source sequence
                bPhons = np.zeros(len(aEncoded), int)
                bPhons[y:y + len(bSeq)] = bSeq
                # Map the codes back to vocabulary elements; positions left at
                # 0 map to whatever element the vocabulary holds at index 0.
                bPhonOut = np.array(v.elements())[bPhons].tolist()
            hits.append(hitlist)
            self.source_matched.append(bPhonOut)
            # Assigned inside the loop: the attribute is not set at all when
            # `target` is empty.
            self.hits_phonemes = hits
Пример #34
0
    def ScoreWords(self):
        """Aligns the words of the source sentence to match the target sentence
        to determine hit vs missed words.

        Reads ``self.target`` and ``self.source`` (parallel lists of
        sentences). Nothing is returned; results are stored on the instance:
            self.hits (nested list): per sentence, one hit/miss flag per
                target word.
            self.wscore: array with the percentage of hits per sentence.
            self.source_matchWords (nested list): per sentence, the source
                words placed at target positions.

        Each sentence's aligned source words are also printed.

        Note:
        Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
        """
        target = self.target
        source = self.source
        self.source_matchWords = []
        hits = []
        wscore = np.empty(0)
        for tnum, tsent in enumerate(target):
            ssent = source[tnum]
            # Create sequences to be aligned.
            a = Sequence(tsent.split())
            b = Sequence(ssent.split())

            # Create a vocabulary and encode the sequences.
            v = Vocabulary()
            aEncoded = v.encodeSequence(a)
            bEncoded = v.encodeSequence(b)

            # Create a scoring and align the sequences using global aligner.
            scoring = SimpleScoring(5, -1)
            aligner = GlobalSequenceAligner(scoring, -1)
            score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
            # Only the first alignment returned is scored.
            encoded = encodeds[0]

            #Score based only on hits vs misses, insertions are ignored
            # NOTE(review): the indexing below presumably relies on
            # array-style (numpy-like) semantics of the encoded alignment;
            # verify against the installed alignment version.
            notInsert = encoded[:][0] != 0
            nonInsertMatched = encoded[notInsert][:]

            #Find the alignment in the target sequence
            aSeq = nonInsertMatched[:][0]
            bSeq = nonInsertMatched[:][1]

            #Label all items not aligned to the target as false
            hitlist = []
            x = 0
            # Slide a window over the encoded target until it equals aSeq;
            # x ends up as the window offset (or the last offset tried).
            for x in range(0, len(aEncoded) - len(aSeq) + 1):
                aChunk = aEncoded[x:x + len(aSeq)]
                if sum(aChunk - aSeq) == 0:
                    break
            hitlist.extend([False] * (x))
            hitlist.extend(list(aSeq - bSeq == 0))
            hitlist.extend([False] * (len(aEncoded) - x - len(aSeq)))
            #Export the target aligned words of the source sequence
            bWords = np.zeros(len(aEncoded), int)
            bWords[x:x + len(bSeq)] = bSeq
            bWordOut = np.array(v.elements())[bWords].tolist()
            hits.append(hitlist)
            iwscore = sum(hitlist) * 100 / float(len(hitlist))
            wscore = np.hstack([wscore, iwscore])
            # Function-call form: valid on Python 3 and prints the same text
            # on Python 2 for a single argument.
            print(bWordOut)
            self.source_matchWords.append(bWordOut)
            # Assigned inside the loop, as before: the attributes are not set
            # when `target` is empty.
            self.hits = hits
            self.wscore = wscore
Пример #35
0
    def __iter__(self):
        """Iterate over ``self.elements``, converting each element with ``int()``."""
        # A generator expression: elements are converted one at a time as the
        # caller consumes the iterator.
        return (int(e) for e in self.elements)


# Tests -----------------------------------------------------------------------

if __name__ == '__main__':
    # Manual demo: build two word sequences, encode them with a shared
    # vocabulary, align them globally and decode both rows of each alignment.
    first_words = 'what a beautiful day'.split()
    second_words = 'what a disappointingly bad day'.split()
    s1, s2 = Sequence(first_words), Sequence(second_words)
    for label, value in (('s1', s1), ('s2', s2)):
        print(label, value)
    print('')

    from alignment.vocabulary import Vocabulary
    v = Vocabulary()
    e1 = v.encodeSequence(s1)
    e2 = v.encodeSequence(s2)
    for label, value in (('v', v), ('e1', e1), ('e2', e2)):
        print(label, value)
    print('')

    from alignment.sequencealigner import SimpleScoring
    from alignment.sequencealigner import GlobalSequenceAligner
    # Match +2, mismatch -1, gap -2.
    s = SimpleScoring(2, -1)
    a = GlobalSequenceAligner(s, -2)
    result = a.align(e1, e2, backtrace=True)
    score, alignments = result
    for alignment in alignments:
        encoded_first = alignment.first
        as1 = v.decodeSequence(encoded_first)
        encoded_second = alignment.second
        as2 = v.decodeSequence(encoded_second)
Пример #36
0
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Demo: globally align two short word sequences and print every optimal
# alignment. All output goes through single-argument print(...) calls so the
# script prints the same text on Python 2 and Python 3.

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score: {}'.format(alignment.score))
    print('Percent identity: {}'.format(alignment.percentIdentity()))
    # Blank separator line between alignments.
    print('')

Пример #37
0
# Pick the entries of pdist addressed by triu_inds (presumably the
# upper-triangle indices of a pairwise path-distance matrix -- TODO confirm
# against where pdist/triu_inds are built).
all_path_dists = pdist[triu_inds]

# Median of the selected distances.
med = np.median(all_path_dists)
# %% [markdown]
# ##

# from skbio.sequence import Sequence
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary

# Wrap every path in a Sequence object.
seqs = list(map(Sequence, paths))

# Encode all sequences against a single shared vocabulary.
v = Vocabulary()
encoded_seqs = list(map(v.encodeSequence, seqs))


class SimpleScoring:
    """Two-valued scoring callable: one score for equal elements, one for unequal."""

    def __init__(self, matchScore, mismatchScore):
        """Store the score returned for equal and for unequal element pairs."""
        self.matchScore, self.mismatchScore = matchScore, mismatchScore

    def __call__(self, firstElement, secondElement):
        """Return ``matchScore`` when the elements compare equal, else ``mismatchScore``."""
        same = firstElement == secondElement
        return self.matchScore if same else self.mismatchScore

Пример #38
0
sequence_family = np.array(sequence_family)

# A row whose first field starts with '[' is treated as a header row: record
# its bracket-stripped text and its row index.
name = []
spilt_pos = []
for row_idx, row in enumerate(sequence_family):
    label = row[0]
    if label[0] == '[':
        name.append(label[1:-1])
        spilt_pos.append(row_idx)

# For each header, concatenate the first field of the eight rows after it.
sequence = []
for start in spilt_pos:
    first, *rest = (sequence_family[k][0] for k in range(start + 1, start + 9))
    merged = first
    for piece in rest:
        merged = merged + piece
    sequence.append(merged)

#%%
# Encode every concatenated sequence through one shared vocabulary.
v = Vocabulary()
sequence_encoded = [
    v.encodeSequence(Sequence(split_sequence(item))) for item in sequence
]

scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)

Matrix = np.zeros(9 * 9).reshape(9, 9)
for i in range(len(sequence_encoded)):
    for j in range(i + 1, len(sequence_encoded)):
        score, encodeds = aligner.align(sequence_encoded[i],
                                        sequence_encoded[j],
                                        backtrace=True)
        for encoded in encodeds:
Пример #39
0
def _unaligned_prefix(words, anchor):
    """
    Collect the words that come before the first occurrence of ``anchor``.
    :param words: iterable of words, scanned in order
    :param anchor: word at which scanning stops
    :return: list of the words seen before ``anchor``, or None when
             ``anchor`` never occurs in ``words``
    """
    prefix = []
    for word in words:
        if word == anchor:
            return prefix
        prefix.append(word)
    return None


def _pad_unaligned_edges(words, target, other):
    """
    Re-attach leading/trailing ``words`` that are missing from ``target``.
    Words that precede the first element of ``target.elements`` are prepended
    to it and words that follow its last element are appended; ``other`` gets
    one '-' gap marker per added word so both sides stay the same length.
    Nothing is changed on a side whose edge element does not occur in
    ``words``.
    :param words: full word list the ``target`` side was built from
    :param target: alignment side (object with a list ``elements``) to extend
    :param other: opposite alignment side, padded with '-'
    """
    lead = _unaligned_prefix(words, target.elements[0])
    if lead is not None:
        target.elements = lead + target.elements
        other.elements = ['-'] * len(lead) + other.elements

    # Scan from the end; the collected words come back in reverse order.
    tail = _unaligned_prefix(reversed(words), target.elements[-1])
    if tail is not None:
        target.elements = target.elements + tail[::-1]
        other.elements = other.elements + ['-'] * len(tail)


def text_to_text_alignment_and_score(text_ref, text_pred):
    """
    Find a word to word alignment between two texts, considering the first is
    the reference and the second the predicted.

    Both texts are lower-cased; the reference is additionally passed through
    a translation map built from "." and ",". The first alignment returned by
    the aligner is decoded, its edges are padded with any words the alignment
    left out (gaps are written as '-'), and two percentages are derived from
    the alignment score. Debug output (the raw alignments and the decoded
    alignment before and after padding) is printed to stdout.

    :param text_ref: text reference
    :param text_pred: predicted text
    :return: tuple ``(alignment, rec, pre)``: the decoded alignment (or an
             empty list when no alignment is available), the alignment score
             as a percentage of the number of reference words, and the
             alignment score as a percentage of the number of predicted
             words (both 0 when no alignment is available, or when the
             respective text has no words)
    """

    text_ref = text_ref.lower()
    text_pred = text_pred.lower()
    iterable = [".", ","]
    # Only the reference is translated; the predicted text is used as-is.
    # NOTE(review): to_translation_map presumably maps these characters to
    # nothing/whitespace -- confirm against the helper.
    translation_map = str.maketrans(to_translation_map(iterable))
    text_ref = text_ref.translate(translation_map)

    ref_words = text_ref.split()
    pred_words = text_pred.split()

    # Create sequences to be aligned.
    a = Sequence(list(ref_words))
    b = Sequence(list(pred_words))

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    a_enc = v.encodeSequence(a)
    b_enc = v.encodeSequence(b)
    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, 0)
    # This aligner's align() takes the raw word lists too and returns three
    # values; the first one is not used here.
    f, score, encodeds = aligner.align(a_enc,
                                       b_enc,
                                       list(ref_words),
                                       list(pred_words),
                                       backtrace=True)

    print(encodeds)

    # Use the first alignment if one exists (guarding against an empty
    # result as well as an empty first alignment).
    if len(encodeds) > 0 and len(encodeds[0]) > 0:
        alignment = v.decodeSequenceAlignment(encodeds[0])
        print(alignment)
        # Fix first and last missing words of the predicted (ASR) text ...
        _pad_unaligned_edges(pred_words, alignment.second, alignment.first)
        # ... and then of the reference text.
        _pad_unaligned_edges(ref_words, alignment.first, alignment.second)
        print(alignment)
        # Avoid ZeroDivisionError when either text contains no words.
        rec = alignment.score * 100 / len(ref_words) if ref_words else 0
        pre = alignment.score * 100 / len(pred_words) if pred_words else 0
    else:
        alignment = []
        rec, pre = 0, 0

    return alignment, rec, pre
Пример #40
0
    def __iter__(self):
        """Yield each item of ``self.elements`` converted with ``int``."""
        for element in self.elements:
            yield int(element)


# Tests -----------------------------------------------------------------------

# Demo that only runs when this module is executed directly as a script.
if __name__ == '__main__':
    # Build two word-level sequences from whitespace-split sentences and
    # print them.
    s1 = Sequence('what a beautiful day'.split())
    s2 = Sequence('what a disappointingly bad day'.split())
    print('s1', s1)
    print('s2', s2)
    print('')

    # Encode both sequences through one shared Vocabulary instance, then
    # print the vocabulary and the two encoded sequences.
    from alignment.vocabulary import Vocabulary
    v = Vocabulary()
    e1 = v.encodeSequence(s1)
    e2 = v.encodeSequence(s2)
    print('v', v)
    print('e1', e1)
    print('e2', e2)
    print('')

    # Align the encoded sequences with a global aligner.
    # NOTE(review): presumably SimpleScoring(2, -1) means match=2 /
    # mismatch=-1 and the aligner's -2 is the gap score -- confirm against
    # alignment.sequencealigner.
    from alignment.sequencealigner import SimpleScoring
    from alignment.sequencealigner import GlobalSequenceAligner
    s = SimpleScoring(2, -1)
    a = GlobalSequenceAligner(s, -2)
    score, alignments = a.align(e1, e2, backtrace=True)
    # Decode both sides of every returned alignment with the vocabulary.
    # NOTE(review): as1/as2 are assigned but not used in the visible lines;
    # the loop body looks truncated in this excerpt.
    for alignment in alignments:
        as1 = v.decodeSequence(alignment.first)
        as2 = v.decodeSequence(alignment.second)
Пример #41
0
    def test_utterance_transcriptions(self):
        """Decode every utterance and report where the output differs from its transcript.

        Steps, per the code below:

        1. In a process pool, run ``compile_utterance_train_graphs_func`` and
           then ``test_utterances_func`` once per corpus job, waiting for all
           of them to finish.
        2. For each job, read the reference texts (``text.<job>``) and the
           decoded word ids (``aligned.<job>.int``), map the ids back to words
           and write them to ``aligned.<job>`` in the model directory.
        3. Compare decoded words with the reference words and collect
           insertions (words only in the decoded output) and deletions (words
           only in the reference).
        4. If any utterance has insertions or deletions, write
           ``transcription_problems.csv`` to the corpus output directory,
           ordered by descending number of insertions plus deletions.
        5. Print a summary through ``self.transcription_analysis_template``.

        Returns nothing; the results are the files written and the printed
        summary.
        """
        # Local import so the method does not rely on a module-level import.
        import csv

        print('Checking utterance transcriptions...')

        split_directory = self.corpus.split_directory()
        model_directory = self.trainer.align_directory
        with mp.Pool(processes=self.corpus.num_jobs) as pool:
            jobs = [(self, x)
                    for x in range(self.corpus.num_jobs)]
            results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
            # get() blocks until each job is done and re-raises worker errors.
            for p in results:
                p.get()
            print('Utterance FSTs compiled!')
            print('Decoding utterances (this will take some time)...')
            results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
            for p in results:
                p.get()
            print('Finished decoding utterances!')

        word_mapping = self.dictionary.reversed_word_mapping
        v = Vocabulary()
        # The scorer and aligner are the same for every utterance, so build
        # them once instead of once per utterance.
        scoring = SimpleScoring(2, -1)
        aligner = GlobalSequenceAligner(scoring, -2)
        # utt -> (insertions, deletions, ref_text, text)
        errors = {}

        for job in range(self.corpus.num_jobs):
            text_path = os.path.join(split_directory, 'text.{}'.format(job))
            texts = load_scp(text_path)
            aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
            with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
                for utt, line in sorted(aligned_int.items()):
                    text = [word_mapping[int(t)] for t in line]
                    outf.write('{} {}\n'.format(utt, ' '.join(text)))
                    ref_text = texts[utt]
                    if len(text) < len(ref_text) - 7:
                        # Decoded output is more than 7 words shorter than the
                        # reference: skip the alignment and fall back to a
                        # plain membership diff.
                        insertions = [x for x in text if x not in ref_text]
                        deletions = [x for x in ref_text if x not in text]
                    else:
                        aligned_seq = Sequence(text)
                        ref_seq = Sequence(ref_text)

                        alignedEncoded = v.encodeSequence(aligned_seq)
                        refEncoded = v.encodeSequence(ref_seq)
                        score, encodeds = aligner.align(refEncoded, alignedEncoded, backtrace=True)
                        insertions = []
                        deletions = []
                        # NOTE(review): gaps are accumulated over every
                        # alignment returned, so alternative optimal
                        # alignments contribute repeated entries.
                        for encoded in encodeds:
                            alignment = v.decodeSequenceAlignment(encoded)
                            for f, s in zip(alignment.first, alignment.second):
                                # Gap on the reference side: extra decoded word.
                                if f == '-':
                                    insertions.append(s)
                                # Gap on the decoded side: missing reference word.
                                if s == '-':
                                    deletions.append(f)
                    if insertions or deletions:
                        errors[utt] = (insertions, deletions, ref_text, text)
        if not errors:
            message = 'There were no utterances with transcription issues.'
        else:
            out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
            # Worst utterances first: rank by insertions + deletions
            # (tuple positions 0 and 1 of the stored value).
            ranked = sorted(errors.items(),
                            key=lambda item: len(item[1][0]) + len(item[1][1]),
                            reverse=True)
            with open(out_path, 'w') as problemf:
                # csv.writer quotes fields containing commas, so the
                # ', '-joined word lists stay inside a single column.
                writer = csv.writer(problemf, lineterminator='\n')
                writer.writerow(['Utterance', 'Insertions', 'Deletions', 'Reference', 'Decoded'])
                for utt, (insertions, deletions, ref_text, text) in ranked:
                    writer.writerow([utt, ', '.join(insertions), ', '.join(deletions),
                                     ' '.join(ref_text), ' '.join(text)])
            message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)

        print(self.transcription_analysis_template.format(message))
# Fetch the remaining two lyrics rows; each fetchone() result is indexed
# with [0] below to get the lyrics text.
cur.execute(selectStatement2)
lyrics2 = cur.fetchone()

cur.execute(selectStatement3)
lyrics3 = cur.fetchone()

scoring = SimpleScoring(2, -2)
aligner = LocalSequenceAligner(scoring, -2)

# Word sequences, split on single spaces.
a = Sequence(lyrics1[0].split(" "))
b = Sequence(lyrics2[0].split(" "))
c = Sequence(lyrics3[0].split(" "))

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
cEncoded = v.encodeSequence(c)

print("RUN DMC VS BIGGIE SMALLS")

# Align the first and third sequences using the local aligner.
score, encodeds = aligner.align(aEncoded, cEncoded, backtrace=True)

# Print the first optimal alignment, if the aligner returned any.
# NOTE(review): the original condition was missing (a bare `if`); guarding
# on a non-empty result is the minimum needed for encodeds[0] to be valid.
if encodeds:
    alignment = v.decodeSequenceAlignment(encodeds[0])
    print(alignment)
    print('Alignment score: {}'.format(alignment.score))
    print('Percent identity: {}'.format(alignment.percentIdentity()))