def comparewebes(sntc1,sntc2):
    """Return the similarity of two Spanish sentences as a string.

    Every '+' in either input is replaced by a space (presumably because the
    sentences arrive URL-encoded -- TODO confirm against the caller).  The
    sentences are aligned in both directions, each alignment is reduced to a
    proportion by ``prop_al``, and the two proportions are combined with their
    harmonic mean ``2 * p1 * p2 / (p1 + p2)``.

    :param sntc1: first sentence
    :param sntc2: second sentence
    :return: ``str`` of the harmonic mean; ``'0.0'`` when both proportions
             are zero
    """
    sntc1 = sntc1.replace('+', ' ')
    sntc2 = sntc2.replace('+', ' ')

    # Align in both directions with the same language setting so the two
    # proportions are comparable.
    prop_1 = prop_al(align(sntc1, sntc2, lang='spanish'))
    prop_2 = prop_al(align(sntc2, sntc1, lang='spanish'))

    total = prop_1 + prop_2
    if total == 0:
        # Nothing aligned in either direction; avoid dividing by zero.
        return str(0.0)

    # Harmonic mean of the two directional proportions.
    sim = 2.0 * prop_1 * prop_2 / total
    return str(sim)
Exemplo n.º 2
0
def verify():
    """Grab camera frames until a face with two eyes is seen, then identify it.

    Hides ``ipreg``, shows ``opreg`` and resets the module-level ``count`` and
    ``callCounter`` values.  Up to 500 frames are read from ``cap``; the loop
    stops at the first frame in which ``face_detector`` finds a face and
    ``eye_detector`` finds at least two eyes inside the last detected face.
    The most recently read frame is then passed through ``align`` and
    ``recognizer`` and the result is written to ``opreg['text']``.  If all
    500 attempts fail the detection checks, the last frame read is still used.

    If the camera never delivers a frame, a failure message is shown instead
    of running recognition.
    """
    global count, callCounter
    ipreg.grid_forget()
    count = False
    callCounter = 0
    opreg.grid(row=2, column=0, pady=(5, 0))

    frame = None
    for i in range(500):
        grabbed, candidate = cap.read()
        if not grabbed or candidate is None:
            # read() reports failure instead of raising; skip this attempt
            # rather than crash inside cvtColor on a missing frame.
            print('No frame. Fail - ', i)
            continue
        frame = candidate

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_detector.detectMultiScale(gray, 1.1, 5)
        if not np.any(faces):
            print('No faces. Fail - ', i)
            continue

        # Only the eyes of the last face in ``faces`` survive this loop.
        eyes = []
        for (x, y, w, h) in faces:
            roi = gray[y:y + h, x:x + w]
            eyes = eye_detector.detectMultiScale(roi, 1.3, 7)

        if len(eyes) < 2:
            print('No eyes. Fail - ', i)
            continue
        break

    if frame is None:
        opreg['text'] = 'Identification - no camera frame'
        return

    name = recognizer(database, align(frame))
    opreg['text'] = 'Identification - ' + name
Exemplo n.º 3
0
def processAll(audio_text_folder, audio_file, text_file):
    """Split a transcript, align it to its audio and cut the audio up.

    All inputs live in ``audio_text_folder``.  The sync map is written to
    ``<audio_text_folder>/syncmap_file/<text_file>`` (the directory is created
    on demand).  Results go under a per-title directory inside the hard-coded
    placeholder ``'output directory here'``; when that per-title directory
    already exists the function returns without doing any processing.

    :param audio_text_folder: folder containing both the audio and text file
    :param audio_file: audio file name inside ``audio_text_folder``
    :param text_file: transcript file name inside ``audio_text_folder``
    :return: None
    """
    # Title is the transcript name with every '.txt' occurrence removed.
    title = text_file.replace('.txt', '')

    audio_path = os.path.join(audio_text_folder, audio_file)
    original_text_path = os.path.join(audio_text_folder, text_file)
    split_text_path = os.path.join(
        audio_text_folder, text_file.replace('.txt', '_split.txt'))

    syncmap_dir = os.path.join(audio_text_folder, 'syncmap_file')
    if not os.path.exists(syncmap_dir):
        os.makedirs(syncmap_dir)
    syncmap_path = os.path.join(syncmap_dir, text_file)

    title_dir = os.path.join(r'output directory here', title)
    if os.path.exists(title_dir):
        # This title already has an output directory; do nothing.
        return
    os.mkdir(title_dir)
    output_prefix = os.path.join(title_dir, title)

    text_status = splittext(original_text_path)
    print("Text processed: ", text_status)

    align_status = align(audio_path, split_text_path, syncmap_path)
    print("Aligned: ", align_status)

    split_status = split(audio_path, syncmap_path, output_prefix, title)
    print("Split: ", split_status)

    print("Done.")
Exemplo n.º 4
0
def retText():
    """Register the current frame under the name typed into ``ipregentry``.

    Reads the entry text into the global ``name``, prints it, stores the
    aligned global ``frame`` under that name via ``modify_database``, reloads
    the global ``database`` from ``./vitals/database.npy``, clears the entry,
    swaps the ``ipreg`` widget for ``opreg`` showing a confirmation text and
    resets the global ``count`` and ``callCounter`` values.
    """
    global name, frame, count, callCounter, database

    name = ipregentry.get()
    print(name)

    aligned_face = align(frame)
    modify_database(aligned_face, name)

    # NOTE(review): recent NumPy releases refuse to load pickled object arrays
    # unless allow_pickle=True is passed -- confirm against the installed version.
    stored = np.load('./vitals/database.npy')
    database = stored.item()

    # Remove exactly as many characters as were read from the entry.
    ipregentry.delete(0, len(name))
    ipreg.grid_forget()

    opreg['text'] = 'Identity Validated'
    opreg.grid(row=1, column=0, pady=(5, 0))

    count = False
    callCounter = 0
Exemplo n.º 5
0
 def align(self, lang1, lang2, no_hand=False):
     """Align the sentences of two languages and dump the result to a file.

     Sentences for both languages come from ``self.get_sentences``.  Unless
     ``no_hand`` is true, the existing alignment for the language pair is
     fetched as a ladder and used to build a composed alignment; otherwise
     ``aligner.align`` is run directly.  The result is dumped to the path
     ``self._p`` returns for ``'<lang1>-<lang2>.my'`` and progress messages
     are printed to ``log`` (Python 2 ``print >>`` form).

     :return: the alignment produced by ``aligner``
     """
     sentences1 = self.get_sentences(lang1)
     sentences2 = self.get_sentences(lang2)

     if not no_hand:
         ladder = self.get_alignment([lang1, lang2]).as_ladder(with_costs=False)
         print >> log, "%d hand-aligned pairs found." % len(ladder)
         result = aligner.make_composed_alignment(sentences1, sentences2, ladder)
     else:
         result = aligner.align(sentences1, sentences2)

     target_path = self._p('%s-%s.my' % (lang1, lang2))
     Alignment(result).dump(target_path)
     print >> log, "Wrote %s." % target_path
     return result
Exemplo n.º 6
0
def get_alignment_complexity_scores(s0, s1):
    """
    Run Sultan's aligner on two sentences and label every word of the first one.

    Each position of the returned array holds ``COMPLEX`` when the word was
    aligned to a different word (changed/simplified), ``SIMPLE`` when it was
    aligned to the same word or to a word with the same lemma (kept), and
    ``UNK`` when no usable alignment was found for it.  (The original note
    gave these as 1, 2 and 0 respectively -- verify against the constants.)

    Results are memoised in ``ALIGN_DICT``; the cached array itself is
    returned, so callers should not mutate it.  ``ALIGNMENT_STATS`` counts the
    total number of aligner runs and the runs that raised.

    :param s0: the first sentence as a list of tokens
    :param s1: the second sentence as a string
    :return:   array of length ``len(s0)`` filled as described above
    """
    s0 = [x.lower() for x in s0]
    s1 = s1.lower()

    # check if the alignment has been performed before
    dict_key = " ".join(s0) + SEPARATOR + s1
    if dict_key in ALIGN_DICT:
        return ALIGN_DICT[dict_key]

    result = np.full(len(s0), UNK)
    ALIGNMENT_STATS["total"] += 1

    try:
        # tokenize and lemmatize the sentences
        s0_tok = tokenize(" ".join(s0))
        s1_tok = tokenize(s1)
        s0_lem = lemmatize(s0_tok)
        s1_lem = lemmatize(s1_tok)
        pairs = align(s0_tok, s1_tok)  # pairs of sentences aligned by Sultan's word aligner
    except Exception:
        # Best effort: any aligner failure yields the all-UNK result, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        ALIGN_DICT[dict_key] = result
        ALIGNMENT_STATS["unsuccessful"] += 1
        return result

    # iterate over aligned pairs and fill the result array
    for i in range(len(pairs[0])):
        w0, w1 = pairs[1][i][0].lower(), pairs[1][i][1].lower()
        if w0 in STOPWORDS or w1 in STOPWORDS:  # such an alignment doesn't matter
            continue
        # The distinct defaults ('w0' / 'w1') keep two words that both lack
        # a lemma from comparing equal.
        unchanged = w0 == w1 or s0_lem.get(w0, 'w0') == s1_lem.get(w1, 'w1')
        index = get_index(s0, w0, i, pairs)
        if index == -1:
            continue
        result[index] = SIMPLE if unchanged else COMPLEX

    ALIGN_DICT[dict_key] = result
    return result
Exemplo n.º 7
0
Arquivo: models.py Projeto: FTAsr/STS
    def pairFeatures(self, sentenceA, sentenceB):
        """Return the two similarity features for a sentence pair.

        The first feature is the total length of everything returned by
        ``aligner.align`` for the word tokens of both sentences, divided by
        the combined token count (``0.0`` when neither sentence contains a
        word token).  The second feature is
        ``self.bow.sentence_similarity`` of the raw sentences.

        :param sentenceA: first sentence as a string
        :param sentenceB: second sentence as a string
        :return: ``[alignment_ratio, bow_similarity]``
        """
        sentA = re.findall(r"[\w]+", sentenceA)
        sentB = re.findall(r"[\w]+", sentenceB)

        denominator = len(sentA) + len(sentB)

        alignedWords = aligner.align(sentA, sentB)
        numerator = sum(len(group) for group in alignedWords)

        if denominator:
            alignment_ratio = float(numerator) / float(denominator)
        else:
            # No word tokens in either sentence: avoid dividing by zero.
            alignment_ratio = 0.0

        return [
            alignment_ratio,
            self.bow.sentence_similarity(sentenceA, sentenceB),
        ]
Exemplo n.º 8
0
def learn_weights(training_set, learning_epochs, burn_in_epochs,
                  learning_rate, learning_rate_multiplier):
    """Learn aligner feature weights from gold-standard problems.

    Starting from 25 zero weights, each epoch scales the learning rate by
    ``learning_rate_multiplier``, shuffles ``training_set`` in place and, for
    every problem, moves the weights by ``learning_rate * (gold_features -
    predicted_features)``.  After each epoch the weights are L2-normalised and
    recorded.  The result is the mean of the recorded weights from epoch
    ``burn_in_epochs`` onwards.

    :param training_set: mutable sequence of problems exposing
        ``p_str_tokens`` and ``h_str_tokens``
    :param learning_epochs: number of passes over the training set
    :param burn_in_epochs: number of initial epochs left out of the average
    :param learning_rate: initial step size (scaled before the first epoch)
    :param learning_rate_multiplier: per-epoch factor applied to the step size
    :return: the averaged weight vector
    """
    weights = [0] * 25
    weights_history = []

    for epoch in range(learning_epochs):
        print('*** Starting epoch %s of %s ***' % (epoch, learning_epochs))
        learning_rate *= learning_rate_multiplier
        shuffle(training_set)

        for index, problem in enumerate(training_set):
            print('* Starting problem %s of %s in epoch %s*' %
                  (index, len(training_set), epoch))
            print(problem.p_str_tokens)
            print(problem.h_str_tokens)
            gold_features = gold_featurizer.featurize(problem)

            predicted_alignment, predicted_features = aligner.align(
                problem.p_str_tokens, problem.h_str_tokens, weights)

            print(predicted_features)
            # presumably the feature vectors are NumPy arrays, so this also
            # promotes the initial list to an array -- TODO confirm
            weights = weights + (learning_rate *
                                 (gold_features - predicted_features))

        # L2-normalise the weights at the end of every epoch.
        weights = weights / sqrt(sum([w ** 2 for w in weights]))
        weights_history.append(weights)

    # float() forces true division: under Python 2 the former
    # ``1 / (learning_epochs - burn_in_epochs)`` truncated to 0 whenever more
    # than one epoch was averaged.
    averaged_epochs = float(learning_epochs - burn_in_epochs)
    weights_averaged = sum(weights_history[burn_in_epochs:]) / averaged_epochs
    return weights_averaged
Exemplo n.º 9
0
def linear_align(s1, s2):
    """Group the word alignment of two sentences into runs of identical tokens.

    ``align(s1, s2)[0]`` is read as 1-based index pairs and shifted to
    0-based.  Consecutive pairs that link identical tokens and advance by
    exactly one position on both sides are merged into one group; every other
    pair closes the running group and is emitted as a group of its own.

    :param s1: first sentence, tokens separated by single spaces
    :param s2: second sentence, tokens separated by single spaces
    :return: list of ``(indices1, indices2)`` tuples whose members are the
             0-based indices rendered as strings
    """
    words1 = s1.split(' ')
    words2 = s2.split(' ')

    pairs = [(i - 1, j - 1) for i, j in align(s1, s2)[0]]
    pairs.sort()

    identical = {
        (i, j) for i, j in pairs
        if i < len(words1) and j < len(words2) and words1[i] == words2[j]
    }

    def as_group(run):
        # Turn [(i, j), ...] into (('i', ...), ('j', ...)).
        left, right = zip(*run)
        return tuple(map(str, left)), tuple(map(str, right))

    grouped = []
    run = []
    for i, j in pairs:
        continues_run = not run or run[-1] == (i - 1, j - 1)
        if (i, j) in identical and continues_run:
            run.append((i, j))
            continue
        # Close the running group, then emit this pair on its own.
        if run:
            grouped.append(as_group(run))
        grouped.append(((str(i),), (str(j),)))
        run = []

    if run:
        grouped.append(as_group(run))

    return grouped
Exemplo n.º 10
0
def read_audio(audio_file, transcript):
    """Recognise ``audio_file`` with pocketsphinx and keep the words that align.

    Runs ``pocketsphinx_continuous -time yes`` on the file and parses its
    output: lines outside ``<s>`` ... ``</s>`` sections are collected as the
    recognised text, lines inside them are read as ``word start end`` records.
    The recognised text (upper-cased) is aligned against ``transcript`` with
    ``align`` and the timed words matching the aligned words, in order, are
    returned.

    :param audio_file: path of the audio file handed to pocketsphinx
    :param transcript: reference transcript passed to ``align``
    :return: list of ``(word, start, end)`` tuples
    :raises subprocess.CalledProcessError: if pocketsphinx exits non-zero
    """
    args = ["pocketsphinx_continuous", "-time", "yes", "-infile", audio_file]
    # universal_newlines=True makes check_output return text on Python 3
    # (bytes cannot be split on a str separator) and is harmless on Python 2.
    out = subprocess.check_output(args, stderr=DEVNULL,
                                  universal_newlines=True)
    is_text = True
    text_lines = []
    words = []

    # Parse ugly output
    for line in out.split("\n"):
        if "!!!" in line:
            continue
        if "<s>" in line:
            is_text = False
        if is_text:
            text_lines.append(line)
        if "</s>" in line:
            is_text = True
        if not is_text:
            data = line.split(" ")
            # Drop a parenthesised numeric suffix such as "(2)" from the word.
            word = data[0].strip("(0123456789)")
            start = float(data[1])
            end = float(data[2])
            if word.isalpha():
                words.append((word, start, end))

    # Every text line is prefixed with a single space, as before.
    recognized_text = "".join(" " + line for line in text_lines)

    # Remove unrecognized words
    known_words = align(recognized_text.upper(), transcript)
    i = 0
    res_words = []
    for word in known_words:
        if i == len(words):
            break
        # Advance to the next timed word equal to the aligned word.
        while i < len(words) and word != words[i][0].upper():
            i += 1
        if i < len(words):
            res_words.append(words[i])

    return res_words
Exemplo n.º 11
0
def read_audio(audio_file, transcript):
    """Recognise ``audio_file`` with pocketsphinx and keep the words that align.

    Runs ``pocketsphinx_continuous -time yes`` on the file and parses its
    output: lines outside ``<s>`` ... ``</s>`` sections are collected as the
    recognised text, lines inside them are read as ``word start end`` records.
    The recognised text (upper-cased) is aligned against ``transcript`` with
    ``align`` and the timed words matching the aligned words, in order, are
    returned.

    :param audio_file: path of the audio file handed to pocketsphinx
    :param transcript: reference transcript passed to ``align``
    :return: list of ``(word, start, end)`` tuples
    :raises subprocess.CalledProcessError: if pocketsphinx exits non-zero
    """
    args = ['pocketsphinx_continuous', '-time', 'yes', '-infile', audio_file]
    # universal_newlines=True makes check_output return text on Python 3
    # (bytes cannot be split on a str separator) and is harmless on Python 2.
    out = subprocess.check_output(args, stderr=DEVNULL,
                                  universal_newlines=True)
    is_text = True
    text_lines = []
    words = []

    # Parse ugly output
    for line in out.split('\n'):
        if '!!!' in line:
            continue
        if '<s>' in line:
            is_text = False
        if is_text:
            text_lines.append(line)
        if '</s>' in line:
            is_text = True
        if not is_text:
            data = line.split(' ')
            # Drop a parenthesised numeric suffix such as "(2)" from the word.
            word = data[0].strip('(0123456789)')
            start = float(data[1])
            end = float(data[2])
            if word.isalpha():
                words.append((word, start, end))

    # Every text line is prefixed with a single space, as before.
    recognized_text = "".join(" " + line for line in text_lines)

    # Remove unrecognized words
    known_words = align(recognized_text.upper(), transcript)
    i = 0
    res_words = []
    for word in known_words:
        if i == len(words):
            break
        # Advance to the next timed word equal to the aligned word.
        while i < len(words) and word != words[i][0].upper():
            i += 1
        if i < len(words):
            res_words.append(words[i])

    return res_words
Exemplo n.º 12
0
    #normalisation and data format changed to channels_first
    img = np.around(np.transpose(img, (2, 0, 1)) / 255.0, decimals=15)

    #preprocess data format
    x_train = np.array([img])

    #feed to neural net input
    embedding = model.predict_on_batch(x_train)

    return embedding


#driver code
if __name__ == "__main__":
    print('Initialised Model')

    # Step 1: load the reference image, align it and store it under a label.
    print('Reading reference image')
    # replace with full reference image path. example - 'D:/img/image_1.jpg'
    image = align(cv2.imread('Reference Image Path', 1))
    cv2.imshow('Labelled Image', image)
    print('Generating embedding')
    database = modify_database(image, 'Prabodh')
    print('Embedding generated')

    # Step 2: load the test image, align it and run it against the database.
    print('Reading test image')
    # replace with full test image path. example - 'D:/img/image_2.jpg'
    image = align(cv2.imread('Test Image Path', 1))
    cv2.imshow('Test Image', image)
    print('Recognition started...')
    recognizer(database, image)
def alignweb(sntc1,sntc2):
    """Align two sentences and return the alignment via ``jsonify``.

    The alignment is placed under the single key ``'result'``.
    """
    return jsonify({'result': align(sntc1, sntc2)})
Exemplo n.º 14
0
# coding=utf-8
# run.py

import aligner as fa
import formants as fm

# Run the pipeline steps in order: two aligner steps, then the formants step.
# NOTE(review): presumably dictionary preparation, forced alignment and
# formant extraction -- confirm against the aligner/formants modules.
fa.dict()
fa.align()
fm.extract()
    sent1_parse_lst = read_json_file(args.sent1parsepath)
    sent2_parse_lst = read_json_file(args.sent2parsepath)

    if args.sentalignspath is None:  # assume 1-to-1 alignment
        sent_aligns = [
            '{}\t{}'.format(i, i) for i in range(0, len(sent1_parse_lst))
        ]
    else:
        sent_aligns = read_text_file(args.sentalignspath)

    sents_info = group_sentence_alignments(sent1_parse_lst, sent2_parse_lst,
                                           sent_aligns)

    word_aligns = []
    for sent1_parse_json, sent2_parse_json in sents_info:
        sent1_parse_result = coreNlpUtil.format_json_parser_results(
            sent1_parse_json)
        sent2_parse_result = coreNlpUtil.format_json_parser_results(
            sent2_parse_json)
        # get the alignments (only indices)
        aligns, _ = aligner.align(sent1_parse_result, sent2_parse_result)
        # convert to pharaoh format: [[1, 1], [2, 2]] -> ['1-1', '2-2']
        aligns_pharaoh = ['-'.join([str(p[0]), str(p[1])]) for p in aligns]
        # create a single line to write: ['1-1', '2-2'] -> '1-1 2-2'
        aligns_line = ' '.join(aligns_pharaoh)
        word_aligns.append(aligns_line)

    aligns_file_path = os.path.join(args.outputfolder, args.outputfilename)
    with open(aligns_file_path, 'w') as aligns_file_path:
        aligns_file_path.write('\n'.join(word_aligns))
from aligner import align

from model.utils import get_dataset, get_tokenized_lemmas


def _get_unaligned_tokens(tokens, alignment):
    """Return the tokens that no alignment pair points at.

    :param tokens: sequence of tokens
    :param alignment: iterable of 2-element pairs whose first element is a
        1-based position in ``tokens``
    :return: list of the tokens, in order, whose position is not the first
        element of any pair
    """
    # A set keeps the membership test O(1) per token.
    aligned = {a - 1 for (a, _) in alignment}
    return [token for i, token in enumerate(tokens) if i not in aligned]


if __name__ == "__main__":
    # Align every claim headline with its article headline and pickle the
    # 0-based alignments keyed by (claimId, articleId).
    df = get_dataset()
    data = {}

    for _, row in df.iterrows():
        article_hl_tok = get_tokenized_lemmas(row.articleHeadline)
        claim_hl_tok = get_tokenized_lemmas(row.claimHeadline)
        try:
            alignment = align(claim_hl_tok, article_hl_tok)
            # The aligner's index pairs are 1-based; store them 0-based.
            data[(row.claimId, row.articleId)] = [(s - 1, t - 1)
                                                  for (s, t) in alignment[0]]
        except Exception:
            # Best effort: report the failing pair and carry on.  Catching
            # Exception (not everything) keeps Ctrl-C working in this loop.
            print('Unable to align %s and %s' % (article_hl_tok, claim_hl_tok))
            print('%s %s' % (row.articleId, row.claimId))

    with open(os.path.join('..', 'data', 'pickled', 'aligned-data.pickle'),
              'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    import pickle

from aligner import align

from model.utils import get_dataset, get_tokenized_lemmas


def _get_unaligned_tokens(tokens, alignment):
    """Return the tokens that no alignment pair points at.

    :param tokens: sequence of tokens
    :param alignment: iterable of 2-element pairs whose first element is a
        1-based position in ``tokens``
    :return: list of the tokens, in order, whose position is not the first
        element of any pair
    """
    # A set keeps the membership test O(1) per token.
    aligned = {a - 1 for (a, _) in alignment}
    return [token for i, token in enumerate(tokens) if i not in aligned]


if __name__ == "__main__":
    # Align every claim headline with its article headline and pickle the
    # 0-based alignments keyed by (claimId, articleId).
    df = get_dataset()
    data = {}

    for _, row in df.iterrows():
        article_hl_tok = get_tokenized_lemmas(row.articleHeadline)
        claim_hl_tok = get_tokenized_lemmas(row.claimHeadline)
        try:
            alignment = align(claim_hl_tok, article_hl_tok)
            # The aligner's index pairs are 1-based; store them 0-based.
            data[(row.claimId, row.articleId)] = [(s - 1, t - 1) for (s, t) in alignment[0]]
        except Exception:
            # Best effort: report the failing pair and carry on.  Catching
            # Exception (not everything) keeps Ctrl-C working in this loop.
            print('Unable to align %s and %s' % (article_hl_tok, claim_hl_tok))
            print('%s %s' % (row.articleId, row.claimId))

    with open(os.path.join('..', 'data', 'pickled', 'aligned-data.pickle'), 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)