Example #1
def HardEM(file, outcome):
    words = loadWords(file)
    T = loadT("T")
    iniT = initTransfer(T)  # randomly initialize the transition probabilities
    iniE = initEmission(T, words)  # randomly initialize the emission probabilities
    sentences = loadData(file)

    Y = []
    for i in sentences:
        Y.append(viterbi(i, iniT, iniE))

    print(sentences[50])
    print(Y[50])

    for i in range(maxLoop):
        print(i)
        new_T = transfer(Y)
        new_E = emission(sentences, Y)
        new_Y = []
        for j in sentences:
            new_Y.append(viterbi(j, new_T, new_E))
        print(sentences[50])
        print(new_Y[50])
        if diff(Y, new_Y) < threshold:
            Y = new_Y
            break
        Y = new_Y
    with open(outcome, mode='w', encoding="utf-8") as output:
        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                output.write(sentences[i][j] + ' ' + Y[i][j] + ' \n')
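
The convergence helper diff is not shown in this listing. A minimal sketch consistent with its use in the loop above, assuming it returns the fraction of tags that changed between two labelings (the name is from the call site; the semantics are an inference, not from the original source):

def diff(old, new):
    # assumed helper: fraction of tags that differ between two labelings
    changed, total = 0, 0
    for old_sent, new_sent in zip(old, new):
        for old_tag, new_tag in zip(old_sent, new_sent):
            total += 1
            if old_tag != new_tag:
                changed += 1
    return changed / total if total else 0.0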
Example #2
def Perceptron(s):
    global alpha_emision
    global alpha_trans
    tags = ["GENE","NOGENE","*","STOP"]
    n = len(tags)
    z = viterbi.viterbi(s,alpha_emision,alpha_trans, tags, n)
    c1 = {}
    c2 = {}
    c3 = {}
    c4 = {}
    # trigram sequences from the Viterbi output and from the gold training tags
    Viterbiseq = {}
    for i in range(len(z) - 2):
        Viterbiseq[trigram(tags[z[i]], tags[z[i + 1]], tags[z[i + 2]])] = 0
    tagSeq = {}
    t1 = []
    for i in s:
        t = i.split(" ")
        t1.append(t[1])
    for i in range(len(t1)-2):
        tagSeq[trigram(t1[i],t1[i+1],t1[i+2])] = 0
    # print(tagSeq)
    # print(Viterbiseq)
    for i in range(1,len(s)):        
        for u in range(n):
            for v in range(n):
                for w in range(n):
                    c1[trigram(tags[w],tags[u],tags[v])] = 0
                    c2[trigram(tags[w],tags[u],tags[v])] = 0
    for i in range(1,len(s)):        
        for u in range(n):
            for v in range(n):
                for w in range(n):                                      
                    if trigram(tags[w],tags[u],tags[v]) in tagSeq :
                        c2[trigram(tags[w],tags[u],tags[v])] += 1
                    if trigram(tags[w],tags[u],tags[v]) in Viterbiseq :
                        c1[trigram(tags[w],tags[u],tags[v])] += 1
                    if c1.get(trigram(tags[w],tags[u],tags[v])) != c2.get(trigram(tags[w],tags[u],tags[v])):
                        # perceptron update: gold count (c2) minus predicted count (c1)
                        alpha_trans[tags[w],tags[u],tags[v]] += c2.get(trigram(tags[w],tags[u],tags[v]))-c1.get(trigram(tags[w],tags[u],tags[v]))
    wordTagSeq={}
    wordTagVit={}           
    for i in range(1,len(z)):
        t =s[i-1].split(" ")
        wordTagSeq[words(t[1],t[0])]=0
        wordTagVit[words(tags[z[i]],t[0])]=0
    # print(wordTagSeq)
    # print("\n")
    # print(wordTagVit)
    for i in s:
        for j in tags: 
            c3[words(j,i)] =0
            c4[words(j,i)] =0
    for i in s:
        for j in tags: 
            if words(j,i) in  wordTagSeq:
                c3[words(j,i)] += 1
            if words(j,i) in  wordTagVit:
                c4[words(j,i)] += 1
            if c3.get(words(j,i)) != c4.get(words(j,i)):
                alpha_emision[words(j,i)] += c3.get(words(j,i)) - c4.get(words(j,i))
Example #3
def cwsSent(sent, model, cwsInfo):
    (initProb, tranProb), (vocab, indexVocab) = cwsInfo
    vec = pp.sent2vec2(sent, vocab, ctxWindows=5)
    vec = np.array(vec)
    # infer the [B,E,M,S] tag sequence from the input
    # per-batch probabilities of each class for the input data
    emit_P = model.predict_proba(vec)
    # per-batch class predictions for the input data
    classes = model.predict_classes(vec)
    # print(classes)  # [3 0 2 3 0 1 2 3 0 1 2 3]

    prob, path = viterbi.viterbi(vec, tags, initProb, tranProb,
                                 emit_P.transpose())
    assert len(path) == len(sent)

    result = ''
    for i, t in enumerate(path):
        if tags[t] == 'B':
            result += sent[i]
        elif tags[t] == 'M':
            result += sent[i]
        elif tags[t] == 'E':
            result += sent[i] + ' '
        else:
            result += sent[i] + ' '
    return result.strip()
Example #4
def start_tagging(filename, outname, outpath, sepnum=50, test=False):
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    file_type = "test" if test else "validation"

    print("========== " + file_type + " set tagging ==========")
    print("Loading model parameters...")
    pi, transition, emission = load_model('model')

    print("Reading the " + file_type + " set for HMM+Viterbi tagging...")
    with open(filename, 'r', encoding='utf-8-sig') as fr:
        for line in fr:
            sent_list = []
            sent_temp = ""
            line = line.rstrip()
            for car in line:
                sent_temp = sent_temp + car
                if car == '。':
                    if len(sent_temp) > sepnum:
                        f = math.floor(len(sent_temp) / sepnum)
                        m = len(sent_temp) % sepnum
                        i = 0
                        while i < f:
                            sent_list.append(sent_temp[sepnum * i:sepnum * (i + 1)])
                            i += 1
                        if m != 0:
                            sent_list.append(sent_temp[sepnum * i:sepnum * i + m])
                        sent_temp = ""
                    else:
                        sent_list.append(sent_temp)
                        sent_temp = ''

            if sent_temp != '':
                sent_list.append(sent_temp)

            seg = []
            # HMM + Viterbi word segmentation
            for sent in sent_list:
                pos_list = viterbi(sent, pi, transition, emission)

                # map the tagged result back onto the original sentence
                seglist = cut(sent, pos_list)
                seg.append(seglist)

            # append the result to the output file
            with open(outpath + '/' + outname + '.txt',
                      'a',
                      encoding='utf-8-sig') as outp:
                for line in seg:
                    for word in line:
                        outp.write(word + ' ')
                outp.write('\n')
    print("========== " + file_type + " set tagging complete ==========")
    print()
Example #5
def tpl_model(pinyin):
    for p in pinyin:
        if p not in py2ch:
            return 'wrong spelling!!!!!'
    if len(pinyin) < 3:
        return dbl_model(pinyin)
    weights = []
    ch_pairs = get_pairs(pinyin[0], pinyin[1])
    xs = np.array([freq_dbl(c) for c in ch_pairs])

    win = pinyin[:2]
    for i, p in enumerate(pinyin[2:]):
        win.append(p)  # window of 3 ch
        w = np.ndarray((len(py2ch[win[0]]) * len(py2ch[win[1]]),
                        len(py2ch[win[1]]) * len(py2ch[win[2]])))
        for ip, prev in enumerate(get_pairs(win[0], win[1])):
            for ic, cur in enumerate(get_pairs(win[1], win[2])):
                if not cur[0] == prev[1]:
                    w[ip][ic] = 0
                else:
                    w[ip][ic] = freq_cond_tpl(prev + cur[1])
        weights.append(w)
        win = win[1:]

    path = viterbi.viterbi(xs, weights)

    ans = get_pairs(pinyin[0], pinyin[1])[path[0]][0]
    for i, p in enumerate(path):
        ans += get_pairs(pinyin[i], pinyin[i + 1])[p][1]
    return ans
Example #6
def decode(word, modelfile, verbose=False, letters=False):
    if os.path.getsize(modelfile) > 0:
        with open(modelfile, "rb") as data:
            unpickler = pickle.Unpickler(data)
            d = unpickler.load()
    else:
        print("Error reading model file")
        return None
    states = d[0]
    start_p= d[1]
    trans_p = d[2]
    emit_p = d[3]
    word = list(word)
    if verbose:
        print("Segment word:", word)

    estimate = viterbi(word, states, start_p, trans_p, emit_p)

    final = ""
    if not letters:
        for i in range(len(estimate)):
            if estimate[i] == 'w':
                final = final + word[i]
            elif estimate[i] == '|':
                final = final + word[i]
                final = final + ' ' 
            else:
                break
    else:
        final = "".join(estimate)

    if verbose:
        print(estimate)
    print(final)
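
A hypothetical invocation (the word and the model filename are placeholders; the pickle must hold the 4-tuple (states, start_p, trans_p, emit_p) unpacked above):

# hypothetical usage; 'segment.model' is a made-up filename
decode("insurmountable", "segment.model", verbose=True)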
Example #7
    def __init__(self, s):
        self.c1 = {}
        self.c2 = {}
        self.c3 = {}
        self.c4 = {}
        global trigram
        global words
        global tags
        global n
        self.z = viterbi.viterbi(s, alpha_emision, alpha_trans, tags, n)
        self.wordTagSeq = {}
        self.wordTagVit = {}
        self.Viterbiseq = {}
        # print(self.z)

        for i in range(len(self.z) - 2):
            self.Viterbiseq[trigram(tags[self.z[i]], tags[self.z[i + 1]],
                                    tags[self.z[i + 2]])] = 0

        self.tagSeq = {}
        t1 = []
        for i in s:
            t = i.split(" ")
            t1.append(t[1])
        for i in range(len(s) - 2):
            self.tagSeq[trigram(t1[i], t1[i + 1], t1[i + 2])] = 0
        # print(self.Viterbiseq, "--> viterbi seq")
        # print(self.tagSeq,"--> ti seq")

        for i in range(1, len(self.z)):
            t = s[i - 1].split(" ")
            self.wordTagSeq[words(t[1], t[0])] = 0
            self.wordTagVit[words(tags[self.z[i]], t[0])] = 0
Example #8
 def viterbi_decode(self, seq):
     node_potentials, edge_potentials = self.build_potentials(seq)
     viterbi_path, _ = viterbi(node_potentials, edge_potentials)
     res = viterbi_path
     new_seq = seq.copy_sequence()
     new_seq.y = res
     return new_seq
Example #9
def cwsSent(sent, model, cwsInfo):
    (initProb, tranProb), (vocab, indexVocab) = cwsInfo
    vec = cws.sent2vec(sent, vocab, ctxWindows=7)
    vec = np.array(vec)
    probs = model.predict_proba(vec)
    #classes = model.predict_classes(vec)

    prob, path = viterbi.viterbi(vec, cws.corpus_tags, initProb, tranProb,
                                 probs.transpose())

    ss = ''
    word = ''
    for i, t in enumerate(path):
        if cws.corpus_tags[t] == 'S':
            ss += sent[i] + ' '
            word = ''
        elif cws.corpus_tags[t] == 'B':
            word += sent[i]
        elif cws.corpus_tags[t] == 'E':
            word += sent[i]
            ss += word + ' '
            word = ''
        elif cws.corpus_tags[t] == 'M':
            word += sent[i]

    return ss
Example #10
def main():
    print("hello world")

    fasta_list = h.process_fasta(c.genome_file, c.fna_exten)
    seq = h.get_seq_list(fasta_list)[0]

    ginfo_list = h.process_gff(c.genome_file, c.gff_exten)
    ginfo_list = sorted(ginfo_list, key=lambda ginfo: ginfo.start)

    viterbi_output = open(c.results_folder + "viterbi" + c.text_exten, "wt")
    viterbi_intervals = v.viterbi(seq, viterbi_output)
    viterbi_output.close()

    viterbi_eval_output = open(
        c.results_folder + "viterbi_eval" + c.text_exten, "wt")
    evaluate(viterbi_intervals, ginfo_list, viterbi_eval_output)
    viterbi_eval_output.close()

    baum_welch_output = open(c.results_folder + "baum_welch" + c.text_exten,
                             "wt")
    baum_welch_intervals = bw.baum_welch(seq, baum_welch_output)
    baum_welch_output.close()

    baum_welch_eval_output = open(
        c.results_folder + "baum_welch_eval" + c.text_exten, "wt")
    evaluate(baum_welch_intervals, ginfo_list, baum_welch_eval_output)
    baum_welch_eval_output.close()

    print("done")
Example #11
def evaluate(model, examples, gold, label=None):
    output = list(model.predict({'input': examples },
                                batch_size=config.batch_size)['output'])
    pred = np.argmax(np.asarray(output), axis=2).flatten()
    vpred = viterbi.viterbi(np.concatenate(output), *viterbi_probabilities)
    return (common.classification_summary(gold, pred) + '\n' +
            'w/viterbi ' + common.classification_summary(gold, vpred))
Example #12
def seg(inp):
	"segmenter main function"
	tail = ""
	if len(inp) % 2 != 0:
		tail = inp[-1]
		inp = inp[:-1]
		
	#load wubimap
	for line in wubi:
		wubi_map[line[0]] = int(line[1])
	
	#all segments here
	seg = []
	all_seg(inp,seg,0,tail)
	
	#find viterbi path every segment
	vit = []
	for segline in seg:
		ans = vt.viterbi(segline,uni_map, big_map, wd_map, wubi)
		if ans is not None:
			#ans[1] = ans[1] - 20*len(ans[0])
			vit.append(ans)
	
	vit.sort(key=lambda path: path[1], reverse=False)
	
	
	#return max viterbi path
	if len(vit) > 0:
		return vit[-1][0]
	else:
		return False
Example #13
def pinyin(input_path, output_path):
	fin = codecs.open(input_path)
	fout = codecs.open(output_path, 'w')
	
	# load initial probability & transition probability
	print('loading data...')
	path = '../data/init_prob.json'
	with codecs.open(path) as f:
		init_prob = json.load(f)

	word_list = list(init_prob.keys())

	path = '../data/trans_prob.json'
	with codecs.open(path) as f:
		trans_prob = json.load(f)

	# apply viterbi
	for line in fin:
		line = line.rstrip('\n')
		#print(line)
		fout.write(viterbi.viterbi(init_prob, trans_prob, line))

	fin.close()
	fout.close()
Example #14
 def viterbi_decode(self, seq):
     node_potentials, edge_potentials = self.build_potentials(seq)
     viterbi_path, _ = viterbi(node_potentials, edge_potentials)
     res = viterbi_path
     new_seq = seq.copy_sequence()
     new_seq.y = res
     return new_seq
Example #15
def evaluate():
    global possible_tags
    global strings
    global cca_length
    get_words()
    get_strings()
    get_alpha()
    get_phi()
    get_regExp()
    #    get_codeWords()
    get_cca()
    #    cca_length = len(cca1['amended'])
    cca_length = 20
    data = open('inputs/eng.test{0}'.format(sys.argv[2]), 'r')
    s = 'outputs_cca_pos_egw30_rounding_currentOnly/result_{0}_{1}.txt'.format(
        sys.argv[2], sys.argv[1])
    output = open(s, 'w')
    line = data.readline()
    output.write('{0}\n\n'.format(line.strip()))
    line = data.readline()
    vals = get_sentence(data)
    sentence = vals[0]
    correct_tags = vals[1]
    POS = vals[2]
    count = 0
    time1 = 0.0
    time2 = 0.0
    avg_time = 0.0
    time_val = 0.0
    first = True
    while sentence:
        #------------------------
        #-------TIME-STATS-------
        #------------------------
        count += 1
        time2 = time()
        if not first:
            avg_time = (avg_time * (count - 1) + (time2 - time1)) / count
            time_val = int((avg_time) * (number_of_sentences - count))
        first = False
        progress = open('progress_test.txt', 'w')
        progress.write(
            'Percent complete:\n{0}/{1} = {2}%\n\nTime remaining: \n{3} h {4} min {5} sec'
            .format(int(count), int(number_of_sentences),
                    float(count * 100) / float(number_of_sentences),
                    time_val / 3600, (time_val % 3600) / 60, time_val % 60))
        time1 = time2
        progress.close()
        #--------------------------
        #--------------------------
        tags = viterbi.viterbi(sentence, POS, phi, possible_tags, alpha,
                               strings, Words, regExp, codes, cca1, cca_length)
        for i in range(len(sentence)):
            output.write('{0} {1} {2} {3}\n'.format(sentence[i], POS[i][0],
                                                    correct_tags[i], tags[i]))
        output.write('\n')
        vals = get_sentence(data)
        sentence = vals[0]
        correct_tags = vals[1]
        POS = vals[2]
Example #16
def learning(T):
    weight = defaultdict(lambda : uniform(-1.0, 1.0))
    data = []
    possible_tags, transition = set(["<s>", "</s>"]), set()
    for line in iter(sys.stdin.readline, ""):
        X, Y = [], []
        pre_y = "<s>"
        for x_y in line.rstrip().split():
            (x, y) = x_y.split('_')
            X.append(x)
            Y.append(y)
            possible_tags.add(y)
            transition.add(" ".join([pre_y, y]))
            pre_y = y
        transition.add(" ".join([pre_y, "</s>"]) )
        data.append((X, Y))
    data_size = len(data)
    for t in range(T):
        for line_num, (X, Y_prime) in enumerate(data):
            sys.stdout.write("\rIteration %d, linenum %d / %d" % (t+1, line_num+1, data_size))
            sys.stdout.flush()
            Y_hat = viterbi(weight, X, possible_tags, transition)
            phi_prime = create_feature(X, Y_prime)
            phi_hat = create_feature(X, Y_hat)
            update_weight(weight, phi_prime, phi_hat)
    return (weight, possible_tags, transition)
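
The create_feature and update_weight helpers are not included in this listing. A minimal sketch consistent with the calls above, assuming transition ("T prev next") and emission ("E tag word") count features and a plain structured-perceptron update; the feature-name format is an assumption:

from collections import defaultdict

def create_feature(X, Y):
    # assumed helper: count transition and emission features for a tagged sentence
    phi = defaultdict(int)
    pre_y = "<s>"
    for x, y in zip(X, Y):
        phi["T " + pre_y + " " + y] += 1  # transition feature
        phi["E " + y + " " + x] += 1      # emission feature
        pre_y = y
    phi["T " + pre_y + " </s>"] += 1
    return phi

def update_weight(weight, phi_prime, phi_hat):
    # standard perceptron update: add gold features, subtract predicted ones
    for name, count in phi_prime.items():
        weight[name] += count
    for name, count in phi_hat.items():
        weight[name] -= count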
Example #17
def learning(T):
    weight = defaultdict(lambda: uniform(-1.0, 1.0))
    data = []
    possible_tags, transition = set(["<s>", "</s>"]), set()
    for line in iter(sys.stdin.readline, ""):
        X, Y = [], []
        pre_y = "<s>"
        for x_y in line.rstrip().split():
            (x, y) = x_y.split('_')
            X.append(x)
            Y.append(y)
            possible_tags.add(y)
            transition.add(" ".join([pre_y, y]))
            pre_y = y
        transition.add(" ".join([pre_y, "</s>"]))
        data.append((X, Y))
    data_size = len(data)
    for t in range(T):
        for line_num, (X, Y_prime) in enumerate(data):
            sys.stdout.write("\rIteration %d, linenum %d / %d" %
                             (t + 1, line_num + 1, data_size))
            sys.stdout.flush()
            Y_hat = viterbi(weight, X, possible_tags, transition)
            phi_prime = create_feature(X, Y_prime)
            phi_hat = create_feature(X, Y_hat)
            update_weight(weight, phi_prime, phi_hat)
    return (weight, possible_tags, transition)
Example #18
    def prediction_structured(features, edge_feat):
      features  = features[np.newaxis, :, np.newaxis, :]
      edge_feat = edge_feat[np.newaxis, :, np.newaxis, :]

      unary_lgts = session.run(unary_logits, feed_dict={unary_features: features})
      edge_lgts = session.run(edge_logits, feed_dict={edge_features: edge_feat})

      return viterbi(unary_lgts.reshape([-1,2]), edge_lgts.reshape([-1,4]), lam=lamb)
Example #19
def tagging(set_,tags,word_tag):
    global full_tags
    global init_table
    global full_cpd_tags
    global full_cpd_word_tag
    tagset=[]
    for i in range(len(set_)):
        tagset.append(viterbi(set_[i], set(full_tags), init_table, tags, word_tag))
    return combine_tag_word(set_,tagset)
Example #20
 def decode(self, initials):
     timer = Timer()
     states = set()
     for obs in initials:
         states.update(self.words_by_letter[obs])
     logger.info("Searching %s possible states", len(states))
     result = viterbi.viterbi(initials, states, self.start_p, self.transition_p, self.emission_p)
     logger.info("Decoding %r took %s s", initials, timer.elapsed())
     return result
Example #21
def IBM1_VB(e, f, Lambda, nr_it=10, alpha=0.01):

    aer_values = []
    elbo_values = []

    # load test set
    count_e = count_words(e)
    count_f = count_words(f)
    theta = init_lexicon(e, f, init="uniform")

    [e_val, f_val] = load_train('data', 'test')
    e_val, f_val = replace_singletons(e_val, count_e), replace_singletons(
        f_val, count_f)

    print('--Performing EM--')
    for it in range(nr_it):
        print('Expectation...')
        count_f_e = defaultdict(lambda: defaultdict(float))

        for sentnr, (e_sent, f_sent) in enumerate(zip(e, f)):
            if sentnr % 10000 == 0:
                print('#sent:', sentnr)
            for f_w in f_sent:
                sum_pi_t = sum([theta[e_word][f_w] for e_word in e_sent])
                for e_w in e_sent:
                    pi_t = theta[e_w][f_w]

                    # Update counts
                    count_f_e[e_w][f_w] += pi_t / sum_pi_t

        print('Maximization')
        for e_w, f_words in Lambda.items():
            X = digamma(sum(Lambda[e_w].values()))
            for f_w, p in f_words.items():
                Lambda[e_w][f_w] = alpha + count_f_e[e_w][f_w]
                theta[e_w][f_w] = math.exp(digamma(Lambda[e_w][f_w]) - X)

        elbo = calculate_elbo(e, f, count_e, count_f, theta, alpha)
        print('ELBO:', elbo)
        elbo_values.append(elbo)

        # Create NAACL file for current run
        output_naacl(viterbi(e_val, f_val, theta),
                     'AER/naacl_IBM1VB_it{}.txt'.format(it + 1))
        aer_values.append(
            cmdline(
                'perl data/testing/eval/wa_eval_align.pl data/testing/answers/test.wa.nonullalign AER/naacl_IBM1VB_it{}.txt'
                .format(it + 1)))
        os.system(
            'perl data/testing/eval/wa_eval_align.pl data/testing/answers/test.wa.nonullalign AER/naacl_IBM1VB_it{}.txt'
            .format(it + 1))

    # pickle.dump(elbo_values, open( "ELBO_IBM_VI.p", "wb" ) )
    pickle.dump(aer_values, open("AER_IBM_VI.p", "wb"))
    return theta, elbo_values
Example #22
def tune_params(sentences):
    shuffle(sentences)

    tags = get_tags(sentences)

    for delta in [7, 5, 4, 3, 2, 1, 0.8]:
        for sigma in [0.07, 0.05, 0.04, 0.03, 0.02, 0.01, 0.008]:
            print('Delta, Sigma:', delta, sigma)

            accuracy_results = []
            precision_results = []
            recall_results = []

            for i in range(5):
                # print('Cross-validation:', i)
                training, testing = split_data(sentences, i)

                # print('Training size:', len(training))
                # print('Testing size:', len(testing))

                trs_model, ems_model = build_models(training, delta, sigma)

                map_unk = lambda s: [
                    w if w in ems_model.tokens else 'UNK' for w in s
                ]

                # testing = testing[:5]
                preds = [
                    viterbi(trs_model, ems_model, map_unk(s.words))[0]
                    for s in testing
                ]

                labels = [s.tags for s in testing]

                accuracy, precisions, recalls = eval_metric(
                    tags, preds, labels)

                accuracy_results.append(accuracy)
                precision_results.append(precisions)
                recall_results.append(recalls)

            print('accuracy:', round(mean(accuracy_results), 4))

            precisions, recalls = {}, {}
            for t in precision_results[0].keys():
                precisions[t] = mean([p[t] for p in precision_results])
                recalls[t] = mean([r[t] for r in recall_results])

            print('precisions:', round(mean(precisions.values()), 4))
            print('recalls:', round(mean(recalls.values()), 4))

            print('NN:', round(precisions['NN'], 4), round(recalls['NN'], 4))
            print('VB:', round(precisions['VB'], 4), round(recalls['VB'], 4))
            print('JJ:', round(precisions['JJ'], 4), round(recalls['JJ'], 4))
            print('NNP:', round(precisions['NNP'], 4),
                  round(recalls['NNP'], 4))
Example #23
def predictCategory(hmm, stc):
    # preprocess: split and lower sentence
    words = nltk.word_tokenize(stc)
    words = [word.lower() for word in words]
    # use DP to max observed prob
    pos = viterbi(*hmm, stc=words)
    print('HMM tagged:\n', pos)
    # use nltk
    taggedWords = nltk.pos_tag(words, tagset='universal')
    print('NLTK tagged:\n', taggedWords)
Example #24
def t_BMES():
    PI, A, B = build()
    S = B.keys()
    for k in S:
        if k not in PI:
            PI[k] = 0.0
    for sen in samples:
        Y = tuple(sen)
        prob, X = viterbi(Y, S, PI, A, B)
        print u''.join(sen[i] + (X[i] in 'ES' and '|' or '') for i in xrange(len(sen)))
Example #25
def main(args):
    train_set = load_dataset(args.training_file, args.case_sensitive)
    test_set = load_dataset(args.test_file, args.case_sensitive)
    if args.baseline:
        print("You are running the baseline algorithm!")
        accuracy = compute_accuracies(test_set, baseline(train_set, strip_tags(test_set)))
    else:
        print("You are running the Viterbi algorithm!")
        accuracy = compute_accuracies(test_set, viterbi(train_set, strip_tags(test_set)))
    print("Accuracy:", accuracy)
Example #26
def find_CpG_islands_example2(gene):
    # Hidden and observable states
    S = np.array(["AI", "CI", "GI", "TI", "AN", "CN", "GN", "TN"])
    SY = np.array(["A", "C", "G", "T"])

    # Transition matrix
    m = [[1.85152516e-01, 2.75974026e-01, 4.00289017e-01, 1.37026750e-01,
          3.19045117e-04, 3.19045117e-04, 6.38090233e-04, 2.81510397e-04],
         [1.89303979e-01, 3.58523577e-01, 2.52868527e-01, 1.97836007e-01,
          4.28792308e-04, 5.72766368e-04, 3.75584503e-05, 4.28792308e-04],
         [1.72369088e-01, 3.29501650e-01, 3.55446538e-01, 1.40829292e-01,
          3.39848138e-04, 4.94038497e-04, 7.64658311e-04, 2.54886104e-04],
         [9.38783432e-02, 3.40823149e-01, 3.75970400e-01, 1.86949063e-01,
          2.56686367e-04, 5.57197235e-04, 1.05804868e-03, 5.07112091e-04],
         [0.00000000e+00, 3.78291020e-05, 0.00000000e+00, 0.00000000e+00,
          2.94813496e-01, 1.94641138e-01, 2.86962055e-01, 2.23545482e-01],
         [0.00000000e+00, 7.57154865e-05, 0.00000000e+00, 0.00000000e+00,
          3.26811872e-01, 2.94079570e-01, 6.17258712e-02, 3.17306971e-01],
         [0.00000000e+00, 5.73810399e-05, 0.00000000e+00, 0.00000000e+00,
          2.57133507e-01, 2.33483327e-01, 2.94234944e-01, 2.15090841e-01],
         [0.00000000e+00, 3.11417347e-05, 0.00000000e+00, 0.00000000e+00,
          1.79565378e-01, 2.32469115e-01, 2.94623408e-01, 2.93310958e-01]]
    M = pd.DataFrame(m, columns=S, index=S)

    # Emission probability matrix
    d = np.eye(4)
    E = pd.DataFrame(np.concatenate([d, d]), columns=SY, index=S)

    # Initial probabilities
    pinizio = pd.DataFrame([[1 / 8] * 8], columns=S)

    path = viterbi(M, E, S, pinizio, gene)

    for i in range(len(gene)):
        print(gene[i] + "  ", end=" ")
    print("\n")
    for i in range(len(path)):
        print(path[i][-1] + "  ", end=" ")
    print("\n")
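
A hypothetical call (the input sequence is made up; each observed symbol must be one of A, C, G, T so it matches the emission alphabet SY):

# hypothetical usage with a short made-up sequence
find_CpG_islands_example2("CGCGCGATATAT")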
Example #27
def menu():
    print "1 - EBI Web Service Access"
    print "2 - Hidden Markov Model"
    print "3 - Phylogenetic Tree"
    option = raw_input("Select an option: ")
    if option == "1":
        print "1a - Global Alignment Tool"
        print "1b - Local Alignment Tool"
        option2 = raw_input("Select an option: ")
        if option2 == "1a":
            globalAlign()
        elif option2 == "1b":
            localAlign()
    if option == "2":
        viterbi()
    if option == "3":
        print "3a - UPGMA"
        option2 = raw_input("Select an option: ")
        if option2 == "3a":
            upgma()
Example #28
def t_BMES():
    PI, A, B = build()
    S = B.keys()
    for k in S:
        if k not in PI:
            PI[k] = 0.0
    for sen in samples:
        Y = tuple(sen)
        prob, X = viterbi(Y, S, PI, A, B)
        print u''.join(sen[i] + (X[i] in 'ES' and '|' or '')
                       for i in xrange(len(sen)))
Example #29
def main():
    mode = sys.argv[1]
    test = float(sys.argv[2])
    data = pickle.load(open(sys.argv[3], 'rb'))

    if mode == '-c':
        train_method = HMM.count
    elif mode == '-l':
        train_method = HMM.learn
    else:
        print("INVALID MODE")
        return

    print("Data preprocessing...")

    states = list(set(list(itertools.chain(*data))[1::2]))

    random.shuffle(data)
    train = data[int(len(data)*test):]
    test  = data[:int(len(data)*test)]

    vocab = list(set(list(itertools.chain(*train))[0::2]))

    state_map = {state:i for i,state in enumerate(states)}
    vocab_map = {word:i for i,word in enumerate(vocab)}

    for i,sentence in enumerate(train):
        words_i = [vocab_map[x] for x in sentence[0::2]]
        states_i =[state_map[x] for x in sentence[1::2]]
        train[i] = [words_i, states_i]

    print("Training HMM...")
    A,B,pi = train_method(train, len(vocab), len(states))

    correct = 0
    total = 0

    for sentence in test:
        words =  sentence[0::2]
        states = sentence[1::2]

        for i,word in enumerate(words):
            if word in vocab_map:
                words[i] = vocab_map[word]
            else:
                words[i] = '<UNK>'

        p_states = viterbi(A,B,pi,words)
        total += len(states)
        for i in range(len(states)):
            if states[i] == p_states[i]:
                correct += 1

    print('Accuracy: '+str(correct/total))
Example #30
def evaluate():
    global possible_tags
    global strings
    global cca_length
    get_words()
    get_strings()
    get_alpha()
    get_phi()
    get_regExp()
#    get_codeWords()
    get_cca()
#    cca_length = len(cca1['amended'])
    cca_length = 20
    data = open('inputs/eng.test{0}'.format(sys.argv[2]), 'r')
    s = 'outputs_cca_pos_egw30_rounding_currentOnly/result_{0}_{1}.txt'.format(sys.argv[2], sys.argv[1])
    output = open(s, 'w')
    line = data.readline()
    output.write('{0}\n\n'.format(line.strip()))
    line = data.readline()
    vals = get_sentence(data)
    sentence = vals[0]
    correct_tags = vals[1]
    POS = vals[2]
    count = 0
    time1 = 0.0
    time2 = 0.0
    avg_time = 0.0
    time_val = 0.0
    first = True
    while sentence:
#------------------------
#-------TIME-STATS-------
#------------------------
        count += 1
        time2 = time()
        if not first:
            avg_time = (avg_time*(count-1)+(time2-time1))/count
            time_val = int((avg_time)*(number_of_sentences-count))
        first = False
        progress = open('progress_test.txt', 'w')
        progress.write('Percent complete:\n{0}/{1} = {2}%\n\nTime remaining: \n{3} h {4} min {5} sec'.format(int(count), int(number_of_sentences), float(count*100)/float(number_of_sentences), time_val/3600, (time_val%3600)/60, time_val%60))
        time1 = time2
        progress.close()
#--------------------------
#--------------------------
        tags = viterbi.viterbi(sentence, POS, phi, possible_tags, alpha, strings, Words, regExp, codes, cca1, cca_length)
        for i in range(len(sentence)):
            output.write('{0} {1} {2} {3}\n'.format(sentence[i], POS[i][0], correct_tags[i], tags[i]))
        output.write('\n')
        vals = get_sentence(data)
        sentence = vals[0]
        correct_tags = vals[1]
        POS = vals[2]
Example #31
 def test_trellis(self):
     _, actual, _ = viterbi(self.obs, self.A, self.B, self.pi)
     expected = LMatrix(("H", "L"),
                        xrange(len(self.obs)),
                        data = np.array([
                            [ -2.737, -5.474, -8.211, -11.533, -14.007, -17.329, -19.54, -22.862, -25.657],
                            [ -3.322, -6.059, -8.796, -10.948, -14.007, -16.481, -19.54, -22.014, -24.487]
                        ])
     )
     for s in actual.rlabels:
         for t in actual.clabels:
             self.assertAlmostEqual(actual[s,t], expected[s,t], 3)
Example #32
    def _do_viterbi(self, sentence):
        word_list = []
        viterbi_results = np.array([])
        for word, tag in sentence:
            if word in self.word_to_index:
                word_list.append(self.word_to_index[word])
            else:
                # If we encountered an OOV word call Viterbi on the previous words
                if len(word_list) != 0:
                    viterbi_results = np.concatenate(
                        (viterbi_results,
                         viterbi(word_list, self.A, self.B, self.Pi)))
                viterbi_results = np.append(viterbi_results, None)
                word_list = []
        # Call Viterbi on the last chunk of the sentence (it may be the full sentence, if there
        # wasn't an OOV word in this sentence)
        if len(word_list) != 0:
            viterbi_results = np.concatenate(
                (viterbi_results, viterbi(word_list, self.A, self.B, self.Pi)))

        return viterbi_results
Example #33
def tagger(sentence):
    Pi, A, B = load_parameters("data/HMMTagger.parameters.npz")
    with open("data/Taggerindex.pkl", "rb") as f:
        word_index = pickle.load(f)
        label_index = pickle.load(f)
    obs = map_obs(word_index, sentence)
    prob, route = viterbi(obs, Pi, A, B)
    sequence = [label_index[i] for i in route]
    result = ''
    for word, tag in zip(sentence, sequence):
        result += ''.join(' ' + word + '/' + tag + ' ')
    return result.strip()
Example #34
    def evaluate(self, data):
        word_level_acc = 0.0
        sentence_level_acc = 0.0
        count_words = 0
        for sentence in data:
            word_list = []
            begin_index = 0
            correct_sentence = True
            for i in range(len(sentence)):
                count_words += 1
                word = sentence[i][0]
                label = sentence[i][1]
                if word not in self.unique_words:  # OOV word , breaks the sentence.
                    if len(word_list) > 0:
                        seq = vt.viterbi(word_list, self.A, self.B, self.Pi)
                        word_acc, sentence_acc = self.comparisons(
                            begin_index, i, sentence,
                            seq)  # Computes segment accuracy
                        correct_sentence = correct_sentence and sentence_acc
                        word_level_acc += word_acc
                    word_list.clear()
                    rand_label = np.random.randint(0, len(
                        self.A))  # Assign random label
                    begin_index = i + 1
                    if rand_label == label:
                        word_level_acc += 1
                else:
                    word_list.append(self.unique_words[word])
            # If we reach the end of the sentence
            if len(word_list) > 0:
                seq = vt.viterbi(word_list, self.A, self.B, self.Pi)
                word_acc, sentence_acc = self.comparisons(
                    begin_index, len(sentence), sentence, seq)
                correct_sentence = correct_sentence and sentence_acc
                word_level_acc += word_acc
            if correct_sentence:
                sentence_level_acc += 1

        return word_level_acc / count_words, sentence_level_acc / len(data)
Example #35
def perceptron(print_alpha=0, mult=0, import_alpha=0):
    global alpha
    global alpha_average
    global possible_tags
    global strings
    global strings_abr
    global add_factor
    global mult_factor
    init_phi_alpha(mult)
    get_strings()
    if import_alpha:
        read_alpha()
    alpha_average = copy.deepcopy(alpha)
    for t in range(T_DEFAULT):
        print '---{0}---'.format(t)
        sys.stdout.flush()
        dont_repeat = True
        data = open(sys.argv[2], 'r')
        vals = get_sentence_and_tags(data)
        j = 0
        while vals:
            sentence = vals[0]
            correct_tags = vals[1]
            result = viterbi.viterbi(sentence, phi, possible_tags, alpha, strings, strings_abr, mult)
            z = result[0]
            indices = result[1]
            if not z == correct_tags:
                dont_repeat = False
                correct_indices = get_indices(sentence, correct_tags)
                if mult:
                    for i in indices:
                        alpha[i] = float(alpha[i])/mult_factor
                    for i in correct_indices:
                        alpha[i] = float(alpha[i])*mult_factor
                else:
                    for i in indices:
                        alpha[i] += -1*add_factor
                    for i in correct_indices:
                        alpha[i] += add_factor
            else:
                j += 1
            for i in range(len(alpha)):
                alpha_average[i] += alpha[i]
            vals = get_sentence_and_tags(data)
        data.close()
        if dont_repeat:
            print 'SUCCESS!!!'
            break
#        print 'number correct: {0}'.format(j)
        if print_alpha:
            write_alpha(t)
Example #36
def perceptron(print_alpha=0):
    global possible_tags
    global strings
    global strings_abr
    global add_factor
    get_regExp()
    get_strings()
    get_tags()
    get_phi()
    for t in range(T_DEFAULT):
        print '---{0}---'.format(t)
        sys.stdout.flush()
        dont_repeat = True
        data = open(sys.argv[1], 'r')
        vals = get_sentence_and_tags(data)
        j = 0
        examp_num = 0
        while vals:
            examp_num += 1
            sentence = vals[0]
            correct_tags = vals[1]
            tags = viterbi(sentence, phi, possible_tags, alpha, strings, strings_abr, Words, regExp)
            indices = get_indices(sentence, tags, examp_num)
            correct_indices = get_indices(sentence, correct_tags, examp_num)
            if not tags == correct_tags:
                dont_repeat = False
                for i in indices:
                    alpha[i] += -1*add_factor
                for i in correct_indices:
                    alpha[i] += add_factor
            else:
                j += 1
            for i in set(indices) | set(correct_indices):
                val1 = alpha_average[i][0]+(examp_num - alpha_average[i][1])*alpha_average[i][2]
                val2 = examp_num
                val3 = alpha[i]
                alpha_average[i] = (val1,val2,val3)
            vals = get_sentence_and_tags(data)
        data.close()
        if dont_repeat:
            print 'SUCCESS!!!'
            break
        print 'number correct: {0}'.format(j)
        for i in alpha:
            val1 = alpha_average[i][0]+(examp_num+1 - alpha_average[i][1])*alpha_average[i][2]
            val2 = 1
            val3 = alpha[i]
            alpha_average[i] = (val1,val2,val3)
        if print_alpha:
            write_alpha(t)
Example #37
def testTag():
    prob_start = load_model("prob_mat/prob_start.pkl")
    prob_trans = load_model("prob_mat/prob_trans.pkl")
    prob_emit = load_model("prob_mat/prob_emit.pkl")

    test_str_list = []
    test_str_list.append(u"长春市长春节讲话。")
    test_str_list.append(u"他说的确实在理.")
    test_str_list.append(u"毛主席万岁。")
    test_str_list.append(u"我有一台电脑。")
    for test_str in test_str_list:
        pos_list = viterbi(test_str, ('B', 'M', 'E', 'S'), prob_start,
                           prob_trans, prob_emit)
        print(test_str, '\n', pos_list)
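
load_model here is plain deserialization; a minimal sketch, assuming the prob_mat/*.pkl files were written with pickle (the implementation is an assumption, inferred from the call site):

import pickle

def load_model(path):
    # assumed helper: unpickle one probability table from disk
    with open(path, "rb") as f:
        return pickle.load(f)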
Example #38
 def run(self):
     if self.isTest:
         print "Running HMM"
         h = HiddenMarkovModel(self.train_file,smoothed=self.smoothing)
         print "Running Viterbi"
         toc = time.clock()
         predicted = viterbi(h, self.test_file, test=False)
         tic = time.clock()
         print "Viterbi ran in %f seconds"%(tic-toc)
         actual, tokens = zip(*self.parse_file(self.test_answers))
         return (predicted,actual,tokens)
     else:
         print "Splitting Data"
         (train,test) = self.splitCV(self.parse_file(self.train_file),self.cv_validation_percentage)
         print "Converting Lists"
         train_text = "".join(["%s %s\n" % (p,t) for [p,t] in train])
         test_text = "".join(["%s\n" % t for [p,t] in test])
         print "Running HMM"
         h = HiddenMarkovModel(text=train_text, smoothed=self.smoothing)
         print "Running Viterbi"
         predicted = viterbi(h,text=test_text, test=False)
         actual = self.getActual(test)
         return (predicted,actual)
Example #39
def tagging(set_, tags, word_tag):
    global full_tags
    global init_table
    global full_cpd_tags
    global full_cpd_word_tag
    tagset = []
    init_table = {}
    for tag in full_tag_set:
        init_table[tag] = 1e-20
    init_table['<s>'] = 1.0
    for i in range(0, len(set_)):
        tagset.append(
            viterbi(dict_tags, dict_words, set_[i], full_tag_set, init_table,
                    tags, word_tag))
    return combine_tag_word(set_, tagset)
Example #40
def t_wordseg():
    PI, A, B = build(True)
    for k in B.keys():
        if '|' == k[-1]:
            B[k[:-1]] = {k[:-1]: 1.0}
        else:
            B[k + '|'] = B[k]
    S = B.keys()
    for k in S:
        if k not in PI:
            PI[k] = 0.0
    for sen in samples:
        Y = tuple(sen)
        prob, X = viterbi(Y, S, PI, A, B)
        print u''.join(X)
Example #41
def t_wordseg():
    PI, A, B = build(True)
    for k in B.keys():
        if '|' == k[-1]:
            B[k[:-1]] = {k[:-1]: 1.0}
        else:
            B[k + '|'] = B[k]
    S = B.keys()
    for k in S:
        if k not in PI:
            PI[k] = 0.0
    for sen in samples:
        Y = tuple(sen)
        prob, X = viterbi(Y, S, PI, A, B)
        print u''.join(X)
Example #42
def __cut(sentence):
    prob, pos_list = viterbi.viterbi(sentence, char_state_tab_P, start_P, trans_P, emit_P)
    begin, next = 0, 0

    for i, char in enumerate(sentence):
        pos = pos_list[i][0]
        if pos == 'B':
            begin = i
        elif pos == 'E':
            yield pair(sentence[begin:i + 1], pos_list[i][1])
            next = i + 1
        elif pos == 'S':
            yield pair(char, pos_list[i][1])
            next = i + 1
    if next < len(sentence):
        yield pair(sentence[next:], pos_list[next][1])
Example #43
def viterbi_run(training, test_file):
	# returns a list of sentence lists of (word, part-of-speech) tuples
	corpus_list = viterbi.corpus_list(training)

	# creates a dictionary of corpus part-of-speech tag : occurrences
	corpus_dictionary = viterbi.corpus_dictionary(training)

	# pos keys
	keys = viterbi.key_list(corpus_dictionary)

	# creates the prior-probabilities transition table for the entire corpus
	prior_probabilities_table = viterbi.transition_table(corpus_dictionary, corpus_list)

	# creates a word dictionary
	# word: list of parts of speech, incrementing occurrences of the word as each part of speech
	word_dic = viterbi.word_dic(corpus_list, keys)

	# word keys
	words = viterbi.key_list(word_dic)

	# likelihood table
	likelihood_table = viterbi.word_freq(corpus_dictionary, word_dic)

	# emissions and transitions
	sentences = viterbi.corpus_list_2(test_file)

	error_list = []
	error_list_i = []
	new_sentences = []
	count = 0
	for sentence in sentences:
		trans = viterbi.sentence_tag(sentence, keys, words, likelihood_table)
		s_pos = viterbi.sentence_pos(trans)
		transition_table = viterbi.transition_probabilities(trans, s_pos, prior_probabilities_table, keys)
		observed_like = viterbi.observed_likelihoods(sentence, s_pos, trans, likelihood_table, words, keys)
		vit_sent = viterbi.viterbi(observed_like, sentence, s_pos, transition_table)
Example #44
def evaluate():
    global possible_tags
    global strings
    global strings_abr
    get_words()
    get_strings()
    get_alpha()
    get_phi()
    get_regExp()
    data = open(sys.argv[4], 'r')
    output = open(sys.argv[5], 'w')
    sentence = get_sentence(data)
    while sentence:
        tags = viterbi.viterbi(sentence, phi, possible_tags, alpha, strings, strings_abr, Words, regExp)
        for i in range(len(sentence)):
            output.write('{} {}\n'.format(sentence[i], tags[i]))
        output.write('\n')
        sentence = get_sentence(data)
Example #45
def gen_couplet(transition_prob_tree, output_prob_tree, unigram_freq, first_half):
    assert type(first_half) == unicode
    couplet_length = len(first_half)
    visible_words = np.array([first_half[i] for i in range(couplet_length)])
    hidden_candidate_words = np.array([u' ' for _ in range(top_k_word*couplet_length)]).reshape(top_k_word, couplet_length)
    output_prob = np.random.rand(top_k_word, couplet_length)
    for i in range(couplet_length):
        key = first_half[i]
        if not output_prob_tree.has_key(key):
            print '%s, Cannot generate couplet' % key
            return ''

        hash_leaf = output_prob_tree[key]
        hidden_candidate_words[:,i], output_prob[:,i] = gen_candidates(first_half, hash_leaf, top_k_word)


    try:
        transition_prob, init_prob = init_model(transition_prob_tree, unigram_freq, hidden_candidate_words, top_k_word)
    except:
        return ''

    optimal_path, prob = viterbi(transition_prob, output_prob, init_prob, [], visible_words, top_k_word, top_k_candidate)
    optimal_path = deal_repeat(first_half, optimal_path)

    results = []
    for i in range(optimal_path.shape[0]):
        second_half = ''
        for j in range(optimal_path.shape[1]):
            second_half += hidden_candidate_words[optimal_path[i, j], j]
        score = ranking_function(output_prob_tree, first_half, second_half)
        results.append((score, second_half))


    results = sorted(results, reverse=True)[:top_k_output]
    return results
Example #46
def k_fold_cross_valid_known(k, parsed, known, discounts):
    res = defaultdict(list)
    for train, test in _fold(parsed, k):
        for discount in discounts:
            print 'train: ', len(train), 'test: ', len(test)
            tag2id, word2id = build_dict(parsed)
            id2tag = {v: k for k, v in tag2id.iteritems()}
            id2word = {v: k for k, v in word2id.iteritems()}
            emission, transition = _counter_known(parsed, train, known,
                                                  0.85, tag2id, word2id, discount)

            count_ok, count_total = 0., 0.
            for i, seq in enumerate(test):
                out = viterbi(seq, transition, emission, word2id, tag2id)
                ok, total = _compare(seq[1:-1], id_to_token(out, id2word, id2tag))
                count_ok += ok
                count_total += total
                if DEBUG:
                    print 'evaluating', i, 'th sentence.', count_ok/count_total, 'so far.'
            res[discount].append(count_ok/count_total)
            print 'Fold accuracy: ', res[discount][-1], 'discount: ', discount
    for d in res:
        print 'discount:', d, '->', 'avg:', np.mean(res[d])
Example #47
def perceptron(print_alpha=0):
    global possible_tags
    global strings
    global strings_abr
    global add_factor
    get_regExp()
    get_strings()
    get_tags()
    for t in range(T_DEFAULT):
        print '---{0}---'.format(t)
        sys.stdout.flush()
        dont_repeat = True
        data = open(sys.argv[1], 'r')
        vals = get_sentence_and_tags(data)
        j = 0
        while vals:
            sentence = vals[0]
            correct_tags = vals[1]
            tags = viterbi(sentence, phi, possible_tags, alpha, strings, strings_abr, Words, regExp)
            indices = get_indices(sentence, tags)
            if not tags == correct_tags:
                dont_repeat = False
                correct_indices = get_indices(sentence, correct_tags)
                for i in indices:
                    alpha[i] += -1*add_factor
                for i in correct_indices:
                    alpha[i] += add_factor
            else:
                j += 1
            for i in alpha:
                alpha_average[i] += alpha[i]
            vals = get_sentence_and_tags(data)
        data.close()
        if dont_repeat:
            print 'SUCCESS!!!'
            break
        print 'number correct: {0}'.format(j)
        if print_alpha:
            write_alpha(t)
Example #48
def test_model(corpus):
	cp = corpus.corpus_sentence

	word_list = list()
	pos_list = list()
	
	for paragraph in cp:
		text = []
		pos = []

		for tp in paragraph:
			text.append(tp[0])
			pos.append(tp[1])

		word_list.append(text)
		pos_list.append(pos)

	initp, trans_bi, emiss = corpus.get_statistics_model(tri_gram=False)
	_, trans_tri, emiss = corpus.get_statistics_model(tri_gram=True)

	bigram_result = []
	trigram_result = []

	count = 0
	for paragraph in word_list:
		pos_bi = vtb.viterbi(paragraph, corpus.pos_list_sentence, initp, trans_bi, emiss)
		# pos_tri = vtb.viterbi_trigram(paragraph, corpus.pos_list_sentence, initp, trans_tri, emiss)

		bigram_result.append(pos_bi)
		# trigram_result.append(pos_tri)

		print(count)
		count += 1
		if count == 1000:
			break

	tp, tn, fp, fn, other = evaluate_sentence(pos_list[0:1000], bigram_result)
	write_results_to_file("test/test_model_orchid_bigram", word_list[0:1000], pos_list, bigram_result, tp, tn, fp, fn, other, test_text="bigram model test")
Example #49
H = [Health(0,'Healthy'), Health(1,'Fever')]
 
observed = [Symptom('normal'), 
            Symptom('cold'),
            Symptom('dizzy')]
 
start_p = [0.6, 0.4]  # index 0 'Healthy'
                      # index 1 'Fever'

'''transition_probability = {
   'Healthy' : {'Healthy': 0.7, 'Fever': 0.3},
   'Fever' : {'Healthy': 0.4, 'Fever': 0.6},
   }'''
'''emission_probability = {
   'Healthy' : {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
   'Fever' : {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6},
}'''

T = numpy.matrix(
    [[0.7, 0.3],
     [0.4, 0.6]])

def health_p(state, emission):
    e_p = [{'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
           {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}]
    return e_p[state.i][emission.feeling]

xpath = viterbi.viterbi(observed,H,T,start_p,health_p)
for x in xpath:
    print x
Example #50
 def test_backtrace(self):
     _, _, actual = viterbi(self.obs, self.A, self.B, self.pi)
     expected = [{'H': 'H', 'L': 'H'}, {'H': 'H', 'L': 'H'}, {'H': 'H', 'L': 'H'},
                 {'H': 'L', 'L': 'L'}, {'H': 'H', 'L': 'L'}, {'H': 'L', 'L': 'L'},
                 {'H': 'H', 'L': 'L'}, {'H': 'L', 'L': 'L'}]
     self.assertEqual(actual, expected)
Example #51
 def test_state_sequence(self):
     actual, _, _ = viterbi(self.obs, self.A, self.B, self.pi)
     expected = ("H", "H", "H", "L", "L", "L", "L", "L", "L")
     self.assertEqual(actual, expected)
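
The fixture itself is not shown, but the expected values match the classic two-state H/L (high/low GC content) DNA example with base-2 log probabilities: the first trellis entry is log2(0.5 * 0.3) ≈ -2.737. A setUp sketch consistent with these tests, under that assumption:

 def setUp(self):
     # assumed fixture: the textbook H/L (high/low GC) HMM over DNA
     self.obs = "GGCACTGAA"
     self.A = {"H": {"H": 0.5, "L": 0.5},
               "L": {"H": 0.4, "L": 0.6}}
     self.B = {"H": {"A": 0.2, "C": 0.3, "G": 0.3, "T": 0.2},
               "L": {"A": 0.3, "C": 0.2, "G": 0.2, "T": 0.3}}
     self.pi = {"H": 0.5, "L": 0.5}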
Example #52
generate_d_p(d_p, B)

dprint("\nProbabilities:")
if DEBUG:
    for b in B:
        dprint("{0}\nwith e={1} and d={2} (sums: {3}, {4})".format(
            b, e_p[b.i], d_p[b.i], sum(e_p[b.i].values()), sum(d_p[b.i].values())))


# print "\nSafety check:"
# print accent_p(B[7],S[1])
# print duration_p(B[7],S[0])

# Two emission functions
# xpath = viterbi.viterbi(S,B,T,start_p,accent_p,duration_p)
# One emission function
xpath = viterbi.viterbi(S,B,T,start_p, accent_p)

print "\nAnd they said, in great unison, that The Path shalt be:"

sounder = Sounder(5)
sendlist = [(-1,1,b) for b in range(0,5)]
for x in xpath:
    print x
    # print "Hidden state, transition values ",T[x.i]
    sendlist[x.origin] = (ra.randint(60,80),x.duration,x.origin)

print sendlist
sounder.set_notes(sendlist)
sounder.send_notes()
sounder.close()
Example #53
def main():
    global mainConfig,vit
    loadConfig()
    if (platform.system() == "Windows"):
        # in case we're using... winblows
        mainConfig['path_separator'] = "\\"
        saveConfig()
    while True:
        drawScreen()
        ans = raw_input("* Select an option: ")
        if ans == "": break

        elif (ans == '1'):
            print "Refreshing all tables from file..."
            build.refreshAll(mainConfig['learnFile'])
            vit = viterbi.viterbi(build.startProbs, build.transProbs, build.obsProbs)
            print "Refreshing complete"
            wait()
        elif (ans == '2'):
            print "Unpickling tables..."
            build.unpickleTables()
            vit = viterbi.viterbi(build.startProbs, build.transProbs, build.obsProbs)
            print "Unpickling complete."
            wait()
        elif (ans == '3'):
            print "Training text file must be in "+mainConfig['path_separator']+"train directory!"
            f = raw_input("Select a new training file: ")
            newPath = "train"+mainConfig['path_separator']+f
            if (not os.path.isfile(newPath)):
                print "ERROR: Not a valid file"
            else:
                mainConfig['learnFile'] = f
                saveConfig()
                print "Updated successfully"
            wait()
        elif (ans == '4'):
            build.printStats()
            wait()
        elif (ans == '5'):
            testLoop()
        elif (ans == '6'):
            test.runTest(mainConfig["testFile"],mainConfig["rslts"])
            wait()
        elif (ans == '7'):
            print "Testing text file must be in "+mainConfig['path_separator']+"test directory!"
            f = raw_input("Select a new training file: ")
            newPath = "test"+mainConfig['path_separator']+f
            if (not os.path.isfile(newPath)):
                print "ERROR: Not a valid file"
            else:
                mainConfig['testFile'] = f
                saveConfig()
                print "Updated successfully"
            wait()
        elif (ans == '8'):
            r = raw_input("Select desired number of results (1-14): ")
            if (r.isdigit() and (int(r) in range(1,15))):
                mainConfig['rslts'] = int(r)
                saveConfig()
                print "Updated successfully"
            else:
                print "ERROR: Not a valid option. Enter 1-14 only."
            wait()
Example #54
def predict_one(weight, words, possible_tags, transition):
    return " ".join(viterbi(weight, words, possible_tags, transition))
Example #55
import numpy as np
from viterbi import viterbi


if __name__ == '__main__':
    n_hid = 2
    n_obs = 3

    trans_hid = np.array([[0.5, 0.5], [0.5, 0.5]])
    trans_obs = np.array([[0.5, 0.4, 0.1], [0.4, 0.1, 0.5]])

    solver = viterbi(n_hid, n_obs, trans_hid, trans_obs)

    obs = np.array( [0,1,1,0,2,0,2,2,2,0,2,2,2,2,2,0,0,1,1,2] )

    mlp = solver.get_MLP(obs)

    print mlp
Example #56
def main():

    if dimension==1 :
#        gmm = np.zeros(number_of_components*size)
#        mu = np.zeros(number_of_components)
#        sigma = np.zeros(number_of_components)
#        for i in range(number_of_components) :
#            gmm[i*size:(i+1)*size], mu[i], sigma[i] = create_data(dimension,size,i)
        gmm = np.zeros((1,number_of_components*size),dtype=float)
        mu = np.zeros((number_of_components,1),dtype=float)
        sigma = np.zeros((number_of_components,1,1),dtype=float)
        matrix = np.zeros((number_of_components,number_of_components),dtype=float)

#        for i in range(number_of_components):
#            x, mu[i,0], sigma[i,0,0] = create_data(dimension,size,i)
    else:
        gmm = np.zeros((dimension,number_of_components*size),dtype=float)
        mu = np.zeros((number_of_components,dimension),dtype=float)
        sigma = np.zeros((number_of_components,dimension,dimension),dtype=float)
        matrix = np.zeros((number_of_components,number_of_components),dtype=float)

#        for i in range(number_of_components):
#            x, mu[i,:], sigma[i,:,:] = create_data(dimension,size,i)

    weights = np.array([0.6, 0.4])
    matrix = np.array([[0.7, 0.3], [0.1, 0.9]])
    model = hmm.GaussianHMM(2, "full", weights, matrix)
    model.means_ = mu
    model.covars_ = sigma
    gmm, Z = model.sample(number_of_components*size)

#    else :
#        gmm = np.zeros((dimension,number_of_components*size))
#        mu = np.zeros((number_of_components,dimension))
#        sigma = np.zeros((number_of_components,dimension,dimension))
#        for i in range(number_of_components) :
#            gmm[:,i*size:(i+1)*size], mu[i,:], sigma[i,:,:] = create_data(dimension,size,i)

    means, variances, pi, a = emHMM_algorithm(gmm,dimension,number_of_components,number_of_components*size)

#    num_bins = 50
#    n, bins, patches = plt.hist(gmm, num_bins, normed=1, facecolor='green', alpha=0.5)
#    # add a 'best fit' line
#    for i in range(number_of_components) :
#        y = mlab.normpdf(bins, means[i], variances[i])
#        plt.plot(bins, y, 'r--')
#        plt.xlabel('Values')
#        plt.ylabel('Probability')
#        plt.title('Data Histogram vs predicted distribution')
#
#    # Tweak spacing to prevent clipping of ylabel
#    plt.subplots_adjust(left=0.15)
#    plt.show()

    b = np.zeros((number_of_components,number_of_components*size))

    #Evaluate posterior
    if dimension==1:
        for i in range(number_of_components) :
        # Calculate the probability of seeing the observation given each state
            pdf = pi[i]*mlab.normpdf(gmm, means[i], variances[i,0])
            b[i,:] = pdf[:,0]

    else:
        centered_data = np.zeros((number_of_components,number_of_components*size,dimension))
        den = np.zeros((number_of_components,number_of_components*size))
        num = np.zeros((number_of_components,number_of_components*size))
        for i in range(number_of_components) :
        # Calculate the probability of seeing the observation given each state
            for n in range(number_of_components*size):
                centered_data[i, n, :] = gmm[n, :]-means[i, :]
                den[i,n] = np.sqrt((2*math.pi)**(dimension)*np.linalg.det(variances[i,:,:]))
                num[i,n] = np.exp((-0.5)*np.dot(np.dot(centered_data[i,n,:][np.newaxis],np.linalg.inv(variances[i,:,:])),centered_data[i,n,:][:,np.newaxis]))  # -0.5, not -1/2: Python 2 integer division would give -1
                b[i,n] = num[i,n] / den[i,n]


    # Predict
    path, x, y = viterbi(size*number_of_components,a,b,pi)
    plt.figure()
    plt.plot(path[0,:],'ro')
    plt.plot(path[0,:],'r')
    plt.plot(Z,'g')
    plt.show()
    if dimension==1:
        print "initial means: ", mu[:,0], "\n", "initial variances: ", sigma[:,0,0], "\n", "initial weights: ", weights, "\n"
        print "means:", means, "\n" "sigmas:", variances, "\n", "weights:", pi, "\n"
        print "initial mixing matrix:", matrix, "\n"
        print "mixing matrix:", a, "\n"
    else:
        print "initial means: ", mu, "\n", "initial variances: ", sigma, "\n", "initial weights: ", weights, "\n"
        print "means:", means, "\n" "sigmas:", variances, "\n", "weights:", pi, "\n"
        print "initial mixing matrix:", matrix, "\n"
        print "mixing matrix:", a, "\n"
Example #57
File: test.py Project: amm385/POS
turned
against
many
dictators
,
but
none
quite
so
resourceful
.
"""

use_filename = False

#cProfile.run("viterbi(hmm, filename='../test-big-sample.pos')")

tic = time.clock()
if use_filename:
    pos = viterbi(hmm, filename=filename)
    toc = time.clock()
    with open(output_filename, 'w') as fp:
        fp.write('\n'.join(pos))
else:
    text = string 
    #"\n".join(string.split(' '))
    pos = viterbi(hmm, text=text, test=False)
    toc = time.clock()
    print str(pos)
print str(toc-tic)
Example #58
 def tag_sen_feats(self, sen_feats):
     logTagProbsByPos = self.getLogTagProbsByPos(sen_feats)
     _, bestTagging = viterbi(self.transProbs, logTagProbsByPos,
                              self.lmw)
     return bestTagging
 def viterbi_decode(self, seq):
     node_potentials, edge_potentials = self.build_potentials(seq)
     viterbi_path, _ = viterbi(node_potentials, edge_potentials)
     res = viterbi_path
     new_seq = seq.update_from_sequence(res)
     return new_seq