示例#1
0
        f_in.close()
        return tbl

if __name__ == "__main__":
        # Baseline tagger: for every word in the test file, emit the tag with
        # the highest score in the table built by gen_prior.
        #   argv[1]: input handed to gen_prior
        #   argv[2]: test file, one word per line (blank lines are kept)
        #   argv[3]: output file, one "word tag score" line per word
        prior_tbl = gen_prior(sys.argv[1])
        # Context managers guarantee both files are closed (and the output is
        # flushed) even if a lookup below raises.
        with open(sys.argv[2], 'r') as test, open(sys.argv[3], 'w') as output:
                for line in test:
                        line = line.strip('\n')
                        if not line:
                                # blank input lines are copied through as blank lines
                                output.write('\n')
                                continue
                        alias = line
                        if alias not in prior_tbl:
                                # unseen word: look it up under the class returned by
                                # get_type (e.g. '_RARE_') instead
                                alias = get_type(alias, '')
                        val = float('-inf')
                        tag = ''
                        # NOTE: raises KeyError if the replacement class is not in
                        # the table either.
                        # Find the tag with maximal emission probability; the first
                        # strictly greater score wins ties.
                        for cand, score in prior_tbl[alias].items():
                                if score > val:
                                        val = score
                                        tag = cand
                        # the original word (not its replacement class) is written out
                        output.write(line + ' ' + tag + ' ' + str(val) + '\n')
            

示例#2
0
def viterbi(x, S, q, e, opt_type):
        """Tag the word sequence x with a trigram Viterbi search.

        Scores are combined by addition and a missing entry counts as
        float('-inf'), so q and e are presumably log-probabilities.

        Parameters:
            x: sequence of words (needs len() and integer indexing).
            S: collection of candidate tags; it is iterated several times,
               so it must be re-iterable. The start symbol '*' has to be in
               S for any path to leave the initial ('*', '*') state.
            q: mapping (w, u, v) -> transition score of v after w, u;
               (u, v, 'STOP') holds the end-of-sentence score.
            e: mapping word -> {tag: emission score}.
            opt_type: passed through to get_type for words missing from e.

        Returns:
            A string with one "word tag score" line per word. For all but
            the last word the score is the best partial path score up to
            that word; the last line carries the full score including the
            STOP transition. An empty x yields ''.

        Raises:
            KeyError: if a replaced word class is not in e, or if no tag
                sequence with a score above -inf reaches STOP.
        """
        n = len(x)
        if n == 0:
                # nothing to tag; without this guard the final trace lookup fails
                return ''
        neg_inf = float('-inf')
        # Only the previous position is ever consulted, so keep a single layer
        # of scores/traces keyed by (u, v) instead of every position.
        prev_pi = {('*', '*'): 0}
        prev_trace = {('*', '*'): []}
        for k in range(n):
                x_k = x[k]
                if x_k not in e:
                        # replace for the rare words
                        x_k = get_type(x_k, opt_type)
                emissions = e[x_k]
                cur_pi = {}
                cur_trace = {}
                for v in S:
                        # the emission score depends on v only, so look it up once
                        e_xv = emissions.get(v, neg_inf)
                        if e_xv == neg_inf:
                                # no candidate can score above -inf for this tag
                                continue
                        for u in S:
                                max_prob = neg_inf
                                max_w = ''
                                # find argmax_w of pi(k-1,w,u) + q(v|w,u) + e(x_k|v)
                                for w in S:
                                        tmp_prob = (prev_pi.get((w, u), neg_inf)
                                                    + q.get((w, u, v), neg_inf)
                                                    + e_xv)
                                        if tmp_prob > max_prob:
                                                max_prob = tmp_prob
                                                max_w = w
                                if max_w != '':
                                        # extend the best predecessor's trace with the
                                        # tag two positions back and the score so far
                                        tl = list(prev_trace[max_w, u])
                                        tl.append((max_w, max_prob))
                                        cur_trace[u, v] = tl
                                        cur_pi[u, v] = max_prob
                prev_pi = cur_pi
                prev_trace = cur_trace
        cand_u = ''
        cand_v = ''
        max_prob = neg_inf
        # find argmax_uv(pi(n-1,u,v) + q('STOP'|u,v))
        for u in S:
                for v in S:
                        tmp_prob = prev_pi.get((u, v), neg_inf) + q.get((u, v, 'STOP'), neg_inf)
                        if tmp_prob > max_prob:
                                max_prob = tmp_prob
                                cand_u = u
                                cand_v = v
        taglist = list(prev_trace[cand_u, cand_v])
        # Entry j of the trace holds the tag of word j-2, so appending the last
        # two tags makes taglist[k+2][0] the tag of word k.
        taglist.append((cand_u, 'misc'))
        taglist.append((cand_v, 'misc'))
        # find the trace of the tags
        lines = [x[k] + ' ' + taglist[k + 2][0] + ' ' + str(taglist[k][1]) + '\n'
                 for k in range(n - 1)]
        lines.append(x[n - 1] + ' ' + cand_v + ' ' + str(max_prob) + '\n')
        return ''.join(lines)
示例#3
0
# argument3: switch for the advanced replacement for the rare words

if __name__ == "__main__":
        # Rewrites a "word tag" file so that infrequent words are replaced by
        # the class returned by get_type.
        #   argv[1]: input file
        #   argv[2]: output file
        #   argv[3]: option forwarded to get_type (replacement switch)
        # Raw string: '\S' and '\s' are regex escapes, not string escapes.
        pattern = re.compile(r'(\S+)\s(\S+)')
        # Context managers close both files even if processing fails midway.
        with open(sys.argv[1], 'r') as f_in, open(sys.argv[2], 'w') as f_out:
                opt_type = sys.argv[3]
                # first pass: count how often each word (first token) occurs
                tbl = {}
                for line in f_in:
                        match = pattern.match(line)
                        if match:
                                word = match.group(1)
                                tbl[word] = tbl.get(word, 0) + 1
                # second pass over the same file: rewind and emit the output
                f_in.seek(0, 0)
                for line in f_in:
                        match = pattern.match(line)
                        # check the word frequency and process the replacement
                        if match and tbl[match.group(1)] < 5:
                                # only the first two tokens survive on replaced lines
                                f_out.write(get_type(match.group(1), opt_type) + ' ' + match.group(2) + '\n')
                        else:
                                # frequent words and non-matching lines are copied verbatim
                                f_out.write(line)