def process_files(in_file, out_file, vocab_file, freq, size):
    """ Read data from in_file, and output to out_file """
    sys.stderr.write('# in_file = %s, out_file = %s, freq=%d, size=%d\n' %
                     (in_file, out_file, freq, size))
    if vocab_file == '':
        if size != -1:
            vocab_file = in_file + '.vocab.' + str(size)
        else:
            vocab_file = in_file + '.vocab.f' + str(freq)

    # load vocab
    unk = '<unk>'
    (words, vocab_map, vocab_size) = text.get_vocab(in_file, vocab_file, freq,
                                                    size, unk=unk)
    unk_id = str(vocab_map[unk])
    sys.stderr.write('# vocab_size=%d, unk_id=%s\n' % (vocab_size, unk_id))

    line_id = 0
    sys.stderr.write('# Processing file %s ...\n' % in_file)
    inf = codecs.open(in_file, 'r', 'utf-8')
    ouf = codecs.open(out_file, 'w', 'utf-8')
    for line in inf:
        indices = [str(vocab_map[token]) if token in vocab_map else unk_id
                   for token in re.split(r'\s+', line.strip())]
        ouf.write('%s\n' % ' '.join(indices))
        line_id += 1
        if line_id % 10000 == 0:
            sys.stderr.write(' (%d) ' % line_id)
    sys.stderr.write('Done! Num lines = %d\n' % line_id)
    inf.close()
    ouf.close()
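# Example call (hypothetical file names, for illustration only):
#   process_files('train.en', 'train.en.id', '', freq=0, size=40000)
# is expected to build/load the vocab file train.en.vocab.40000 via
# text.get_vocab and to write one line of space-separated word ids per input
# line, mapping out-of-vocabulary tokens to the id of '<unk>'.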
def process_files(in_file, vocab_file, freq, size):
    """ Read data from in_file, and output the vocab to vocab_file """
    sys.stderr.write('# in_file = %s, vocab_file = %s, freq=%d, size=%d\n' %
                     (in_file, vocab_file, freq, size))

    # load/create vocab
    (words, vocab_map, vocab_size) = text.get_vocab(in_file, vocab_file, freq, size)
    sys.stderr.write('# vocab_size=%d\n' % vocab_size)
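# text.get_vocab is a project-local helper that is not shown here. The sketch
# below is an assumption about its behavior (count token frequencies, keep
# either the `size` most frequent words or all words with count >= freq,
# reserve the unk symbol at id 0, write one word per line to vocab_file, and
# return (words, word-to-id map, vocab size)); it is not the actual
# implementation, and the name get_vocab_sketch is hypothetical.
def get_vocab_sketch(in_file, vocab_file, freq, size, unk='<unk>'):
    counts = {}
    with codecs.open(in_file, 'r', 'utf-8') as f:
        for line in f:
            for token in line.split():
                counts[token] = counts.get(token, 0) + 1
    ranked = sorted(counts, key=counts.get, reverse=True)
    if size > 0:  # keep the `size` most frequent words
        kept = ranked[:size]
    else:  # keep words occurring at least `freq` times
        kept = [w for w in ranked if counts[w] >= freq]
    words = [unk] + [w for w in kept if w != unk]
    with codecs.open(vocab_file, 'w', 'utf-8') as f:
        for w in words:
            f.write('%s\n' % w)
    vocab_map = dict((w, i) for i, w in enumerate(words))
    return (words, vocab_map, len(words))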
def process_files(in_file, out_file, vocab_file, freq, size):
    """ Read data from in_file, and output to out_file """
    sys.stderr.write('# in_file = %s, out_file = %s, freq=%d, size=%d\n' %
                     (in_file, out_file, freq, size))
    if vocab_file == '':
        if size != -1:
            vocab_file = in_file + '.vocab.' + str(size)
        else:
            vocab_file = in_file + '.vocab.f' + str(freq)

    # load vocab
    unk = '<unk>'
    (words, vocab_map, vocab_size) = text.get_vocab(in_file, vocab_file, freq,
                                                    size, unk=unk)
    unk_id = str(vocab_map[unk])
    sys.stderr.write('# vocab_size=%d, unk_id=%s\n' % (vocab_size, unk_id))

    line_id = 0
    sys.stderr.write('# Processing file %s ...\n' % in_file)
    inf = codecs.open(in_file, 'r', 'utf-8')
    ouf = codecs.open(out_file, 'w', 'utf-8')
    token_count = 0
    unk_count = 0
    for line in inf:
        indices = []
        for token in re.split(r'\s+', line.strip()):
            token_count += 1
            if token in vocab_map:
                indices.append(str(vocab_map[token]))
            else:
                indices.append(unk_id)
                unk_count += 1
        ouf.write('%s\n' % ' '.join(indices))
        line_id += 1
        if line_id % 10000 == 0:
            sys.stderr.write(' (%d) ' % line_id)
    sys.stderr.write('Done! Num lines = %d, num tokens = %d, num unks = %d, coverage = %.2f%%\n' %
                     (line_id, token_count, unk_count,
                      (token_count - unk_count) * 100.0 / token_count))
    inf.close()
    ouf.close()
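# Worked example of the coverage figure reported above: with 1,000,000 tokens
# of which 25,000 fall outside the vocab, coverage = (1000000 - 25000) * 100.0
# / 1000000 = 97.50%. (Numbers are illustrative only.)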
def process_files(in_prefix, src_lang, tgt_lang, out_prefix, freq, opt,
                  src_vocab_size, tgt_vocab_size, unk_symbol='<unk>'):
    """ Extract a bilingual dictionary with translation probabilities and PMI scores
    from an aligned parallel corpus """
    # input
    sys.stderr.write('# Input from %s.*\n' % in_prefix)
    src_file = in_prefix + '.' + src_lang
    src_inf = codecs.open(src_file, 'r', 'utf-8')
    tgt_file = in_prefix + '.' + tgt_lang
    tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')
    align_inf = codecs.open(in_prefix + '.align', 'r', 'utf-8')

    if src_vocab_size > 0:
        src_vocab_file = in_prefix + '.' + src_lang + '.vocab.' + str(src_vocab_size)
    elif freq > 0:
        src_vocab_file = in_prefix + '.' + src_lang + '.vocab.f' + str(freq)
    (src_words, src_vocab_map, src_vocab_size) = text.get_vocab(
        src_file, src_vocab_file, freq, src_vocab_size, unk_symbol)

    if tgt_vocab_size > 0:
        tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.' + str(tgt_vocab_size)
    elif freq > 0:
        tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.f' + str(freq)
    (tgt_words, tgt_vocab_map, tgt_vocab_size) = text.get_vocab(
        tgt_file, tgt_vocab_file, freq, tgt_vocab_size, unk_symbol)

    # process corpus
    line_id = 0
    debug = True
    bi_counts = {}  # bi_counts[src_id][tgt_id]
    src_counts = {}
    tgt_counts = {}
    total_count = 0  # total alignment links
    for src_line in src_inf:
        src_line = src_line.strip()
        tgt_line = tgt_inf.readline().strip()
        src_tokens = re.split(r'\s+', src_line)
        tgt_tokens = re.split(r'\s+', tgt_line)
        if opt == 1:  # reversed alignment tgtId-srcId
            (t2s, s2t) = text.aggregate_alignments(align_inf.readline())
        else:  # normal alignment srcId-tgtId
            (s2t, t2s) = text.aggregate_alignments(align_inf.readline())

        # process alignments
        for tgt_pos in t2s.keys():
            for src_pos in t2s[tgt_pos]:
                src_token = src_tokens[src_pos]
                tgt_token = tgt_tokens[tgt_pos]
                if src_token in src_vocab_map and tgt_token in tgt_vocab_map:  # both known
                    src_id = src_vocab_map[src_token]
                    tgt_id = tgt_vocab_map[tgt_token]
                    if src_id not in bi_counts:
                        bi_counts[src_id] = {}
                        src_counts[src_id] = 0
                    if tgt_id not in tgt_counts:
                        tgt_counts[tgt_id] = 0
                    if tgt_id not in bi_counts[src_id]:
                        bi_counts[src_id][tgt_id] = 0

                    # update
                    bi_counts[src_id][tgt_id] += 1
                    src_counts[src_id] += 1
                    tgt_counts[tgt_id] += 1
                    total_count += 1

        line_id += 1
        if line_id % 100000 == 0:
            sys.stderr.write(' (%d) ' % line_id)
    sys.stderr.write(' num lines=%d, total links=%d\n' % (line_id, total_count))

    # output
    check_dir(out_prefix)
    dict_file = out_prefix + '.' + src_lang + '-' + tgt_lang + '.dict'
    dict_ouf = codecs.open(dict_file, 'w', 'utf-8')
    sys.stderr.write('# Output to %s*\n' % dict_file)

    # compute src_probs
    src_probs = {}
    for src_id in src_counts.keys():
        src_probs[src_id] = float(src_counts[src_id]) / float(total_count)

    # compute tgt_probs
    tgt_probs = {}
    for tgt_id in tgt_counts.keys():
        tgt_probs[tgt_id] = float(tgt_counts[tgt_id]) / float(total_count)

    # compute joint prob
    for src_id in bi_counts.keys():
        for tgt_id in bi_counts[src_id].keys():
            bi_count = bi_counts[src_id][tgt_id]
            if bi_count < 10:
                continue
            p_src_given_tgt = float(bi_count) / float(tgt_counts[tgt_id])
            p_tgt_given_src = float(bi_count) / float(src_counts[src_id])

            # normalized pmi
            p_src_tgt = float(bi_count) / float(total_count)  # joint
            p_src = src_probs[src_id]
            p_tgt = tgt_probs[tgt_id]
            pmi = math.log(p_src_tgt / (p_src * p_tgt))
            npmi = -pmi / math.log(p_src_tgt)

            # print
            src_token = src_words[src_id]
            tgt_token = tgt_words[tgt_id]
            dict_ouf.write('%s %s %g %g %g %g %g\n' %
                           (src_token, tgt_token, p_tgt_given_src, p_src_given_tgt,
                            (p_src_given_tgt + p_tgt_given_src) / 2, pmi, npmi))
            #dict_ouf.write('%s %s %g\n' % (src_token, tgt_token, (p_src_given_tgt+p_tgt_given_src)/2))

    #text.write_vocab(out_prefix + '.vocab.' + src_lang, src_words)
    #text.write_vocab(out_prefix + '.vocab.' + tgt_lang, tgt_words)
    src_inf.close()
    tgt_inf.close()
    align_inf.close()
    dict_ouf.close()
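# Hypothetical usage sketch (file names are illustrative, not from the
# original script): given data/train.en, data/train.de, and one line of word
# alignments per sentence pair in data/train.align,
#   process_files('data/train', 'en', 'de', 'out/dict', freq=0, opt=0,
#                 src_vocab_size=50000, tgt_vocab_size=50000)
# is expected to write out/dict.en-de.dict with one
# "src_word tgt_word p(tgt|src) p(src|tgt) avg pmi npmi" line per word pair
# whose alignment count is at least 10.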