Example #1
import codecs
import re
import sys

import text  # project-local helper module (assumed importable) providing get_vocab()


def process_files(in_file, out_file, vocab_file, freq, size):
  """
  Read data from in_file, and output to out_file
  """

  sys.stderr.write('# in_file = %s, out_file = %s, freq=%d, size=%d\n' % (in_file, out_file, freq, size))
  if vocab_file == '':
    if size != -1:
      vocab_file = in_file + '.vocab.' + str(size)
    else:
      vocab_file = in_file + '.vocab.f' + str(freq)

  # load vocab
  unk = '<unk>'
  (words, vocab_map, vocab_size) = text.get_vocab(in_file, vocab_file, freq, size, unk=unk)
  unk_id = str(vocab_map[unk])
  sys.stderr.write('# vocab_size=%d, unk_id=%s\n' % (vocab_size, unk_id))

  line_id = 0
  sys.stderr.write('# Processing file %s ...\n' % (in_file))
  inf = codecs.open(in_file, 'r', 'utf-8')
  ouf = codecs.open(out_file, 'w', 'utf-8')
  for line in inf:
    indices = [str(vocab_map[token]) if token in vocab_map else unk_id for token in re.split(r'\s+', line.strip())]
    ouf.write('%s\n' % ' '.join(indices))
    line_id = line_id + 1
    if (line_id % 10000 == 0):
      sys.stderr.write(' (%d) ' % line_id)

  sys.stderr.write('Done! Num lines = %d\n' % line_id)

  inf.close()
  ouf.close()
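A minimal usage sketch for this variant, with hypothetical file names: assuming text.get_vocab behaves as above, the call below creates or loads train.txt.vocab.20000 (derived from the size argument since vocab_file is empty) and writes one line of space-separated word ids per input line to train.ids, substituting the <unk> id for out-of-vocabulary tokens.

process_files('train.txt', 'train.ids', '', freq=0, size=20000)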
Example #2
import sys

import text  # project-local helper module (assumed importable) providing get_vocab()


def process_files(in_file, vocab_file, freq, size):
    """
    Read data from in_file, and output to vocab_file
    """

    sys.stderr.write('# in_file = %s, vocab_file = %s, freq=%d, size=%d\n' % (in_file, vocab_file, freq, size))

    # load/create vocab
    (words, vocab_map, vocab_size) = text.get_vocab(in_file, vocab_file, freq, size)
    sys.stderr.write('# vocab_size=%d\n' % (vocab_size))
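A minimal usage sketch (hypothetical file names): this variant only builds or loads the vocabulary, e.g. keeping tokens that occur at least 5 times, and reports the resulting vocab_size on stderr.

process_files('train.txt', 'train.txt.vocab.f5', freq=5, size=-1)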
Example #3
import sys

import text  # project-local helper module (assumed importable) providing get_vocab()


def process_files(in_file, vocab_file, freq, size):
  """
  Read data from in_file, and output to vocab_file
  """

  sys.stderr.write('# in_file = %s, vocab_file = %s, freq=%d, size=%d\n' % (in_file, vocab_file, freq, size))

  # load/create vocab
  (words, vocab_map, vocab_size) = text.get_vocab(in_file, vocab_file, freq, size)
  sys.stderr.write('# vocab_size=%d\n' % (vocab_size))
Example #4
import codecs
import re
import sys

import text  # project-local helper module (assumed importable) providing get_vocab()


def process_files(in_file, out_file, vocab_file, freq, size):
    """
    Read data from in_file, and output to out_file
    """

    sys.stderr.write('# in_file = %s, out_file = %s, freq=%d, size=%d\n' %
                     (in_file, out_file, freq, size))
    if vocab_file == '':
        if size != -1:
            vocab_file = in_file + '.vocab.' + str(size)
        else:
            vocab_file = in_file + '.vocab.f' + str(freq)

    # load vocab
    unk = '<unk>'
    (words, vocab_map, vocab_size) = text.get_vocab(in_file,
                                                    vocab_file,
                                                    freq,
                                                    size,
                                                    unk=unk)
    unk_id = str(vocab_map[unk])
    sys.stderr.write('# vocab_size=%d, unk_id=%s\n' % (vocab_size, unk_id))

    line_id = 0
    sys.stderr.write('# Processing file %s ...\n' % (in_file))
    inf = codecs.open(in_file, 'r', 'utf-8')
    ouf = codecs.open(out_file, 'w', 'utf-8')
    token_count = 0
    unk_count = 0
    for line in inf:
        indices = []
        for token in re.split(r'\s+', line.strip()):
            token_count += 1
            if token in vocab_map:
                indices.append(str(vocab_map[token]))
            else:
                indices.append(unk_id)
                unk_count += 1

        ouf.write('%s\n' % ' '.join(indices))
        line_id = line_id + 1
        if (line_id % 10000 == 0):
            sys.stderr.write(' (%d) ' % line_id)

    sys.stderr.write(
        'Done! Num lines = %d, num tokens = %d, num unks = %d, coverage = %.2f%%\n'
        % (line_id, token_count, unk_count,
           (token_count - unk_count) * 100.0 / token_count))

    inf.close()
    ouf.close()
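To make the final coverage line concrete: a corpus of 1,000,000 tokens of which 25,000 map to <unk> would report num unks = 25000 and coverage = (1000000 - 25000) * 100.0 / 1000000 = 97.50%.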
Example #5
import codecs
import re
import sys

import text  # project-local helper module (assumed importable) providing get_vocab()


def process_files(in_file, out_file, vocab_file, freq, size):
  """
  Read data from in_file, and output to out_file
  """

  sys.stderr.write('# in_file = %s, out_file = %s, freq=%d, size=%d\n' % (in_file, out_file, freq, size))
  if vocab_file == '':
    if size != -1:
      vocab_file = in_file + '.vocab.' + str(size)
    else:
      vocab_file = in_file + '.vocab.f' + str(freq)

  # load vocab
  unk = '<unk>'
  (words, vocab_map, vocab_size) = text.get_vocab(in_file, vocab_file, freq, size, unk=unk)
  unk_id = str(vocab_map[unk])
  sys.stderr.write('# vocab_size=%d, unk_id=%s\n' % (vocab_size, unk_id))

  line_id = 0
  sys.stderr.write('# Processing file %s ...\n' % (in_file))
  inf = codecs.open(in_file, 'r', 'utf-8')
  ouf = codecs.open(out_file, 'w', 'utf-8')
  token_count = 0
  unk_count = 0
  for line in inf:
    indices = []
    for token in re.split(r'\s+', line.strip()):
      token_count += 1
      if token in vocab_map:
        indices.append(str(vocab_map[token]))
      else:
        indices.append(unk_id)
        unk_count += 1

    ouf.write('%s\n' % ' '.join(indices))
    line_id = line_id + 1
    if (line_id % 10000 == 0):
      sys.stderr.write(' (%d) ' % line_id)

  sys.stderr.write('Done! Num lines = %d, num tokens = %d, num unks = %d, coverage = %.2f%%\n' % (line_id, token_count, unk_count, (token_count-unk_count)*100.0/token_count))

  inf.close()
  ouf.close()
Example #6
import codecs
import re
import sys

import text  # project-local helper module (assumed importable) providing get_vocab()


def process_files(in_file, out_file, vocab_file, freq, size):
    """
    Read data from in_file, and output to out_file
    """

    sys.stderr.write('# in_file = %s, out_file = %s, freq=%d, size=%d\n' %
                     (in_file, out_file, freq, size))
    if vocab_file == '':
        if size != -1:
            vocab_file = in_file + '.vocab.' + str(size)
        else:
            vocab_file = in_file + '.vocab.f' + str(freq)

    # load vocab
    unk = '<unk>'
    (words, vocab_map, vocab_size) = text.get_vocab(in_file,
                                                    vocab_file,
                                                    freq,
                                                    size,
                                                    unk=unk)
    unk_id = str(vocab_map[unk])
    sys.stderr.write('# vocab_size=%d, unk_id=%s\n' % (vocab_size, unk_id))

    line_id = 0
    sys.stderr.write('# Processing file %s ...\n' % (in_file))
    inf = codecs.open(in_file, 'r', 'utf-8')
    ouf = codecs.open(out_file, 'w', 'utf-8')
    for line in inf:
        indices = [
            str(vocab_map[token]) if token in vocab_map else unk_id
            for token in re.split(r'\s+', line.strip())
        ]
        ouf.write('%s\n' % ' '.join(indices))
        line_id = line_id + 1
        if (line_id % 10000 == 0):
            sys.stderr.write(' (%d) ' % line_id)

    sys.stderr.write('Done! Num lines = %d\n' % line_id)

    inf.close()
    ouf.close()
Example #7
import codecs
import math
import re
import sys

import text  # project-local helper module (assumed importable) providing get_vocab() and aggregate_alignments()


def process_files(in_prefix, src_lang, tgt_lang, out_prefix, freq, opt, src_vocab_size, tgt_vocab_size, unk_symbol='<unk>'):
  """
  Build a bilingual dictionary from a word-aligned parallel corpus.
  """
  
  # input
  sys.stderr.write('# Input from %s.*\n' % (in_prefix))
  src_file = in_prefix + '.' + src_lang
  src_inf = codecs.open(src_file, 'r', 'utf-8')
  tgt_file = in_prefix + '.' + tgt_lang
  tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')
  align_inf = codecs.open(in_prefix + '.align', 'r', 'utf-8')

  if src_vocab_size > 0:
    src_vocab_file = in_prefix + '.' + src_lang + '.vocab.' + str(src_vocab_size)
  elif freq > 0: # note: src_vocab_size > 0 or freq > 0 must hold, or src_vocab_file is never set
    src_vocab_file = in_prefix + '.' + src_lang + '.vocab.f' + str(freq)
  (src_words, src_vocab_map, src_vocab_size) = text.get_vocab(src_file, src_vocab_file, freq, src_vocab_size, unk_symbol)
  
  if tgt_vocab_size > 0:
    tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.' + str(tgt_vocab_size)
  elif freq > 0:
    tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.f' + str(freq)
  (tgt_words, tgt_vocab_map, tgt_vocab_size) = text.get_vocab(tgt_file, tgt_vocab_file, freq, tgt_vocab_size, unk_symbol)
  
  # process corpus
  line_id = 0
  debug = True
  bi_counts = {} # bi_counts[src_id][tgt_id]
  src_counts = {}
  tgt_counts = {}
  total_count = 0 # total alignment links
  for src_line in src_inf:
    src_line = src_line.strip()
    tgt_line = tgt_inf.readline().strip()
    src_tokens = re.split(r'\s+', src_line)
    tgt_tokens = re.split(r'\s+', tgt_line)
    if opt == 1: # reversed alignment tgtId-srcId
      (t2s, s2t) = text.aggregate_alignments(align_inf.readline())
    else: # normal alignment srcId-tgtId
      (s2t, t2s) = text.aggregate_alignments(align_inf.readline())

    # process alignments
    for tgt_pos in t2s.keys():
      for src_pos in t2s[tgt_pos]:
        # aligned word pair
        src_token = src_tokens[src_pos]
        tgt_token = tgt_tokens[tgt_pos]
        if src_token in src_vocab_map and tgt_token in tgt_vocab_map: # both known
          src_id = src_vocab_map[src_token]
          tgt_id = tgt_vocab_map[tgt_token]
          if src_id not in bi_counts:
            bi_counts[src_id] = {}
            src_counts[src_id] = 0
          if tgt_id not in tgt_counts:
            tgt_counts[tgt_id] = 0
          if tgt_id not in bi_counts[src_id]:
            bi_counts[src_id][tgt_id] = 0
          
          # update
          bi_counts[src_id][tgt_id] += 1
          src_counts[src_id] += 1
          tgt_counts[tgt_id] += 1
          total_count += 1

    line_id = line_id + 1
    if (line_id % 100000 == 0):
      sys.stderr.write(' (%d) ' % line_id)
  sys.stderr.write('  num lines=%d, total links=%d\n' % (line_id, total_count))

  # output
  check_dir(out_prefix) # project-local helper, assumed to ensure the output directory exists
  dict_file = out_prefix + '.' + src_lang + '-' + tgt_lang + '.dict'
  dict_ouf = codecs.open(dict_file, 'w', 'utf-8')
  sys.stderr.write('# Output to %s*\n' % dict_file)

  # compute src_probs
  src_probs = {}
  for src_id in src_counts.keys():
    src_probs[src_id] = float(src_counts[src_id])/float(total_count)

  # compute tgt_probs
  tgt_probs = {}
  for tgt_id in tgt_counts.keys():
    tgt_probs[tgt_id] = float(tgt_counts[tgt_id])/float(total_count)

  # compute joint prob
  for src_id in bi_counts.keys():
    for tgt_id in bi_counts[src_id].keys():
      bi_count = bi_counts[src_id][tgt_id]
      if bi_count < 10: continue # skip rare word pairs
      p_src_given_tgt = float(bi_count)/float(tgt_counts[tgt_id])
      p_tgt_given_src = float(bi_count)/float(src_counts[src_id])
      
      # normalized pmi
      p_src_tgt = float(bi_count)/float(total_count) # joint
      p_src = src_probs[src_id]
      p_tgt = tgt_probs[tgt_id]
      pmi = math.log(p_src_tgt/(p_src*p_tgt))
      npmi = - pmi / math.log(p_src_tgt) 
  
      # print
      src_token = src_words[src_id]
      tgt_token = tgt_words[tgt_id]
      dict_ouf.write('%s %s %g %g %g %g %g\n' % (src_token, tgt_token, p_tgt_given_src, p_src_given_tgt, (p_src_given_tgt+p_tgt_given_src)/2, pmi, npmi))
      #dict_ouf.write('%s %s %g\n' % (src_token, tgt_token, (p_src_given_tgt+p_tgt_given_src)/2))

  #text.write_vocab(out_prefix + '.vocab.' + src_lang, src_words)
  #text.write_vocab(out_prefix + '.vocab.' + tgt_lang, tgt_words)

  src_inf.close()
  tgt_inf.close()
  align_inf.close()

  dict_ouf.close()
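A minimal usage sketch for this dictionary-extraction variant, with hypothetical paths: given train.en, train.de, and train.align (one alignment line per sentence pair, srcId-tgtId pairs since opt=0), the call below writes word-pair scores to out/train.en-de.dict. Each output line holds P(tgt|src), P(src|tgt), their average, PMI, and NPMI; NPMI = PMI / -log P(src,tgt), which rescales PMI into [-1, 1].

process_files('train', 'en', 'de', 'out/train', freq=5, opt=0,
              src_vocab_size=-1, tgt_vocab_size=-1)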
Example #8
import codecs
import math
import re
import sys

import text  # project-local helper module (assumed importable) providing get_vocab() and aggregate_alignments()


def process_files(in_prefix,
                  src_lang,
                  tgt_lang,
                  out_prefix,
                  freq,
                  opt,
                  src_vocab_size,
                  tgt_vocab_size,
                  unk_symbol='<unk>'):
    """
    Build a bilingual dictionary from a word-aligned parallel corpus.
    """

    # input
    sys.stderr.write('# Input from %s.*\n' % (in_prefix))
    src_file = in_prefix + '.' + src_lang
    src_inf = codecs.open(src_file, 'r', 'utf-8')
    tgt_file = in_prefix + '.' + tgt_lang
    tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')
    align_inf = codecs.open(in_prefix + '.align', 'r', 'utf-8')

    if src_vocab_size > 0:
        src_vocab_file = in_prefix + '.' + src_lang + '.vocab.' + str(
            src_vocab_size)
    elif freq > 0:
        src_vocab_file = in_prefix + '.' + src_lang + '.vocab.f' + str(freq)
    (src_words, src_vocab_map,
     src_vocab_size) = text.get_vocab(src_file, src_vocab_file, freq,
                                      src_vocab_size, unk_symbol)

    if tgt_vocab_size > 0:
        tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.' + str(
            tgt_vocab_size)
    elif freq > 0:
        tgt_vocab_file = in_prefix + '.' + tgt_lang + '.vocab.f' + str(freq)
    (tgt_words, tgt_vocab_map,
     tgt_vocab_size) = text.get_vocab(tgt_file, tgt_vocab_file, freq,
                                      tgt_vocab_size, unk_symbol)

    # process corpus
    line_id = 0
    debug = True
    bi_counts = {}  # bi_counts[src_id][tgt_id]
    src_counts = {}
    tgt_counts = {}
    total_count = 0  # total alignment links
    for src_line in src_inf:
        src_line = src_line.strip()
        tgt_line = tgt_inf.readline().strip()
        src_tokens = re.split(r'\s+', src_line)
        tgt_tokens = re.split(r'\s+', tgt_line)
        if opt == 1:  # reversed alignment tgtId-srcId
            (t2s, s2t) = text.aggregate_alignments(align_inf.readline())
        else:  # normal alignment srcId-tgtId
            (s2t, t2s) = text.aggregate_alignments(align_inf.readline())

        # process alignments
        for tgt_pos in t2s.keys():
            for src_pos in t2s[tgt_pos]:
                # aligned word pair
                src_token = src_tokens[src_pos]
                tgt_token = tgt_tokens[tgt_pos]
                if src_token in src_vocab_map and tgt_token in tgt_vocab_map:  # both known
                    src_id = src_vocab_map[src_token]
                    tgt_id = tgt_vocab_map[tgt_token]
                    if src_id not in bi_counts:
                        bi_counts[src_id] = {}
                        src_counts[src_id] = 0
                    if tgt_id not in tgt_counts:
                        tgt_counts[tgt_id] = 0
                    if tgt_id not in bi_counts[src_id]:
                        bi_counts[src_id][tgt_id] = 0

                    # update
                    bi_counts[src_id][tgt_id] += 1
                    src_counts[src_id] += 1
                    tgt_counts[tgt_id] += 1
                    total_count += 1

        line_id = line_id + 1
        if (line_id % 100000 == 0):
            sys.stderr.write(' (%d) ' % line_id)
    sys.stderr.write('  num lines=%d, total links=%d\n' %
                     (line_id, total_count))

    # output
    check_dir(out_prefix)  # project-local helper, assumed to ensure the output directory exists
    dict_file = out_prefix + '.' + src_lang + '-' + tgt_lang + '.dict'
    dict_ouf = codecs.open(dict_file, 'w', 'utf-8')
    sys.stderr.write('# Output to %s*\n' % dict_file)

    # compute src_probs
    src_probs = {}
    for src_id in src_counts.keys():
        src_probs[src_id] = float(src_counts[src_id]) / float(total_count)

    # compute tgt_probs
    tgt_probs = {}
    for tgt_id in tgt_counts.keys():
        tgt_probs[tgt_id] = float(tgt_counts[tgt_id]) / float(total_count)

    # compute joint prob
    for src_id in bi_counts.keys():
        for tgt_id in bi_counts[src_id].keys():
            bi_count = bi_counts[src_id][tgt_id]
            if bi_count < 10: continue
            p_src_given_tgt = float(bi_count) / float(tgt_counts[tgt_id])
            p_tgt_given_src = float(bi_count) / float(src_counts[src_id])

            # normalized pmi
            p_src_tgt = float(bi_count) / float(total_count)  # joint
            p_src = src_probs[src_id]
            p_tgt = tgt_probs[tgt_id]
            pmi = math.log(p_src_tgt / (p_src * p_tgt))
            npmi = -pmi / math.log(p_src_tgt)

            # print
            src_token = src_words[src_id]
            tgt_token = tgt_words[tgt_id]
            dict_ouf.write(
                '%s %s %g %g %g %g %g\n' %
                (src_token, tgt_token, p_tgt_given_src, p_src_given_tgt,
                 (p_src_given_tgt + p_tgt_given_src) / 2, pmi, npmi))
            #dict_ouf.write('%s %s %g\n' % (src_token, tgt_token, (p_src_given_tgt+p_tgt_given_src)/2))

    #text.write_vocab(out_prefix + '.vocab.' + src_lang, src_words)
    #text.write_vocab(out_prefix + '.vocab.' + tgt_lang, tgt_words)

    src_inf.close()
    tgt_inf.close()
    align_inf.close()

    dict_ouf.close()