Example #1
    def displayFirstNExamples(self, n):
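        # Print the first n training examples in human-readable form:
        # the source window (center word bracketed), the target n-gram,
        # the predicted next word, and the full source sentence.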
        if self.src_window < 0:
            return
        src_vocab, src_vocab_size = io_vocab.load_vocab(self.src_vocab_file)
        tgt_vocab, tgt_vocab_size = io_vocab.load_vocab(self.tgt_vocab_file)
        src_inverse_vocab = io_vocab.inverse_vocab(src_vocab)
        tgt_inverse_vocab = io_vocab.inverse_vocab(tgt_vocab)
        assert n <= self.chunk_size
        for i in xrange(n):
            example_x = self.data_x[i]
            example_y = self.data_y[i]
            sent_idx = example_x[-1]
            src_sent_vector = self.data_sm[sent_idx]
            src_sent_length = src_sent_vector[0]
            src_sent_vector = src_sent_vector[1:src_sent_length+1]
            src_window_vector = example_x[:self.src_window*2 + 1]
            tgt_gram_vector = example_x[self.src_window*2 + 1:-1]
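            # Layout of example_x, per the slicing above: the first
            # 2*src_window + 1 ids are the source window, the next ids are
            # the target n-gram, and the final entry is the index of the
            # source sentence in the sentence matrix self.data_sm.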
            src_sent_words = io_vocab.getWordsFromIndeces(src_sent_vector, src_inverse_vocab, self.tgt_vocab_size)
            src_window_words = io_vocab.getWordsFromIndeces(src_window_vector, src_inverse_vocab, self.tgt_vocab_size)
            tgt_gram_words = io_vocab.getWordsFromIndeces(tgt_gram_vector, tgt_inverse_vocab, 0)

            output = ""
            count = 0
            for w in src_window_words:
                count += 1
                if count == self.src_window + 1:
                    output += "[" + w + "] "
                else:
                    output += w + " "
            output += "|| "
            output += " ".join(tgt_gram_words) + " "
            output += "===> " + tgt_inverse_vocab[example_y]
            output += " |||| "
            output += " ".join(src_sent_words) + " "
            print output
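The pattern above, mapping id vectors back to words through an inverse vocabulary, reduces to a minimal self-contained sketch (hypothetical names and data; not io_vocab's actual API):

vocab = {'the': 1, 'cat': 2, 'sat': 3}                  # word -> id, as load_vocab presumably returns
inverse_vocab = dict((i, w) for w, i in vocab.items())  # id -> word
ids = [1, 2, 3]
print ' '.join(inverse_vocab[i] for i in ids)           # prints: the cat sat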
Example #2
def get_tfidf(input_file, vocab_file):
  (vocab_map, vocab_size) = io_vocab.load_vocab(vocab_file)
  # accumulate term and document frequencies over the input file,
  # then combine them into TF-IDF scores
  tf_map, df_map = initialize_tfidf(vocab_map)
  infile = open(input_file, 'r')
  for line in infile:
    update_tfidf(line, tf_map, df_map)
  infile.close()
  tfidf_map = compute_tfidf(tf_map, df_map)
  return tfidf_map
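The helpers initialize_tfidf, update_tfidf, and compute_tfidf are project-specific and not shown here; for orientation, a minimal sketch of the TF-IDF score itself, assuming tf is a word's count in one document and df the number of documents containing the word:

import math

def tfidf(tf, df, num_docs):
    # high when a word is frequent in this document but rare across documents
    return tf * math.log(float(num_docs) / df)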
Example #3
    # each global context (source sentence) is extended to this length
    # to ensure a uniform length
    max_src_sent_length = args.sentence_vector_length  # often around 100

    ####################################
    # LOAD VOCAB
    # <words> is a list of words as strings
    # <vocab_map> is a dict mapping each word string to an integer in 1, 2, ..., |Vocab|
    # <vocab_size> is the size of the vocab == len(words) == len(vocab_map)
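    # for illustration (assumed file format): if the vocab file lists one word
    # per line ("the", "cat", ...), load_vocab would yield
    # vocab_map == {'the': 1, 'cat': 2, ...} and vocab_size == len(vocab_map)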

    src_vocab_file = args.vocab_file + '.' + \
        str(args.vocab_size) + '.vocab.' + src_lang
    tgt_vocab_file = args.vocab_file + '.' + \
        str(args.vocab_size) + '.vocab.' + tgt_lang
    (src_vocab_map, src_vocab_size) = io_vocab.load_vocab(
        src_vocab_file)
    (tgt_vocab_map, tgt_vocab_size) = io_vocab.load_vocab(
        tgt_vocab_file)

    #######################################
    # LOAD VALID NGRAMS, LOAD TEST NGRAMS
    # <valid_set_x> is a list of lists; each inner list is an n-gram of words, with each word represented by an integer,
    #       e.g. [128, 11, 13, 33, 17, 22, 0, 0, 11, 3]
    # <valid_set_y> is a list of integers, each representing the next word that follows the corresponding n-gram in valid_set_x
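    # for illustration: the pairing is positional, so if valid_set_x[i] is the
    # n-gram above and valid_set_y[i] == 42, then word id 42 (an arbitrary
    # illustrative id) is the next word after that context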

    src_valid_file = args.valid_file + '.' + \
        str(args.vocab_size) + '.id.' + src_lang
    tgt_valid_file = args.valid_file + '.' + \
        str(args.vocab_size) + '.id.' + tgt_lang
    # valid_set_sm is the sentence matrix
    (valid_set_x, valid_set_y, valid_set_sm) = io_read_ngram.get_all_joint_ngrams_with_src_global_matrix(src_valid_file, tgt_valid_file, args.valid_file + '.align',