Example #1
import time

import common_io   # project-local I/O helpers, assumed importable
import conv_util   # project-local Chinese word segmentation, assumed importable


def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    # count up all word occurrences so that we can threshold out rare words
    # this shouldn't be too expensive an operation
    print('preprocessing word counts and creating vocab based on word count threshold %d' % word_count_threshold)
    t0 = time.time()
    total_sent_len = len(sentence_iterator)  # requires a sized sequence (e.g. a list), not a lazy iterator
    total_word_cnt = 0
    word_counts = {}
    for sent in sentence_iterator:
        line_seg = conv_util.segment_sentence_cn(sent)
        for w in line_seg:
            word_counts[w] = word_counts.get(w, 0) + 1
    
    for w in word_counts:
        total_word_cnt += word_counts[w]

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print('filtered words from %d to %d in %.2fs' % (len(word_counts), len(vocab), time.time() - t0))

    # with K distinct words:
    # - there are K+1 possible inputs (the START token and all the words)
    # - there are K+1 possible outputs (the END token and all the words)
    # we use ixtoword to map predicted indices back to words for output visualization
    # we use wordtoix to map raw words to their index in the word vector matrix
    ixtoword = {}
    # ixtoword[0] = '.'        # period at the end of the sentence; reserve the first index for the END token
    wordtoix = {}
    # wordtoix['#START#'] = 0  # reserve the first vector for the START token
    ix = 1
    for w in vocab:
        wordtoix[w] = ix
        ixtoword[ix] = w
        ix += 1
    misc = {}
    misc['wordtoix'] = wordtoix
    misc['ixtoword'] = ixtoword
    misc['total_sent_len'] = total_sent_len
    misc['total_word_cnt'] = total_word_cnt
    misc['avg_sent_len'] = float(total_word_cnt) / total_sent_len
    print(total_sent_len, total_word_cnt, misc['avg_sent_len'])
    print('vocab size = %d' % len(misc['ixtoword']))
    return misc
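
For a quick sanity check, here is a minimal, self-contained sketch of the same count-then-threshold idea. It swaps conv_util.segment_sentence_cn for plain whitespace splitting so it runs without the project modules; the toy sentences and threshold are made up for illustration:

from collections import Counter

sentences = ["a b a", "a c", "b a"]   # toy corpus standing in for sentence_iterator
threshold = 2

# whitespace splitting stands in for conv_util.segment_sentence_cn
counts = Counter(w for s in sentences for w in s.split())
vocab = [w for w, c in counts.items() if c >= threshold]

# index 0 is reserved for the START/END token, so real words start at 1
wordtoix = {w: ix for ix, w in enumerate(vocab, start=1)}
ixtoword = {ix: w for w, ix in wordtoix.items()}

print(vocab)      # ['a', 'b']
print(wordtoix)   # {'a': 1, 'b': 2}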


def write_ix_labels(label_list, wordtoix, nlpcaffe_data_out_dir, file_name):
    # NOTE: the name and signature of this function are reconstructed
    # (hypothetical) from the free variables used below; the original def
    # line is missing from the snippet.
    ix_label_out_file = nlpcaffe_data_out_dir + file_name

    ix_label_out = []

    for labels in label_list:
        label_split = labels.split('\t')
        if len(label_split) < 2:
            ix_label_out.append("")   # keep one output line per input line
            continue

        # map each segmented token of the label text to its vocabulary index;
        # out-of-vocabulary tokens are silently dropped
        each_label = label_split[1]
        token_label = conv_util.segment_sentence_cn(each_label)
        ix_token_label = []
        for each_token in token_label:
            if each_token in wordtoix:
                ix_token_label.append(str(wordtoix[each_token]))

        ix_label_out.append(" ".join(ix_token_label))

    print("set %s size is %d" % (file_name, len(ix_label_out)))

    common_io.write_txt_lines(ix_label_out_file, ix_label_out)
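
Putting the two pieces together, a self-contained sketch of the label conversion with made-up labels and a made-up vocabulary (whitespace splitting again stands in for conv_util.segment_sentence_cn, and common_io.write_txt_lines is skipped so nothing is written to disk):

wordtoix = {'a': 1, 'b': 2}   # toy vocabulary, e.g. from the sketch above

label_list = ["img1\ta b x", "img2\tb", "malformed-line"]

ix_label_out = []
for labels in label_list:
    parts = labels.split('\t')
    if len(parts) < 2:
        ix_label_out.append("")   # keep one output line per input line
        continue
    tokens = parts[1].split()     # stand-in for conv_util.segment_sentence_cn
    ix_label_out.append(" ".join(str(wordtoix[t]) for t in tokens if t in wordtoix))

print(ix_label_out)   # ['1 2', '2', ''] -- out-of-vocab 'x' is dropped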