# Example #1
# 0
def get_char_pairs(question_id, user_id):
    '''
    Return every (user_char, question_char) pair -- the Cartesian
    product of the user's chars and the question's chars.

    :param question_id: id of the question
    :param user_id: id of the user
    :return: list of (user_char, question_char) tuples
    '''
    user_chars = simp.get_user_char(simp.users[user_id])
    ques_chars = simp.get_question_char(simp.questions[question_id])
    # Nested comprehension: user chars vary on the outer loop, matching
    # the original pair ordering.
    return [(uc, qc) for uc in user_chars for qc in ques_chars]
def get_ques_user_similarity(user_id, ques_id):
    '''
    Compute distances between the char distributions of a user and
    a question.

    :param user_id: id of the user
    :param ques_id: id of the question
    :return: L1 and L2 distance between user and question data
    '''
    #TODO: remove the interest tags while comparing
    ques_counts = Counter(simp.get_question_char(simp.questions[ques_id]))
    user_counts = Counter(simp.get_user_char(simp.users[user_id]))

    # Both vectors are built over the combined (question + user) counts
    # so they share the same feature space.
    combined = ques_counts + user_counts
    ques_vec = simp.get_one_feature(ques_counts, combined)
    user_vec = simp.get_one_feature(user_counts, combined)

    # 0.0 + ... keeps the float coercion of the original accumulators.
    l1_dist = 0.0 + mltrio_utils.get_L1_dist(ques_vec, user_vec)
    l2_dist = 0.0 + mltrio_utils.get_L2_dist(ques_vec, user_vec)

    return l1_dist, l2_dist
def main_fn():
    labels = []
    char_features = []
    word_features = []
    tag_features = []
    
    best_words = pickle.load(open("./feature/best_words.p", "r"))
    best_chars = pickle.load(open("./feature/best_chars.p", "r"))
    
    best_words_dict = dict([ (w,idx) for idx,w in enumerate(best_words)])
    best_chars_dict = dict([ (c,idx) for idx,c in enumerate(best_chars)])
    
    print "best_words", len(best_words)
    print "best_chars", len(best_chars)
    
    with open("../../bytecup2016data/invited_info_train_tr.txt") as f:
        training_data = f.readline().strip().split("\t")
        while training_data and len(training_data) == 3 :
            
            labels.append(training_data[2])
            
            question = simp.questions[training_data[0]]
            user = simp.users[training_data[1]]
            
            char_feature = simp.get_user_char(user)
            char_feature.extend(simp.get_question_char(question))
            # REMOVE LESS FREQUENT ITEMS
            char_feature = [best_chars_dict[c] for c in char_feature if c in best_chars and c != '' ]
            char_features.append(char_feature)
            
            word_feature = simp.get_user_words(user)
            word_feature.extend(simp.get_question_words(question))
            # REMOVE LESS FREQUENT ITEMS
            word_feature = [best_words_dict[w] for w in word_feature if w in best_words and w != '' ]
            word_features.append(word_feature)
            
            tag_feature = simp.get_user_tag(user)
            tag_feature.extend(simp.get_question_tag(question))
            tag_features.append([int(t) for t in tag_feature])
            
            if len(char_features) % 1000 == 0:
                print len(char_features)
                
            training_data = f.readline().strip().split("\t")
            
            
        
    print "features", len(char_features)
    print "labels", len(labels)
        
    pickle.dump(char_features, open("./feature/char_features.p", "wb") )
    pickle.dump(word_features, open("./feature/word_features.p", "wb") )
    pickle.dump(tag_features, open("./feature/tag_features.p", "wb") )
    pickle.dump(labels, open("./feature/labels.p", "wb") )
    
    print "max char_features", max([len(x) for x in char_features]) 
    print "max word_feature", max([len(x) for x in word_features]) 
    print "max tag_features", max([len(x) for x in tag_features]) 
    
#     max char_features 123
#     max word_feature 52
#     max tag_features 8
    print "done"