def test_on_validation(path_model, n_combination_question = 3, n_combination_answer = 3, n_word_question = 5):
    """Predict an answer for each validation question and print ``id,answer``.

    A Word2Vec model is loaded from ``path_model`` and every data row of
    ``data/validation_set.tsv`` (the first line is skipped as a header) is
    processed. A row is lower-cased and split on tabs: column 0 is printed
    back as the identifier, column 1 is treated as the question and the
    remaining columns as answer choices.

    For each question, the ``n_word_question`` words with the lowest counts in
    ``util.load_d_word_count()`` are kept. Every index combination returned by
    ``util.combination_index`` over those words is summed into one vector and
    compared (dot product) with the summed vector of every word combination of
    each of the first four choices. The choice that produced the highest
    score is printed as ``A``-``D``.

    Args:
        path_model: Path handed to ``gensim.models.Word2Vec.load``.
        n_combination_question: Passed to ``util.combination_index`` for the
            question words.
        n_combination_answer: Passed to ``util.combination_index`` for the
            words of each choice.
        n_word_question: How many of the lowest-count question words to keep.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_validation = 'data/validation_set.tsv'
    labels = 'ABCD'
    d_word_count = util.load_d_word_count()
    # The context manager closes the file even if scoring a row raises.
    with open(path_validation) as f_in:
        for index, line in enumerate(f_in):
            if index == 0:
                # Header row.
                continue
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            lst_choice = [l.split(' ') for l in lst[2:]]
            # Map each normalized question word to its corpus count
            # (0 when the word is unknown).
            d = {}
            for word in question:
                word = util.norm_word(word)
                d[word] = d_word_count.get(word, 0)
            # Ascending sort: the lowest-count words come first.
            sort = sorted(d.items(), key = lambda kv : kv[1])
            question_u = [s[0] for s in sort[:n_word_question]]
            lst_com_q = util.combination_index(len(question_u), n_combination_question)
            best_score = float('-inf')
            answer_p = ''
            for com_q in lst_com_q:
                vec_q = np.sum([get_vector_from_model(model, question_u[i]) for i in com_q], axis = 0)
                # Only the first four choices carry a label; slicing also
                # tolerates rows with fewer than four choices.
                for i_choice, choice in enumerate(lst_choice[:len(labels)]):
                    choice_u = list(set(choice))
                    lst_com_choice = util.combination_index(len(choice_u), n_combination_answer)
                    for com_c in lst_com_choice:
                        vec_c = np.sum([get_vector_from_model(model, choice_u[i]) for i in com_c], axis = 0)
                        score = vec_q.dot(vec_c)
                        if score > best_score:
                            best_score = score
                            answer_p = labels[i_choice]
            # Single-argument call form prints identically on Python 2 and 3.
            print("%s,%s" % (lst[0], answer_p))
Exemplo n.º 2
0
def get_max_occurence(lst_set_sentence, question, lst_choice, d_word_count,
                      set_stopword, n_word_question, n_combination_question,
                      n_combination_answer):
    """Pick the choice whose words co-occur most with the question's words.

    The question is split on whitespace, each word is normalized with
    ``util.norm_word`` and duplicates are dropped. For every index
    combination that ``util.combination_index`` returns over those words, and
    for every combination over the normalized, de-duplicated, stop-word-free
    words of each choice, ``get_cooccurence`` is evaluated. The pair with the
    strictly highest value wins; on ties the first pair encountered is kept.

    Args:
        lst_set_sentence: Passed through to ``get_cooccurence``.
        question: Question text (a string).
        lst_choice: Answer choice strings; the first four map to ``A``-``D``.
        d_word_count: Unused; kept so existing callers keep working.
        set_stopword: Words removed from each choice before combining.
        n_word_question: Unused; kept so existing callers keep working.
        n_combination_question: Passed to ``util.combination_index`` for the
            question words.
        n_combination_answer: Passed to ``util.combination_index`` for the
            choice words.

    Returns:
        A tuple ``(answer_p, max_cooccurence, question_words, choice_words)``.
        ``answer_p`` is ``''`` and the value is ``-1`` when no combination
        was evaluated.
    """
    labels = 'ABCD'
    answer_p = ''
    max_cooccurence = -1
    lst_word_focus_q = []
    lst_word_focus_c = []
    lst_word_question_u = list(set(map(util.norm_word, question.split())))
    lst_com_q = util.combination_index(len(lst_word_question_u),
                                       n_combination_question)
    for com_q in lst_com_q:
        lst_word_question = [lst_word_question_u[i] for i in com_q]
        for index_c, choice in enumerate(lst_choice):
            # Filter stop words in choice in order to prevent them from calculating cooccurence.
            lst_word_choice_u = list(
                set(map(util.norm_word,
                        choice.split())).difference(set_stopword))
            lst_com_c = util.combination_index(len(lst_word_choice_u),
                                               n_combination_answer)
            for com_c in lst_com_c:
                lst_word_choice = [lst_word_choice_u[i] for i in com_c]
                n_cooccurence = get_cooccurence(lst_set_sentence,
                                                lst_word_question,
                                                lst_word_choice)
                if n_cooccurence > max_cooccurence:
                    max_cooccurence = n_cooccurence
                    lst_word_focus_q = lst_word_question
                    lst_word_focus_c = lst_word_choice
                    # Choices past the fourth have no label, so the
                    # previous label is left in place for them.
                    if index_c < len(labels):
                        answer_p = labels[index_c]

    return answer_p, max_cooccurence, lst_word_focus_q, lst_word_focus_c
def word_count():
    """Count word frequencies in the CK-12 text files and write them to disk.

    Every entry of ``data/ck12-multi-line-txt/`` whose name matches the
    pattern ``.*txt`` is read line by line. Each line is lower-cased and split
    on single spaces, and every token is passed through ``util.norm_word``
    before it is counted. The totals are written to
    ``data/ck-12-word-count.txt`` as ``word<TAB>count`` lines, highest count
    first.
    """
    # NOTE: with match() this accepts any name that contains "txt".
    pattern_txt = re.compile(r'.*txt')
    d_word_count = {}

    dir_txt = 'data/ck12-multi-line-txt/'
    for fn in os.listdir(dir_txt):
        if not pattern_txt.match(fn):
            continue
        # os.path.join does not double the trailing '/' of dir_txt, and the
        # context manager closes each input file once it has been read.
        with open(os.path.join(dir_txt, fn)) as f_in:
            for line in f_in:
                for word in line.strip('\n').lower().split(' '):
                    word = util.norm_word(word)
                    d_word_count[word] = d_word_count.get(word, 0) + 1

    sort = sorted(d_word_count.items(), key = lambda kv : kv[1], reverse = True)
    with open('data/ck-12-word-count.txt', 'w') as f_out:
        for word, count in sort:
            f_out.write("%s\t%d\n" % (word, count))
def word_count():
    """Count word frequencies in the CK-12 text files and write them to disk.

    Every entry of ``data/ck12-multi-line-txt/`` whose name matches the
    pattern ``.*txt`` is read line by line. Each line is lower-cased and split
    on single spaces, and every token is passed through ``util.norm_word``
    before it is counted. The totals are written to
    ``data/ck-12-word-count.txt`` as ``word<TAB>count`` lines, highest count
    first.
    """
    # NOTE: with match() this accepts any name that contains "txt".
    pattern_txt = re.compile(r'.*txt')
    d_word_count = {}

    dir_txt = 'data/ck12-multi-line-txt/'
    for fn in os.listdir(dir_txt):
        if not pattern_txt.match(fn):
            continue
        # os.path.join does not double the trailing '/' of dir_txt, and the
        # context manager closes each input file once it has been read.
        with open(os.path.join(dir_txt, fn)) as f_in:
            for line in f_in:
                for word in line.strip('\n').lower().split(' '):
                    word = util.norm_word(word)
                    d_word_count[word] = d_word_count.get(word, 0) + 1

    sort = sorted(d_word_count.items(), key=lambda kv: kv[1], reverse=True)
    with open('data/ck-12-word-count.txt', 'w') as f_out:
        for word, count in sort:
            f_out.write("%s\t%d\n" % (word, count))
def get_max_occurence(lst_set_sentence, question, lst_choice, d_word_count, set_stopword, n_word_question, n_combination_question, n_combination_answer):
    """Pick the choice whose words co-occur most with the question's words.

    The question is split on whitespace, each word is normalized with
    ``util.norm_word`` and duplicates are dropped. For every index
    combination that ``util.combination_index`` returns over those words, and
    for every combination over the normalized, de-duplicated, stop-word-free
    words of each choice, ``get_cooccurence`` is evaluated. The pair with the
    strictly highest value wins; on ties the first pair encountered is kept.

    Args:
        lst_set_sentence: Passed through to ``get_cooccurence``.
        question: Question text (a string).
        lst_choice: Answer choice strings; the first four map to ``A``-``D``.
        d_word_count: Unused; kept so existing callers keep working.
        set_stopword: Words removed from each choice before combining.
        n_word_question: Unused; kept so existing callers keep working.
        n_combination_question: Passed to ``util.combination_index`` for the
            question words.
        n_combination_answer: Passed to ``util.combination_index`` for the
            choice words.

    Returns:
        A tuple ``(answer_p, max_cooccurence, question_words, choice_words)``.
        ``answer_p`` is ``''`` and the value is ``-1`` when no combination
        was evaluated.
    """
    labels = 'ABCD'
    answer_p = ''
    max_cooccurence = -1
    lst_word_focus_q = []
    lst_word_focus_c = []
    lst_word_question_u = list(set(map(util.norm_word, question.split())))
    lst_com_q = util.combination_index(len(lst_word_question_u), n_combination_question)
    for com_q in lst_com_q:
        lst_word_question = [lst_word_question_u[i] for i in com_q]
        for index_c, choice in enumerate(lst_choice):
            # Filter stop words in choice in order to prevent them from calculating cooccurence.
            lst_word_choice_u = list(set(map(util.norm_word, choice.split())).difference(set_stopword))
            lst_com_c = util.combination_index(len(lst_word_choice_u), n_combination_answer)
            for com_c in lst_com_c:
                lst_word_choice = [lst_word_choice_u[i] for i in com_c]
                n_cooccurence = get_cooccurence(lst_set_sentence, lst_word_question, lst_word_choice)
                if n_cooccurence > max_cooccurence:
                    max_cooccurence = n_cooccurence
                    lst_word_focus_q = lst_word_question
                    lst_word_focus_c = lst_word_choice
                    # Choices past the fourth have no label, so the
                    # previous label is left in place for them.
                    if index_c < len(labels):
                        answer_p = labels[index_c]

    return answer_p, max_cooccurence, lst_word_focus_q, lst_word_focus_c
Exemplo n.º 6
0
def test_on_validation(path_model,
                       n_combination_question=3,
                       n_combination_answer=3,
                       n_word_question=5):
    """Predict an answer for each validation question and print ``id,answer``.

    A Word2Vec model is loaded from ``path_model`` and every data row of
    ``data/validation_set.tsv`` (the first line is skipped as a header) is
    processed. A row is lower-cased and split on tabs: column 0 is printed
    back as the identifier, column 1 is treated as the question and the
    remaining columns as answer choices.

    For each question, the ``n_word_question`` words with the lowest counts in
    ``util.load_d_word_count()`` are kept. Every index combination returned by
    ``util.combination_index`` over those words is summed into one vector and
    compared (dot product) with the summed vector of every word combination of
    each of the first four choices. The choice that produced the highest
    score is printed as ``A``-``D``.

    Args:
        path_model: Path handed to ``gensim.models.Word2Vec.load``.
        n_combination_question: Passed to ``util.combination_index`` for the
            question words.
        n_combination_answer: Passed to ``util.combination_index`` for the
            words of each choice.
        n_word_question: How many of the lowest-count question words to keep.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_validation = 'data/validation_set.tsv'
    labels = 'ABCD'
    d_word_count = util.load_d_word_count()
    # The context manager closes the file even if scoring a row raises.
    with open(path_validation) as f_in:
        for index, line in enumerate(f_in):
            if index == 0:
                # Header row.
                continue
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            lst_choice = [l.split(' ') for l in lst[2:]]
            # Map each normalized question word to its corpus count
            # (0 when the word is unknown).
            d = {}
            for word in question:
                word = util.norm_word(word)
                d[word] = d_word_count.get(word, 0)
            # Ascending sort: the lowest-count words come first.
            sort = sorted(d.items(), key=lambda kv: kv[1])
            question_u = [s[0] for s in sort[:n_word_question]]
            lst_com_q = util.combination_index(len(question_u),
                                               n_combination_question)
            best_score = float('-inf')
            answer_p = ''
            for com_q in lst_com_q:
                vec_q = np.sum(
                    [get_vector_from_model(model, question_u[i])
                     for i in com_q],
                    axis=0)
                # Only the first four choices carry a label; slicing also
                # tolerates rows with fewer than four choices.
                for i_choice, choice in enumerate(lst_choice[:len(labels)]):
                    choice_u = list(set(choice))
                    lst_com_choice = util.combination_index(
                        len(choice_u), n_combination_answer)
                    for com_c in lst_com_choice:
                        vec_c = np.sum(
                            [get_vector_from_model(model, choice_u[i])
                             for i in com_c],
                            axis=0)
                        score = vec_q.dot(vec_c)
                        if score > best_score:
                            best_score = score
                            answer_p = labels[i_choice]
            # Single-argument call form prints identically on Python 2 and 3.
            print("%s,%s" % (lst[0], answer_p))