Example #1
    def __init__(self):
        print 'init...'
        self.tag_index = {1: 'NN', 2: 'VB', 4: 'RB', 5: 'JJ'}
        self.stemmer = Stemmer()
        self.calculator = WordDistance()

        try:
            print 'loading word category...'
            # open the pickle dumps in binary mode so the load is platform-safe
            with open('./quiz_data/news_data4.txt', 'rb') as file1, \
                 open('./quiz_data/strongwords.txt', 'rb') as file2:
                # structure: {category: {pos_tag: {word: freq}}}
                raw_super_dict = pickle.load(file1)
                raw_strong_dict = pickle.load(file2)

            self.super_dict = raw_super_dict
            self.strong_dict = raw_strong_dict
            self.all_english_distractors = self.convert_dict_without_category(
                self.super_dict)

            (self.english_word_tags, self.chinese_word_tags,
             self.most_frequent_translation) = self.load_word_tags(
                './quiz_data/english_chinese_translations.csv')

        except IOError as e:
            print "[Error in MCQGenerator: while opening files]", e
Example #2
# load_word and WordDistance are provided elsewhere in the project
def compute(translate_file, sim_file):
    print 'start'
    words = load_word(translate_file)
    print 'words ' + str(len(words))

    calculator = WordDistance()

    fw = open(sim_file, 'w')

    size = len(words)
    for i in range(size):
        if i % 50 == 0:
            print i
            fw.flush()  # keep partial results on disk during the long run
        w1 = words[i]
        for j in range(i + 1, size):
            w2 = words[j]
            sim = calculator.get_lin_distance(w1, w2)
            # keep pairs that are similar but not in the same synset
            if 0.1 <= sim < 1:
                fw.write(w1 + ',' + w2 + ',' + str(sim) + '\n')
    fw.close()
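
The nested loop compares every unordered pair once, so n words produce n*(n-1)/2 similarity calls. A hedged usage sketch; the file names are placeholders:

compute('translations.csv', 'similarities.csv')
# each kept output line looks like: word1,word2,similarity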
Example #3
    def get_similarity(self, category, word, tag, number):
        """Return the specified number of words with good similarity."""
        similar_list = []
        word1 = self.stemmer.stem(word, tag)
        word_list = list(self.super_dict[category][tag].keys())
        random.shuffle(word_list)

        # one calculator for the whole loop; no need to rebuild it per candidate
        calculator = WordDistance()
        for candidate in word_list:
            word2 = self.stemmer.stem(candidate, tag)
            distance = calculator.get_distance(word1, word2)
            print "Distance is: " + str(distance) + " " + candidate + " " + word
            # JJ behaves differently in the distance program: accept exact 1
            if tag == "JJ":
                matched = distance == 1
            else:
                # similar but not in the same synset
                matched = 0.1 < distance < 1
            if matched:
                similar_list.append(candidate)
                if len(similar_list) >= number:
                    return similar_list

        return similar_list
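
WordDistance.get_distance is project code that is not shown here. As an assumption about what such a score could look like, this sketch computes a best path similarity over WordNet synsets with NLTK; it is a stand-in, not the project's implementation:

from nltk.corpus import wordnet as wn  # nltk.download('wordnet') once

def wordnet_distance(word1, word2):
    # best path similarity over all synset pairs; 0.0 when either word
    # is unknown to WordNet
    best = 0.0
    for s1 in wn.synsets(word1):
        for s2 in wn.synsets(word2):
            sim = s1.path_similarity(s2)
            if sim is not None and sim > best:
                best = sim
    return best

print wordnet_distance('car', 'truck')  # between 0 and 1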
Example #4
    def __init__(self):
        try:
            # binary mode for pickle payloads
            with open('./public/MCQ Generation/news_data4.txt', 'rb') as file1, \
                 open('./public/MCQ Generation/strongwords.txt', 'rb') as file2:
                self.super_dict = pickle.load(file1)
                self.strong_dict = pickle.load(file2)
                self.stemmer = Stemmer()
                self.calculator = WordDistance()
        except IOError as e:
            print "[Error in MCQGenerator: while opening files]", e
Example #5
import pickle
import random

import nltk

# Stemmer and WordDistance are provided elsewhere in the project

class MCQGenerator(object):

    def __init__(self):
        try:
            # binary mode for pickle payloads
            with open('./public/MCQ Generation/news_data4.txt', 'rb') as file1, \
                 open('./public/MCQ Generation/strongwords.txt', 'rb') as file2:
                self.super_dict = pickle.load(file1)
                self.strong_dict = pickle.load(file2)
                self.stemmer = Stemmer()
                self.calculator = WordDistance()
        except IOError as e:
            print "[Error in MCQGenerator: while opening files]", e

    # return the POS tag of input_word in input_sentence (None if absent)
    def get_target_tag(self, input_sentence, input_word):
        text = nltk.word_tokenize(input_sentence)
        pos_result = nltk.pos_tag(text)  # the standard NLTK POS tagger
        for token, tag in pos_result:
            if token == input_word:
                return tag

    def get_similarity(self, category, word, tag, number):
        """Return the specified number of words with good similarity."""
        similar_list = []
        word1 = self.stemmer.stem(word, tag)
        word_list = list(self.super_dict[category][tag].keys())
        random.shuffle(word_list)

        # one calculator for the whole loop; no need to rebuild it per candidate
        calculator = WordDistance()
        for candidate in word_list:
            word2 = self.stemmer.stem(candidate, tag)
            distance = calculator.get_distance(word1, word2)
            print "Distance is: " + str(distance) + " " + candidate + " " + word
            # JJ behaves differently in the distance program: accept exact 1
            if tag == "JJ":
                matched = distance == 1
            else:
                # similar but not in the same synset
                matched = 0.1 < distance < 1
            if matched:
                similar_list.append(candidate)
                if len(similar_list) >= number:
                    return similar_list

        return similar_list
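
A hedged usage sketch of the two methods above; the sentence, word, and category are illustrative, and the class assumes the pickle files plus the project's Stemmer and WordDistance are in place:

gen = MCQGenerator()
tag = gen.get_target_tag('The market rallied sharply today', 'market')  # 'NN'
# up to 3 same-category words whose distance to 'market' is in (0.1, 1)
print gen.get_similarity('Finance', 'market', tag, 3)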
Example #6
    def categorize(self):

        # the categories the system currently supports
        category_list = [
            "Technology", "Entertainment", "Finance", "Travel", "Health",
            "World", "Sports"
        ]
        dictionary_data = {category: {} for category in category_list}

        # my two magic helpers: stemmer and distance calculator
        calculator = WordDistance()
        stemmer = Stemmer()

        # read through all the articles in each category folder
        for category in category_list:
            file_list = glob.glob("Corpus/" + category + "/*.txt")
            # read the files one by one
            for file_name in file_list:
                #print "Processing file: " + file_name
                words_in_this_file = set()
                with open(file_name) as f:
                    for line in f:
                        line = line.replace("\n", "")
                        word_list = line.split(" ")
                        for raw_word in word_list:
                            # strip non-letter characters and lowercase
                            word = re.sub("[^A-Za-z]", "", raw_word).lower()
                            # count document frequency: each word at most
                            # once per file
                            if word in words_in_this_file:
                                continue
                            if word not in dictionary_data[category]:
                                dictionary_data[category][word] = 1
                            else:
                                dictionary_data[category][word] += 1
                            words_in_this_file.add(word)
                #print "Finished processing " + file_name + " in " + category

        # average weight of each word across the seven categories
        weight_sum = {}
        for category in dictionary_data:
            for word in dictionary_data[category]:
                if word not in weight_sum:
                    weight_sum[word] = 0
                # always add, so the first category's count is included too
                weight_sum[word] += dictionary_data[category][word]
        average_weight = {}
        for word in weight_sum:
            average_weight[word] = weight_sum[word] * 1.0 / len(category_list)

        # keep words clearly above the average weight; the +5 margin keeps
        # roughly the top 10 words per category
        satisfied_words = {}
        for target_category in category_list:
            satisfied_words[target_category] = []
            for word in dictionary_data[target_category]:
                try:
                    count = dictionary_data[target_category][word]
                    if count > average_weight[word] + 5:
                        satisfied_words[target_category].append(word)
                except KeyError:
                    continue

        # TODO: eliminate non-words

        # store the result in a local pickle file
        print dictionary_data
        with open('news_data_count_1.txt', 'wb') as file1:
            pickle.dump(dictionary_data, file1)
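
A tiny worked instance of the selection rule above, with made-up document counts for one word:

# document counts for one word across the 7 categories (made up)
counts = {'Technology': 40, 'Entertainment': 2, 'Finance': 3, 'Travel': 1,
          'Health': 1, 'World': 1, 'Sports': 1}
average_weight = sum(counts.values()) * 1.0 / 7  # 49 / 7 = 7.0
# the word is kept for Technology because 40 > 7.0 + 5
print counts['Technology'] > average_weight + 5  # True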
Example #7
import glob

# WordDistance and Stemmer are provided elsewhere in the project

category_list = [
    "Technology", "Entertainment", "Finance", "Travel", "Health", "World",
    "Sports"
]
dictionary_data = {category: {} for category in category_list}

# my two magic helpers: stemmer and distance calculator
calculator = WordDistance()
stemmer = Stemmer()

# read through all the articles in each category folder
for category in category_list:
    file_list = glob.glob("Corpus/" + category + "/*.txt")
    # read the files one by one
    for file_name in file_list:
        #print "Processing file: " + file_name
        with open(file_name) as f:
            for line in f:
                line = line.replace("\n", "")
                word_list = line.split(" ")
                for word in word_list:
                    if word not in dictionary_data[category]:
                        dictionary_data[category][word] = 1
                    else:
                        # the snippet ended here; the increment follows the
                        # fuller counting loop in Example #6 above
                        dictionary_data[category][word] += 1
Example #8
import pickle
import random

# Stemmer and WordDistance are provided elsewhere in the project

class QuizGenerator(object):
    MIN_SIM = 0.5

    def __init__(self):
        print 'init...'
        self.tag_index = {1: 'NN', 2: 'VB', 4: 'RB', 5: 'JJ'}
        self.stemmer = Stemmer()
        self.calculator = WordDistance()

        try:
            print 'loading word category...'
            # open the pickle dumps in binary mode so the load is platform-safe
            with open('./quiz_data/news_data4.txt', 'rb') as file1, \
                 open('./quiz_data/strongwords.txt', 'rb') as file2:
                # structure: {category: {pos_tag: {word: freq}}}
                raw_super_dict = pickle.load(file1)
                raw_strong_dict = pickle.load(file2)

            self.super_dict = raw_super_dict
            self.strong_dict = raw_strong_dict
            self.all_english_distractors = self.convert_dict_without_category(
                self.super_dict)

            (self.english_word_tags, self.chinese_word_tags,
             self.most_frequent_translation) = self.load_word_tags(
                './quiz_data/english_chinese_translations.csv')

        except IOError as e:
            print "[Error in MCQGenerator: while opening files]", e

    # similar to convert_dict but removes the category information
    def convert_dict_without_category(self, raw_dict):
        new_dict = dict()
        for category, pos_tag_dict in raw_dict.iteritems():
            for pos_tag, word_dict in pos_tag_dict.iteritems():
                if pos_tag not in new_dict:
                    new_dict[pos_tag] = []
                for word, freq in word_dict.iteritems():
                    new_dict[pos_tag].append(word)
        for pos_tag, words in new_dict.iteritems():
            new_dict[pos_tag] = list(set(words))
        return new_dict

    # Load the lists of English and Chinese words and their POS tags from
    # the dump file of the english_chinese_translations table
    def load_word_tags(self, translation_file):
        print 'loading ' + translation_file
        english_word_tags = dict()
        chinese_word_tags = dict()
        most_frequent_translation = dict()

        with open(translation_file) as f:
            for line in f:
                line = line.strip()
                if line == '':
                    continue
                items = line.split(',')
                #if len(items)>3:
                #    print line
                english_word = items[-3]
                chinese_word = items[-2]
                pos_tag_idx = int(items[3].strip())
                rank = int(items[4].strip())

                # skip POS tags the quiz does not cover
                if pos_tag_idx not in self.tag_index:
                    continue
                pos_tag = self.tag_index[pos_tag_idx]

                english_word_tags.setdefault(english_word, set()).add(pos_tag)
                chinese_word_tags.setdefault(chinese_word, set()).add(pos_tag)

                # rank 0 marks the most frequent translation of the word
                if rank == 0:
                    most_frequent_translation[english_word + "-" +
                                              pos_tag] = chinese_word
        return english_word_tags, chinese_word_tags, most_frequent_translation

    # a distractor can be either Chinese or English
    # news_category denotes the topic of the news, e.g., technology, finance
    # knowledge_level decides the difficulty of the distractors
    def get_distractors(self, word, word_pos, knowledge_level, news_category):
        knowledge_level = int(knowledge_level)
        print 'generating distractors...'
        if news_category in self.strong_dict:
            self.candidates = self.strong_dict[news_category][word_pos].keys()
        else:
            self.candidates = self.all_english_distractors[word_pos]

        if knowledge_level == 1:
            return self.get_easy_distractors(word, word_pos)
        elif knowledge_level == 2:
            return self.get_hard_distractors(word, word_pos, 'english')
        elif knowledge_level >= 3:
            return self.get_hard_distractors(word, word_pos, 'chinese')
        else:
            return []

    # The "easy" approach described in the paper:
    # Three distractors are the random words from the same news category and are in English
    def get_easy_distractors(self, word, word_pos):
        distractors_list = []

        nums = set()
        candidate_count = len(self.candidates)

        while len(distractors_list) < 3:
            n = random.randint(0, candidate_count - 1)
            if n not in nums:
                nums.add(n)
                candidate = self.candidates[n]
                if self.is_same_form(word, word_pos, candidate):
                    distractors_list.append(candidate)
        return distractors_list

    def get_hard_distractors(self, word, word_pos, distractor_lang):
        distractors_list = []

        nums = set()
        candidate_count = len(self.candidates)

        # stop once we have 3 distractors or have tried every candidate
        while len(distractors_list) < 3 and len(nums) < candidate_count:
            n = random.randint(0, candidate_count - 1)
            if n not in nums:
                nums.add(n)
                candidate = self.candidates[n]
                if self.is_same_form(word, word_pos, candidate):
                    sim = self.get_similarity(word, candidate)
                    print candidate + ' ' + str(sim)
                    # semantically similar but not in the same synset
                    if QuizGenerator.MIN_SIM <= sim < 1:
                        if distractor_lang == 'chinese':
                            key = candidate + '-' + word_pos
                            if key in self.most_frequent_translation:
                                distractors_list.append(
                                    self.most_frequent_translation[key])
                        else:
                            distractors_list.append(candidate)
        return distractors_list

    # Lin distance between two words
    def get_similarity(self, word, candidate):
        return self.calculator.get_lin_distance(word, candidate)

    # True if the candidate can take the same POS tag as the word
    def is_same_form(self, word, word_pos, candidate):
        return word != candidate and (candidate in self.english_word_tags) and (
            word_pos in self.english_word_tags[candidate])
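
get_lin_distance belongs to the project's WordDistance, which is not shown here; Lin similarity itself is available through NLTK's WordNet interface. A minimal sketch under that assumption, restricted to noun synsets since Lin similarity is defined within a single part of speech:

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic  # nltk.download('wordnet'); nltk.download('wordnet_ic') once

# information content computed from the Brown corpus
brown_ic = wordnet_ic.ic('ic-brown.dat')

def lin_similarity(word1, word2):
    # best Lin similarity over all noun synset pairs; 0.0 when unknown
    best = 0.0
    for s1 in wn.synsets(word1, pos=wn.NOUN):
        for s2 in wn.synsets(word2, pos=wn.NOUN):
            sim = s1.lin_similarity(s2, brown_ic)
            if sim is not None and sim > best:
                best = sim
    return best

print lin_similarity('car', 'truck')  # high, but below 1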
Example #9
import pickle
import random
from random import shuffle

import nltk

# Stemmer and WordDistance are provided elsewhere in the project

class MCQGenerator(object):

    def __init__(self):
        try:
            # binary mode for pickle payloads
            with open('./public/MCQ Generation/news_data4.txt', 'rb') as file1, \
                 open('./public/MCQ Generation/strongwords.txt', 'rb') as file2:
                self.super_dict = pickle.load(file1)
                self.strong_dict = pickle.load(file2)
                self.stemmer = Stemmer()
                self.calculator = WordDistance()
        except IOError as e:
            print "[Error in MCQGenerator: while opening files]", e

    # return the POS tag of input_word in input_sentence (None if absent)
    def get_target_tag(self, input_sentence, input_word):
        text = nltk.word_tokenize(input_sentence)
        pos_result = nltk.pos_tag(text)  # the standard NLTK POS tagger
        for token, tag in pos_result:
            if token == input_word:
                return tag

    def get_similarity(self, category, word, tag, number):
        """Return the specified number of words with good similarity."""
        similar_list = []
        word1 = self.stemmer.stem(word, tag)
        word_list = list(self.super_dict[category][tag].keys())
        random.shuffle(word_list)

        # one calculator for the whole loop; no need to rebuild it per candidate
        calculator = WordDistance()
        for candidate in word_list:
            word2 = self.stemmer.stem(candidate, tag)
            distance = calculator.get_distance(word1, word2)
            print "Distance is: " + str(distance) + " " + candidate + " " + word
            # JJ behaves differently in the distance program: accept exact 1
            if tag == "JJ":
                matched = distance == 1
            else:
                # similar but not in the same synset
                matched = 0.1 < distance < 1
            if matched:
                similar_list.append(candidate)
                if len(similar_list) >= number:
                    return similar_list

        return similar_list

    def get_distractors(self, category, sentence, understanding_level, word):
        # get the correct POS tag
        target_tag = self.get_target_tag(sentence, word)
        if understanding_level == 1:
            distractors_list = random.sample(
                self.super_dict[category][target_tag].keys(), 3)
            distractors_list.append(word)
            return distractors_list
        elif understanding_level == 2:
            distractors_list = random.sample(
                self.super_dict[category][target_tag].keys(), 2)
            similar_list = self.get_similarity(category, word, target_tag, 1)
            distractors_list.append(word)
            return distractors_list + similar_list
        elif understanding_level == 3:
            similar_list = self.get_similarity(category, word, target_tag, 3)
            similar_list.append(word)
            return similar_list
        return []  # unknown level: no options

    # distinct name: a second def get_distractors would silently replace
    # the method above
    def get_distractors_from_strong_words(self, category, understanding_level, word):
        distractors_list = []

        tag = self.get_target_tag(word, word)

        # dict keys cannot be shuffled in place, so copy them into a list
        candidates = list(self.strong_dict[category])
        shuffle(candidates)

        for dict_word in candidates:
            similarity_score = self.get_lin_similarity(dict_word, word, tag)
            if (self.get_target_tag(dict_word, dict_word) == tag
                    and dict_word != word and similarity_score > 0):
                distractors_list.append(dict_word)
            if len(distractors_list) >= 3:
                break

        distractors_list.append(word)
        return distractors_list

    # distinct name for the same reason; tag is accepted but unused
    def get_lin_similarity(self, word1, word2, tag):
        return self.calculator.get_lin_distance(word1, word2)
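
A hedged usage sketch of the class above; the sentence, word, and category are illustrative, and the pickle files plus the project's Stemmer and WordDistance must be in place:

gen = MCQGenerator()
sentence = 'The market rallied sharply today'
# level 1: three random same-category words plus the answer
print gen.get_distractors('Finance', sentence, 1, 'market')
# level 3: three semantically similar words plus the answer
print gen.get_distractors('Finance', sentence, 3, 'market')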