Example #1
    def __init__(self):
        print('init...')
        self.tag_index = {1: 'NN', 2: 'VB', 4: 'RB', 5: 'JJ'}
        self.stemmer = Stemmer()
        self.calculator = WordDistance()

        try:
            print('loading word category...')
            # pickle files must be opened in binary mode
            with open('./quiz_data/news_data4.txt', 'rb') as file1, \
                    open('./quiz_data/strongwords.txt', 'rb') as file2:
                # {category: {pos_tag: {word: freq}}}
                raw_super_dict = pickle.load(file1)
                raw_strong_dict = pickle.load(file2)

            self.super_dict = raw_super_dict
            self.strong_dict = raw_strong_dict
            self.all_english_distractors = self.convert_dict_without_category(
                self.super_dict)

            self.english_word_tags, self.chinese_word_tags, \
                self.most_frequent_translation = self.load_word_tags(
                    './quiz_data/english_chinese_translations.csv')

        except IOError as e:
            print("[Error in MCQGenerator: while opening files] " + str(e))
Example #2
def compute(translate_file, sim_file):
    print('start')
    words = load_word(translate_file)
    print('words ' + str(len(words)))

    calculator = WordDistance()

    fw = open(sim_file, 'w')

    i = 0
    size = len(words)
    while i < size:
        if i % 50 == 0:
            print(i)
            fw.flush()
        w1 = words[i]
        j = i + 1
        while j < size:
            w2 = words[j]
            sim = calculator.get_lin_distance(w1, w2)
            # keep related pairs; sim == 1 would mean the same synset
            if 0.1 <= sim < 1:
                fw.write(w1 + ',' + w2 + ',' + str(sim) + '\n')
            j += 1
        i += 1
    fw.close()
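compute() relies on a load_word helper that is not shown. A minimal sketch, assuming the translate file holds one English word in the first comma-separated column of each line (the file format is an assumption):

# Assumed helper: read the unique words out of the translation file.
def load_word(translate_file):
    words = []
    seen = set()
    with open(translate_file) as f:
        for line in f:
            word = line.split(',')[0].strip()
            if word and word not in seen:
                seen.add(word)
                words.append(word)
    return words

Because the nested loops visit every unordered pair of words exactly once, the run is O(n^2) in the number of words; flushing every 50 outer iterations makes partial results visible while the job runs.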
Example #3
    def __init__(self):
        try:
            # pickle files must be opened in binary mode
            with open('./public/MCQ Generation/news_data4.txt', 'rb') as file1, \
                    open('./public/MCQ Generation/strongwords.txt', 'rb') as file2:
                self.super_dict = pickle.load(file1)
                self.strong_dict = pickle.load(file2)
                self.stemmer = Stemmer()
                self.calculator = WordDistance()
        except IOError as e:
            print("[Error in MCQGenerator: while opening files] " + str(e))
Example #4
import pickle
import random

import nltk

# Stemmer and WordDistance are the project's own helpers; their import
# path is not shown in these examples.


class MCQGenerator(object):

    def __init__(self):
        try:
            # pickle files must be opened in binary mode
            with open('./public/MCQ Generation/news_data4.txt', 'rb') as file1, \
                    open('./public/MCQ Generation/strongwords.txt', 'rb') as file2:
                self.super_dict = pickle.load(file1)
                self.strong_dict = pickle.load(file2)
                self.stemmer = Stemmer()
                self.calculator = WordDistance()
        except IOError as e:
            print("[Error in MCQGenerator: while opening files] " + str(e))

    # return the POS tag of input_word within input_sentence
    def get_target_tag(self, input_sentence, input_word):
        text = nltk.word_tokenize(input_sentence)
        pos_result = nltk.pos_tag(text)  # the standard POS tagger
        for i in pos_result:
            if i[0] == input_word:
                return i[1]

    def get_similarity(self, category, word, tag, number):
        """Return the specified number of words with good similarity."""
        similar_list = []
        word1 = self.stemmer.stem(word, tag)
        # list() is needed so the keys can be shuffled in place
        word_list = list(self.super_dict[category][tag].keys())
        random.shuffle(word_list)

        calculator = WordDistance()
        for candidate in word_list:
            word2 = self.stemmer.stem(candidate, tag)
            distance = calculator.get_distance(word1, word2)
            print("Distance is: " + str(distance) + " " + candidate + " " + word)
            # JJ is weird for the distance program
            if tag == "JJ":
                if distance == 1:
                    similar_list.append(candidate)
                    if len(similar_list) >= number:
                        return similar_list
            else:
                # exclude words in the same synsets (distance == 1)
                if 0.1 < distance < 1:
                    similar_list.append(candidate)
                    if len(similar_list) >= number:
                        return similar_list

        return similar_list
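A hypothetical usage sketch of the two methods above; the sentence, the target word, and the "Technology" category are made up for illustration, and the quiz data files are assumed to be in place.

# Hypothetical usage: tag the target word, then draw three distractors.
generator = MCQGenerator()
sentence = "The company released an innovative product."
target = "innovative"

tag = generator.get_target_tag(sentence, target)  # e.g. 'JJ'
distractors = generator.get_similarity("Technology", target, tag, 3)
print("Correct answer: " + target)
print("Distractors: " + str(distractors))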
Example #5
    def categorize(self):

        # the categories the system currently supports
        category_list = [
            "Technology", "Entertainment", "Finance", "Travel", "Health",
            "World", "Sports"
        ]

        # one sub-dictionary of word counts per category
        dictionary_data = {category: {} for category in category_list}

        # the stemmer and the distance calculator
        calculator = WordDistance()
        stemmer = Stemmer()

        # read through all the articles in each category's folder
        for category in category_list:
            file_list = glob.glob("Corpus/" + category + "/*.txt")
            # read the files one by one
            for file_name in file_list:
                # print("Processing file: " + file_name)
                words_in_this_file = set()
                with open(file_name) as f:
                    for line in f:
                        word_list = line.strip().split(" ")
                        for raw_word in word_list:
                            # strip non-letter characters and lower-case
                            word = re.sub("[^A-Za-z]", "", raw_word).lower()
                            if not word:
                                continue
                            # we are counting frequency per document, so a
                            # word is counted at most once per file
                            if word in words_in_this_file:
                                continue
                            if word not in dictionary_data[category]:
                                dictionary_data[category][word] = 1
                            else:
                                dictionary_data[category][word] += 1
                            words_in_this_file.add(word)

        # calculate the average weight of each word across all categories
        weight_sum = {}
        for category in dictionary_data:
            for word in dictionary_data[category]:
                weight_sum[word] = (weight_sum.get(word, 0) +
                                    dictionary_data[category][word])
        average_weight = {}
        for word in weight_sum:
            average_weight[word] = weight_sum[word] / len(category_list)

        # keep the words whose count in a category exceeds the cross-category
        # average by a margin of 5; this leaves roughly ten words per category
        satisfied_words = {}
        for target_category in category_list:
            satisfied_words[target_category] = []
            for word in dictionary_data[target_category]:
                try:
                    if (dictionary_data[target_category][word] >
                            average_weight[word] + 5):
                        satisfied_words[target_category].append(word)
                except KeyError:
                    continue

        # TODO: eliminate the non-words

        # store the result in a local pickle file
        print(dictionary_data)
        with open('news_data_count_1.txt', 'wb') as file1:
            pickle.dump(dictionary_data, file1)
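A quick way to sanity-check the pickled output is to load the dictionary back and look at the most frequent words per category. This inspection code is not part of the original examples; it only assumes the {category: {word: count}} shape produced above.

# Hypothetical check: reload the pickle and show the top ten words
# per category by document frequency.
import pickle

with open('news_data_count_1.txt', 'rb') as f:
    dictionary_data = pickle.load(f)

for category, counts in dictionary_data.items():
    top = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:10]
    print(category + ': ' + str(top))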
Example #6
import glob

# Stemmer and WordDistance are the project's own helpers; their import
# path is not shown in these examples.

category_list = [
    "Technology", "Entertainment", "Finance", "Travel", "Health", "World",
    "Sports"
]

# one sub-dictionary of word counts per category
dictionary_data = {category: {} for category in category_list}

# the stemmer and the distance calculator
calculator = WordDistance()
stemmer = Stemmer()

# read through all the articles in each folder
for category in category_list:
    file_list = glob.glob("Corpus/" + category + "/*.txt")
    # read the files one by one
    for file_name in file_list:
        # print("Processing file: " + file_name)
        with open(file_name) as f:
            for line in f:
                word_list = line.strip().split(" ")
                for word in word_list:
                    if word not in dictionary_data[category]:
                        dictionary_data[category][word] = 1
                    else:
                        dictionary_data[category][word] += 1
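Unlike Example #5, this variant counts every token occurrence rather than once per document. The same loop can be written more compactly with collections.Counter; a sketch that keeps the Corpus/<category>/*.txt layout and the token-level counting of the example above:

# An equivalent, more compact version of the counting loop, using
# collections.Counter; behavior matches the example above.
import glob
from collections import Counter

dictionary_data = {}
for category in category_list:
    counter = Counter()
    for file_name in glob.glob("Corpus/" + category + "/*.txt"):
        with open(file_name) as f:
            for line in f:
                counter.update(line.strip().split(" "))
    dictionary_data[category] = dict(counter)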