import glob
import pickle
import random
import re

import nltk

# Stemmer and WordDistance are project-local helpers and are assumed to be
# importable in this module.


def __init__(self):
    print 'init...'
    self.tag_index = {1: 'NN', 2: 'VB', 4: 'RB', 5: 'JJ'}
    self.stemmer = Stemmer()
    self.calculator = WordDistance()
    try:
        print 'loading word category...'
        with open('./quiz_data/news_data4.txt', 'r') as file1, \
                open('./quiz_data/strongwords.txt', 'r') as file2:
            # pickled structure: {category: {pos_tag: {word: freq}}}
            raw_super_dict = pickle.load(file1)
            raw_strong_dict = pickle.load(file2)
        self.super_dict = raw_super_dict
        self.strong_dict = raw_strong_dict
        self.all_english_distractors = self.convert_dict_without_category(
            self.super_dict)
        (self.english_word_tags, self.chinese_word_tags,
         self.most_frequent_translation) = self.load_word_tags(
             './quiz_data/english_chinese_translations.csv')
    except IOError:
        print "[Error in MCQGenerator: while opening files]"
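# convert_dict_without_category is called above but not shown in this
# snippet. The version below is a minimal sketch, assuming super_dict has
# the {category: {pos_tag: {word: freq}}} shape noted above and that the
# flattened result keeps {pos_tag: {word: freq}} with frequencies summed
# across categories; the merging rule is an assumption, not the project's
# confirmed behavior.
def convert_dict_without_category(self, super_dict):
    flattened = {}
    for category in super_dict:
        for pos_tag in super_dict[category]:
            tag_words = flattened.setdefault(pos_tag, {})
            for word, freq in super_dict[category][pos_tag].items():
                # pool per-category frequencies into one distractor list
                tag_words[word] = tag_words.get(word, 0) + freq
    return flattened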
def compute(translate_file, sim_file):
    print 'start'
    words = load_word(translate_file)
    print 'words ' + str(len(words))
    calculator = WordDistance()
    fw = open(sim_file, 'w')
    size = len(words)
    # compare each unordered pair of words exactly once
    for i in range(size):
        if i % 50 == 0:
            print i
            fw.flush()
        w1 = words[i]
        for j in range(i + 1, size):
            w2 = words[j]
            sim = calculator.get_lin_distance(w1, w2)
            if 0.1 <= sim < 1:
                fw.write(w1 + ',' + w2 + ',' + str(sim) + '\n')
    fw.close()
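# load_word is referenced above but not defined in this snippet. A minimal
# sketch, assuming the translation file is a CSV whose first column holds
# the English word; the exact column layout is an assumption.
def load_word(translate_file):
    words = []
    seen = set()
    with open(translate_file, 'r') as f:
        for line in f:
            word = line.split(',')[0].strip()
            if word and word not in seen:
                seen.add(word)
                words.append(word)
    return words

# Illustrative invocation (file names are placeholders):
# compute('./quiz_data/english_chinese_translations.csv', './quiz_data/word_sim.txt')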
class MCQGenerator(object):

    def __init__(self):
        try:
            with open('./public/MCQ Generation/news_data4.txt', 'r') as file1, \
                    open('./public/MCQ Generation/strongwords.txt', 'r') as file2:
                self.super_dict = pickle.load(file1)
                self.strong_dict = pickle.load(file2)
            self.stemmer = Stemmer()
            self.calculator = WordDistance()
        except IOError:
            print "[Error in MCQGenerator: while opening files]"

    # return the POS tag of input_word within input_sentence
    def get_target_tag(self, input_sentence, input_word):
        text = nltk.word_tokenize(input_sentence)
        # the standard pos tagger
        pos_result = nltk.pos_tag(text)
        for token, tag in pos_result:
            if token == input_word:
                return tag

    def get_similarity(self, category, word, tag, number):
        """Return the specified number of words with good similarity."""
        similar_list = []
        word1 = self.stemmer.stem(word, tag)
        word_list = self.super_dict[category][tag].keys()
        random.shuffle(word_list)
        for candidate in word_list:
            word2 = self.stemmer.stem(candidate, tag)
            distance = self.calculator.get_distance(word1, word2)
            print "Distance is: " + str(distance) + " " + candidate + " " + word
            # JJ is weird for the distance program
            if tag == "JJ":
                if distance == 1:
                    similar_list.append(candidate)
                    if len(similar_list) >= number:
                        return similar_list
            # 0.1 < distance < 1 eliminates words in the same synsets
            elif 0.1 < distance < 1:
                similar_list.append(candidate)
                if len(similar_list) >= number:
                    return similar_list
        return similar_list
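# A short usage sketch: tag a target word in context, then ask for three
# same-category, same-POS distractors. It assumes the pickled data files
# and the project's Stemmer/WordDistance helpers are in place; the
# sentence and target word below are illustrative only.
if __name__ == '__main__':
    generator = MCQGenerator()
    tag = generator.get_target_tag("The market rallied sharply today", "market")
    distractors = generator.get_similarity("Finance", "market", tag, 3)
    print distractors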
def categorize(self):
    # the categories the system currently supports
    category_list = [
        "Technology", "Entertainment", "Finance", "Travel", "Health",
        "World", "Sports"
    ]
    dictionary_data = {category: {} for category in category_list}

    # my two magic helpers: stemmer and distance calculator
    calculator = WordDistance()
    stemmer = Stemmer()

    # read through all the articles in each category folder
    for category in category_list:
        file_list = glob.glob("Corpus/" + category + "/*.txt")
        # read the files one by one
        for file_name in file_list:
            words_in_this_file = []
            with open(file_name) as f:
                for line in f:
                    word_list = line.replace("\n", "").split(" ")
                    for raw_word in word_list:
                        # keep letters only, lowercased
                        word = re.sub("[^A-Za-z]", "", raw_word).lower()
                        if not word:
                            continue
                        # we are calculating frequency per document, so a
                        # word counts at most once per file
                        if word in words_in_this_file:
                            continue
                        if word not in dictionary_data[category]:
                            dictionary_data[category][word] = 1
                        else:
                            dictionary_data[category][word] += 1
                        words_in_this_file.append(word)

    # sum each word's counts across all categories
    weight_sum = {}
    for category in dictionary_data:
        for word in dictionary_data[category]:
            if word not in weight_sum:
                weight_sum[word] = 0
            weight_sum[word] += dictionary_data[category][word]

    # average document frequency per category
    average_weight = {}
    for word in weight_sum:
        average_weight[word] = weight_sum[word] * 1.0 / len(category_list)

    # keep the words whose count is clearly above the cross-category
    # average; the +5 margin keeps each list at around 10 words
    satisfied_words = {}
    for target_category in category_list:
        satisfied_words[target_category] = []
        for word in dictionary_data[target_category]:
            if dictionary_data[target_category][word] > average_weight[word] + 5:
                satisfied_words[target_category].append(word)
    # to be added: eliminate those non-words

    # store the result into a local pickle file
    print dictionary_data
    with open('news_data_count_1.txt', 'w') as file1:
        pickle.dump(dictionary_data, file1)
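# To inspect the pickled counts later, load them back the same way they
# were written (default plain-text pickle protocol); the category and word
# below are illustrative only:
#
#     with open('news_data_count_1.txt', 'r') as f:
#         counts = pickle.load(f)
#     print counts['Finance'].get('market', 0)  # per-document frequency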
category_list = [
    "Technology", "Entertainment", "Finance", "Travel", "Health",
    "World", "Sports"
]
dictionary_data = {category: {} for category in category_list}

# my two magic helpers: stemmer and distance calculator
calculator = WordDistance()
stemmer = Stemmer()

# read through all the articles in each category folder
for category in category_list:
    file_list = glob.glob("Corpus/" + category + "/*.txt")
    # read the files one by one
    for file_name in file_list:
        with open(file_name) as f:
            for line in f:
                word_list = line.replace("\n", "").split(" ")
                for word in word_list:
                    # raw term frequency: count every occurrence
                    if word not in dictionary_data[category]:
                        dictionary_data[category][word] = 1
                    else:
                        dictionary_data[category][word] += 1