def sentencesimilarity(lang1, sentence1, lang2, sentence2):
    '''Translate sentence1 from lang1 into lang2, then calculate the
    similarity between the newly translated sentence and sentence2.
    We try to keep lang2 as English.
    Returns [same_word_count, len_difference].
    '''
    print 'First sentence bag got is : '
    sentence1_words = wordtokenizer.wordtokenizer(lang1, sentence1.lower())
    for word in sentence1_words:
        print word.encode('utf-8') + ',',
    len_sent_one = len(sentence1_words)
    print '\nSecond sentence bag got is : '
    sentence2_words = wordtokenizer.wordtokenizer(lang2, sentence2.lower())
    print sentence2_words
    len_sent_two = len(sentence2_words)
    len_difference = len_sent_one - len_sent_two
    print 'Difference in sentence lengths is %s - %s = %s' % (len_sent_one, len_sent_two, len_difference)
    # Translate sentence1 word by word into a bag of lang2 words.
    translated_sentence = []
    for word in sentence1_words:
        translated_word = translation_lookup(word)
        if translated_word:
            translated_sentence.extend(translated_word.split())
    print 'Translated bag for sentence1 is : '
    print translated_sentence
    # Now we have the translated sentence in bag-of-words format. Count how
    # many of these words also occur in sentence2.
    same_word_count = 0
    for word in translated_sentence:
        if word in sentence2_words:
            print word
            same_word_count += 1
    return [same_word_count, len_difference]
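# A minimal usage sketch: it assumes translation_lookup() is backed by a
# loaded lang1-to-lang2 dictionary and that wordtokenizer supports both
# language codes; the sample sentences are illustrative only.
def _demo_sentencesimilarity():
    overlap, length_gap = sentencesimilarity(
        'hin', u'टॉम फुटबॉल खेलता है',
        'eng', u'Tom plays football')
    print 'overlap = %d, length difference = %d' % (overlap, length_gap)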
def fdist_gen(lang, corpusfile, stopwordfile=False):
    '''Return the frequency distribution (nltk.probability.FreqDist) of a
    language based on the existing Tatoeba corpus. If a stopword file is
    given, remove the stop words from the frequency distribution based on
    the stopword list provided to the function.
    '''
    sentences = corpusreader.corpusreader(lang, corpusfile)
    fdist = nltk.FreqDist()
    if stopwordfile:
        stopwordset = load_set_stopwords(stopwordfile)
    else:
        stopwordset = False
    for sentence in sentences:
        # Tokenize in the requested language, drop stop words, and count.
        words = wordtokenizer.wordtokenizer(lang, sentence)
        words = less_stopwords(words, stopwordset)
        for word in words:
            fdist.inc(word.lower())
    return fdist
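# fdist_gen walks the whole Tatoeba corpus, which is slow, so sentence_picker
# below first asks freqtoeba.fdist_loader() for a pickled cache. A minimal
# sketch of such a loader, assuming the cache is the 'fdist.pkl' file that
# sentence_picker writes on a miss (the -1 sentinel matches how the caller
# detects a miss; the real fdist_loader in freqtoeba may differ):
def fdist_loader(lang):
    import os
    import pickle
    # lang is unused here because only a single English cache is written;
    # per-language cache files would be a straightforward extension.
    if not os.path.exists('fdist.pkl'):
        return -1  # cache miss: caller rebuilds with fdist_gen and pickles it
    with open('fdist.pkl', 'rb') as f:
        return pickle.load(f)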
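# sentence_picker below sums four scoring helpers (score_length,
# score_firstchar_upper, score_common_words, score_iwf) that are defined
# elsewhere in this module. As an illustration only, here is a hypothetical
# sketch of the length score, assuming equally=True means every in-band
# length scores 1.0 and out-of-band lengths score 0.0 (the real score_length
# may grade lengths differently):
def _score_length_sketch(min_thresh, max_thresh, sentlen, equally=True):
    if min_thresh <= sentlen <= max_thresh:
        return 1.0
    if equally:
        return 0.0
    # Graded variant: penalize by distance from the allowed length band.
    distance = min(abs(sentlen - min_thresh), abs(sentlen - max_thresh))
    return max(0.0, 1.0 - 0.1 * distance)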
def sentence_picker(text, lang, min_thresh, max_thresh, uncommon_thresh,
                    newlinechar, weight_iwf=1, weight_common=1,
                    weight_firstchar=1, weight_len=1):
    '''Return the useful sentences picked out of the dump of text given as
    input, as a list of (sentence, SENTENCE) pairs sorted by score, best
    first. Arguments min_thresh and max_thresh are the minimum and maximum
    sentence lengths allowed; the weight_* arguments weight the corresponding
    scoring functions. text is the document in unicode. newlinechar is the
    character the document uses for newlines; Project Gutenberg uses '\\r\\n'.
    '''
    import os
    import pickle
    lines = sentencesplitter.splitter(text, lang)
    picked_sentences = {}
    counttot = len(lines)
    fdist = freqtoeba.fdist_loader(lang)
    if fdist == -1:
        # Cache miss: rebuild the frequency distribution and pickle it.
        stopwordpath = os.path.join(os.path.dirname(__file__), '..', '..',
                                    'stopword', 'stop-words_english_1_en.txt')
        fdist = freqtoeba.fdist_gen('eng', 'sentences.csv', stopwordpath)
        with open('fdist.pkl', 'wb') as f:
            pickle.dump(fdist, f)
    # Create the common-word set once up front so that we don't have to
    # rebuild it for every sentence.
    file_name = 'wikifiction.txt'
    newfilepath = os.path.join(os.path.dirname(__file__), file_name)
    topwordset = pop_word_set_gen(newfilepath)
    for i in range(0, len(lines)):
        # Normalize wrapped text, then tokenize into lowercase words.
        lines[i] = utility.unwrapper(lines[i], newlinechar)
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])
        sentlen = len(sent_words)
        sent_less_stop_words = utility.less_stopwords(sent_words)
        countinfo = topwordcheck(sent_less_stop_words, topwordset)
        # Compute each component score once, then the weighted total.
        score_length_num = score_length(min_thresh, max_thresh, sentlen, equally=True)
        score_firstchar_upper_num = score_firstchar_upper(lines[i])
        score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
        score_iwf_num = score_iwf(sent_less_stop_words, fdist, counttot, True)
        totscore = (weight_len * score_length_num
                    + weight_firstchar * score_firstchar_upper_num
                    + weight_common * score_common_words_num
                    + weight_iwf * score_iwf_num)
        # Store both the raw and the weighted value of every component.
        picked_sentences[lines[i]] = SENTENCE(
            text=lines[i],
            score_iwf=(score_iwf_num, weight_iwf * score_iwf_num),
            score_length=(score_length_num, weight_len * score_length_num),
            score_common_words=(score_common_words_num, weight_common * score_common_words_num),
            score_firstchar_upper=(score_firstchar_upper_num, weight_firstchar * score_firstchar_upper_num),
            score=totscore)
        if sentlen > max_thresh:
            # Overlong sentences may still contain short quoted dialogues
            # worth scoring on their own.
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)
                score_length_num = score_length(min_thresh, max_thresh, dialen, equally=True)
                score_firstchar_upper_num = score_firstchar_upper(dialogue)
                score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
                score_iwf_num = score_iwf(dia_less_stop_words, fdist, counttot, True)
                totscore = (weight_len * score_length_num
                            + weight_firstchar * score_firstchar_upper_num
                            + weight_common * score_common_words_num
                            + weight_iwf * score_iwf_num)
                picked_sentences[dialogue] = SENTENCE(
                    text=dialogue,
                    score_iwf=(score_iwf_num, weight_iwf * score_iwf_num),
                    score_length=(score_length_num, weight_len * score_length_num),
                    score_common_words=(score_common_words_num, weight_common * score_common_words_num),
                    score_firstchar_upper=(score_firstchar_upper_num, weight_firstchar * score_firstchar_upper_num),
                    score=totscore)
    sorted_picked = sorted(picked_sentences.iteritems(), key=dict_sort, reverse=True)
    return sorted_picked
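# A minimal usage sketch, assuming a Project Gutenberg book already saved to
# disk as UTF-8; the file name and thresholds are illustrative only.
def _demo_sentence_picker():
    import codecs
    with codecs.open('book.txt', 'r', 'utf-8') as f:
        text = f.read()
    ranked = sentence_picker(text, 'eng', min_thresh=4, max_thresh=12,
                             uncommon_thresh=1, newlinechar='\r\n')
    # ranked is a list of (sentence, SENTENCE) pairs, best-scoring first.
    for sentence, info in ranked[:10]:
        print '%.2f\t%s' % (info.score, sentence.encode('utf-8'))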