def stopwordgen():
    '''Plot stopword coverage against word frequency for the Tatoeba corpus.

    Loads (or builds and caches) a frequency distribution, then plots the
    cumulative percentage of stopwords encountered as frequency decreases,
    along with the word-count-per-frequency curve.
    '''
    try:
        # load the cached frequency distribution if it exists
        with open('fdist.pkl', 'rb') as f:
            fdist = pickle.load(f)
        print 'loading'
    except (IOError, EOFError):
        # cache miss: build the distribution and cache it for next time
        fdist = freqtoeba.fdist_gen('eng', 'sentences.csv')
        with open('fdist.pkl', 'wb') as f:
            pickle.dump(fdist, f)
    # fdist = freqtoeba.fdist_gen('hin', 'sentences.csv')
    x = fdist.values()
    maximum = fdist[fdist.max()]
    y = [50 for item in x]
    pylab.plot(x, y, 'm.', label='sampled')
    stopwords = list(load_set_stopwords('stopword/stop-words_english_1_en.txt'))  # currently unused
    STOPWORDS = list(set(nltk.corpus.stopwords.words('english')))
    totstopwords = len(STOPWORDS)
    for word in STOPWORDS:
        pylab.plot([fdist[word.lower()]], [45], 'r.')
        # print word, fdist[word.lower()]
    xstop = []
    ystop = []
    stopwordenc = 0
    xnor = []
    ynor = []
    norenc = 0
    norperc = 0
    # FreqDist iterates samples from most to least frequent, so these curves
    # give cumulative percentages as the frequency threshold is lowered.
    for word in fdist:
        # print word
        if word in STOPWORDS:
            stopwordenc += 1
            # stopwordperc = (float(totstopwords - stopwordenc)/totstopwords)*100
            stopwordperc = (float(stopwordenc) / totstopwords) * 100
            xstop.append(fdist[word.lower()])
            ystop.append(stopwordperc)
        else:
            norenc += 1
            norperc = (float(norenc) / len(fdist)) * 100
            xnor.append(fdist[word])
            ynor.append(norperc)
    pylab.plot(xstop, ystop, linestyle='-', color='c')
    # pylab.plot(xnor, ynor, linestyle='--', color='b')
    pylab.grid(True)
    # print xnor
    # print ynor
    # print x
    # Plot the number of words against frequency.
    scores = {}
    wordloss = 0
    for word in fdist:
        try:
            scores[fdist[word]] = scores[fdist[word]] + 1
        except KeyError:
            scores[fdist[word]] = 1
        # count non-stopwords that would be lost above the frequency threshold
        if fdist[word] > 3540 and word not in STOPWORDS:
            wordloss = wordloss + 1
    pylab.plot(scores.keys(), scores.values(), linestyle='-', color='b')
    print 'word loss is ', wordloss
    # print scores
    print len(STOPWORDS)
    pylab.ylim(-2, 110)
    pylab.xlim(-2000, maximum)
    pylab.xlabel('Frequency')
    pylab.ylabel('Percentage of stopwords above threshold')
    pylab.title('Stopwords and Frequencies Experiment')
    pylab.show()
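
# A minimal, self-contained sketch of the coverage curve computed above, using
# a toy frequency list instead of the Tatoeba fdist; the words, counts, and
# stopword list are made up purely for illustration.
def _toy_stopword_coverage():
    '''Illustrative only: (frequency, cumulative stopword %) points.'''
    toy_fdist = [('the', 900), ('of', 640), ('cat', 120), ('and', 100), ('mat', 15)]
    toy_stopwords = set(['the', 'of', 'and'])
    seen, points = 0, []
    for word, freq in toy_fdist:  # already sorted most -> least frequent
        if word in toy_stopwords:
            seen += 1
            points.append((freq, 100.0 * seen / len(toy_stopwords)))
    return points  # [(900, 33.3...), (640, 66.6...), (100, 100.0)]
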
# NOTE: this earlier version of sentence_picker is shadowed by the
# redefinition of sentence_picker further below, which takes raw text
# instead of pre-split lines.
def sentence_picker(min_thresh, max_thresh, uncommon_thresh, lines, newlinechar):
    '''Return a dict mapping useful sentences, picked out of the dump of
    sentences given as input, to their scores.

    Arguments min_thresh and max_thresh are the minimum and maximum
    sentence lengths (in words) allowed. lines is a list of sentences in
    unicode. newlinechar is the character the document uses for newlines;
    Project Gutenberg uses '\r\n'.
    '''
    sent_dia = []
    picked_sentences = {}
    weight_iwf = 1
    weight_common = 1
    weight_len = 1
    weight_firstchar = 1
    count = 0
    counttot = len(lines)
    count_above_max = 0
    count_below_min = 0
    count_dialogue = 0
    # TODO: fdist should be loaded from a pickle, this takes a lot of time
    fdist = freqtoeba.fdist_gen('eng', 'sentences.csv',
                                'stopword/stop-words_english_1_en.txt')
    # We create the top-word set once here so that we don't have to
    # create it time and again.
    topwordset = pop_word_set_gen('wikifiction.txt')
    for i in range(0, len(lines)):
        # normalize wrapped text
        lines[i] = utility.unwrapper(lines[i], newlinechar)
        # tokenize each sentence into lowercase words
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])
        sentlen = len(sent_words)
        # print lines[i].encode('utf-8')
        sent_less_stop_words = utility.less_stopwords(sent_words)
        sentlen_less_stop_words = len(sent_less_stop_words)
        # TODO: for the time being; later get a better score using missing vocab
        # The total score is a weighted sum of the individual scoring functions.
        score = 0
        score = score + weight_len * score_length(min_thresh, max_thresh, sentlen, equally=True)
        score = score + weight_firstchar * score_firstchar_upper(lines[i])
        countinfo = topwordcheck(sent_less_stop_words, topwordset)
        score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)
        score = score + weight_iwf * score_iwf(sent_less_stop_words, fdist, counttot, True)
        # print lines[i], score_iwf(sent_less_stop_words, fdist)
        picked_sentences[lines[i]] = score
        # sentences that are too long may still contain usable quoted dialogue
        if sentlen > max_thresh:
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                score = 0
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                dialen_less_stop_words = len(dia_less_stop_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)
                score = score + weight_len * score_length(min_thresh, max_thresh, dialen, equally=True)
                score = score + weight_firstchar * score_firstchar_upper(dialogue)
                score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)
                score = score + weight_iwf * score_iwf(dia_less_stop_words, fdist, counttot, True)
                # print dialogue, score_iwf(dia_less_stop_words, fdist)
                picked_sentences[dialogue] = score
    # Older hard-threshold filtering approach, kept here commented out:
    # if sentlen >= min_thresh and sentlen <= max_thresh:
    #     count = count + 1
    #     num_common_words = topwordcheck(sent_less_stop_words, topwordset)[1]
    #     if num_common_words < sentlen_less_stop_words - 1:
    #         continue
    #     if not is_firstchar_upper(lines[i]):
    #         continue
    #     picked_sentences[punc_strippers(lines[i])] = score
    # if sentlen > max_thresh:
    #     dialogues = re.findall(r'"(.*?)"', lines[i])
    #     for dialogue in dialogues:
    #         dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
    #         wrdcnt = len(dia_words)
    #         dia_less_stop_words = utility.less_stopwords(dia_words)
    #         dialen_less_stop_words = len(dia_less_stop_words)
    #         score = dialen_less_stop_words
    #         if wrdcnt >= 4:
    #             num_common_words = topwordcheck(dia_less_stop_words, topwordset)[1]
    #             if num_common_words < dialen_less_stop_words - 1:
    #                 continue
    #             if not is_firstchar_upper(dialogue):
    #                 continue
    #             sent_dia.append(dialogue)
    #             picked_sentences[dialogue] = score
    #             count_dialogue += 1
    #     count_above_max += 1
    # if sentlen < min_thresh:
    #     count_below_min += 1
    return picked_sentences
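
# A minimal, hypothetical sketch of the weighted-sum scoring used by both
# versions of sentence_picker: each scorer maps a sentence to a number, and
# the total is the sum of weight * score over all scorers. The scorers below
# are stand-ins, not the real score_* functions in this module.
def _toy_weighted_score(sentence):
    '''Illustrative only: combine per-feature scores with unit weights.'''
    words = sentence.split()
    scorers = [
        (1, lambda w: 1.0 if 4 <= len(w) <= 12 else 0.0),         # length band
        (1, lambda w: 1.0 if w and w[0][:1].isupper() else 0.0),  # first char upper
    ]
    return sum(weight * scorer(words) for weight, scorer in scorers)

# e.g. _toy_weighted_score(u'The cat sat on the mat.') -> 2.0
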
def sentence_picker(text, lang, min_thresh, max_thresh, uncommon_thresh, newlinechar,
                    weight_iwf=1, weight_common=1, weight_firstchar=1, weight_len=1):
    '''Return a list of (sentence, SENTENCE) pairs picked out of the text
    given as input, sorted by total score in descending order.

    Arguments min_thresh and max_thresh are the minimum and maximum
    sentence lengths (in words) allowed. The weight_* arguments weight the
    corresponding scoring functions. text is the document as a unicode
    string and lang is its language code. newlinechar is the character the
    document uses for newlines; Project Gutenberg uses '\r\n'.
    '''
    import os
    # debug output
    print os.getcwd()
    print type(text)
    lines = sentencesplitter.splitter(text, lang)
    sent_dia = []
    picked_sentences = {}
    count = 0
    counttot = len(lines)
    count_above_max = 0
    count_below_min = 0
    count_dialogue = 0
    fdist = freqtoeba.fdist_loader(lang)
    if fdist == -1:
        # no cached distribution for this language: rebuild and cache it
        stopwordpath = os.path.join(os.path.dirname(__file__), '..', '..',
                                    'stopword', 'stop-words_english_1_en.txt')
        print 'sw path is: ', stopwordpath
        # NOTE: stopwordpath is only printed for debugging; it is not passed
        # to fdist_gen here.
        fdist = freqtoeba.fdist_gen('eng', 'sentences.csv')
        with open('fdist.pkl', 'wb') as f:
            pickle.dump(fdist, f)
    # We create the top-word set once here so that we don't have to
    # create it time and again.
    file_name = 'wikifiction.txt'
    newfilepath = os.path.join(os.path.dirname(__file__), file_name)
    topwordset = pop_word_set_gen(newfilepath)
    for i in range(0, len(lines)):
        # normalize wrapped text
        lines[i] = utility.unwrapper(lines[i], newlinechar)
        # tokenize each sentence into lowercase words
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])
        sentlen = len(sent_words)
        sent_less_stop_words = utility.less_stopwords(sent_words)
        sentlen_less_stop_words = len(sent_less_stop_words)
        # TODO: for the time being; later get a better score using missing vocab
        # Each scorer is evaluated once; the SENTENCE record keeps both the
        # raw and the weighted value of every score.
        countinfo = topwordcheck(sent_less_stop_words, topwordset)
        score_length_num = score_length(min_thresh, max_thresh, sentlen, equally=True)
        score_length_val = score_length_num, weight_len * score_length_num
        score_firstchar_upper_num = score_firstchar_upper(lines[i])
        score_firstchar_upper_val = score_firstchar_upper_num, weight_firstchar * score_firstchar_upper_num
        score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
        score_common_words_val = score_common_words_num, weight_common * score_common_words_num
        score_iwf_num = score_iwf(sent_less_stop_words, fdist, counttot, True)
        score_iwf_val = score_iwf_num, weight_iwf * score_iwf_num
        totscore = (score_length_val[1] + score_firstchar_upper_val[1] +
                    score_common_words_val[1] + score_iwf_val[1])
        picked_sentences[lines[i]] = SENTENCE(
            text=lines[i],
            score_iwf=score_iwf_val,
            score_length=score_length_val,
            score_common_words=score_common_words_val,
            score_firstchar_upper=score_firstchar_upper_val,
            score=totscore
        )
        # sentences that are too long may still contain usable quoted dialogue
        if sentlen > max_thresh:
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                dialen_less_stop_words = len(dia_less_stop_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)
                score_length_num = score_length(min_thresh, max_thresh, dialen, equally=True)
                score_length_val = score_length_num, weight_len * score_length_num
                score_firstchar_upper_num = score_firstchar_upper(dialogue)
                score_firstchar_upper_val = score_firstchar_upper_num, weight_firstchar * score_firstchar_upper_num
                score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
                score_common_words_val = score_common_words_num, weight_common * score_common_words_num
                score_iwf_num = score_iwf(dia_less_stop_words, fdist, counttot, True)
                score_iwf_val = score_iwf_num, weight_iwf * score_iwf_num
                totscore = (score_length_val[1] + score_firstchar_upper_val[1] +
                            score_common_words_val[1] + score_iwf_val[1])
                picked_sentences[dialogue] = SENTENCE(
                    text=dialogue,
                    score_iwf=score_iwf_val,
                    score_length=score_length_val,
                    score_common_words=score_common_words_val,
                    score_firstchar_upper=score_firstchar_upper_val,
                    score=totscore
                )
    sorted_picked = sorted(picked_sentences.iteritems(), key=dict_sort, reverse=True)
    print sorted_picked[-1]
    return sorted_picked
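
# A minimal, hedged usage sketch of sentence_picker above; the file name and
# the threshold values are illustrative only, and the __main__ guard keeps it
# from running on import.
if __name__ == '__main__':
    import codecs
    with codecs.open('book.txt', encoding='utf-8') as f:
        book_text = f.read()
    ranked = sentence_picker(book_text, 'eng', min_thresh=4, max_thresh=20,
                             uncommon_thresh=2, newlinechar='\r\n')
    # highest-scoring sentences first
    for sentence, info in ranked[:10]:
        print info.score, sentence.encode('utf-8')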