Example #1
import pickle

import nltk
import pylab
import freqtoeba  # project-local module


def stopwordgen():
    # Load the cached frequency distribution, or build and cache it.
    try:
        with open('fdist.pkl', 'rb') as f:
            fdist = pickle.load(f)
            print 'loading'
    except (IOError, EOFError):
        fdist = freqtoeba.fdist_gen('eng', 'sentences.csv')
        with open('fdist.pkl', 'wb') as f:
            pickle.dump(fdist, f)

    # fdist = freqtoeba.fdist_gen('hin', 'sentences.csv')
    x = fdist.values()
    maximum = fdist[fdist.max()]  # frequency of the most common word
    y = [50 for item in x]        # plot every sampled frequency at a fixed height
    pylab.plot(x, y, 'm.', label='sampled')
    stopwords = list(load_set_stopwords('stopword/stop-words_english_1_en.txt'))  # helper defined elsewhere in the project
    
    STOPWORDS = list(set(nltk.corpus.stopwords.words('english')))
    totstopwords = len(STOPWORDS)
    for word in STOPWORDS:
        pylab.plot([fdist[word.lower()]], [45], 'r.')
        # print word, fdist[word.lower()]
    xstop = []
    ystop = []
    stopwordenc = 0
    xnor = []
    ynor = []
    norenc = 0
    norperc = 0
    for word in fdist:
        
        # word = fdist[sample]
        # print word
        if word in STOPWORDS:
            stopwordenc += 1
            # stopwordperc = (float(totstopwords - stopwordenc)/totstopwords)*100
            stopwordperc = (float(stopwordenc)/totstopwords)*100
            xstop.append(fdist[word.lower()])
            ystop.append(stopwordperc)
        else:
            norenc += 1
            norperc = (float(norenc)/len(fdist))*100
            xnor.append(fdist[word])
            ynor.append(norperc)

    pylab.plot(xstop, ystop, linestyle='-', color='c')
    # pylab.plot(xnor, ynor, linestyle='--', color='b')
    pylab.grid(True)
    # print xnor
    # print ynor
    # print x
    # print mean(x)

    # plotting the number of words against each frequency value
    scores = {}
    wordloss = 0
    for word in fdist:
        try:
            scores[fdist[word]] = scores[fdist[word]] + 1
        except KeyError:
            scores[fdist[word]] = 1
        if fdist[word] > 3540 and word not in STOPWORDS:
            wordloss = wordloss + 1


    pylab.plot(scores.keys(), scores.values(), linestyle='-', color='b')
    print 'word loss is ', wordloss

    # print scores
    print len(STOPWORDS)

    #end of that part
    pylab.ylim(-2, 110)
    pylab.xlim(-2000, maximum)
    pylab.xlabel('Frequency')
    pylab.ylabel('Percentage of stopwords above threshold')
    pylab.title('Stopwords and Frequencies Experiment')
    pylab.show()
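The try/except at the top of this example is a small cache: load fdist from fdist.pkl if it is already there, otherwise rebuild it from sentences.csv and save it for next time. A minimal standalone sketch of that pattern, assuming freqtoeba.fdist_gen is importable with the two-argument signature used above (the helper name load_or_build_fdist is made up for illustration):

import os
import pickle

import freqtoeba  # project-local module assumed importable


def load_or_build_fdist(pickle_path='fdist.pkl', lang='eng', corpus='sentences.csv'):
    """Load a cached frequency distribution, or build and cache it."""
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)
    fdist = freqtoeba.fdist_gen(lang, corpus)
    with open(pickle_path, 'wb') as f:
        pickle.dump(fdist, f)
    return fdist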
Example #2
import re

import freqtoeba        # project-local modules
import utility
import wordtokenizer


def sentence_picker(min_thresh, max_thresh, uncommon_thresh, lines, newlinechar):
    '''Return the useful sentences picked out of the dump of input
    sentences, as a dict mapping each sentence to its score.

    Arguments min_thresh and max_thresh are the minimum and maximum
    sentence lengths allowed.

    lines is a list of sentences in unicode.

    newlinechar is the character the document uses for newlines. Project
    Gutenberg uses '\r\n'.
    '''
    sent_dia = []
    picked_sentences = {}

    weight_iwf = 1
    weight_common = 1
    weight_len = 1
    weight_firstchar = 1

    count = 0
    counttot = len(lines)
    count_above_max = 0
    count_below_min = 0
    count_dialogue = 0
    # TODO: fdist should be loaded from a pickle; regenerating it takes a lot of time
    fdist = freqtoeba.fdist_gen('eng', 'sentences.csv', 'stopword/stop-words_english_1_en.txt')


    # Build the popular-word set once here so that we don't have to
    # recreate it for every sentence.
    topwordset = pop_word_set_gen('wikifiction.txt')

    for i in range(0, len(lines)):

        # normalizing wrapped text
        lines[i] = utility.unwrapper(lines[i], newlinechar)

        #tokenizing each sentence into words which are lowercase
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])

        sentlen = len(sent_words)
        # print lines[i].encode('utf-8')
        # print 'h\n'


        sent_less_stop_words = utility.less_stopwords(sent_words)


        sentlen_less_stop_words = len(sent_less_stop_words)

        # TODO: for the time being; later, compute a better score using missing vocabulary

        score = 0

        score = score + weight_len * score_length(min_thresh, max_thresh, sentlen, equally=True)
        score = score + weight_firstchar * score_firstchar_upper(lines[i])

        countinfo = topwordcheck(sent_less_stop_words, topwordset)

        score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)

        score = score + weight_iwf * score_iwf(sent_less_stop_words, fdist, counttot, True)
        # print lines[i], score_iwf(sent_less_stop_words, fdist)

        picked_sentences[lines[i]] = score


        if sentlen > max_thresh:
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                score = 0
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                dialen_less_stop_words = len(dia_less_stop_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)

                score = score + weight_len * score_length(min_thresh, max_thresh, dialen, equally=True)
                score = score + weight_firstchar * score_firstchar_upper(dialogue)
                score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)
                score = score + weight_iwf * score_iwf(dia_less_stop_words, fdist, counttot, True)
                # print dialogue, score_iwf(dia_less_stop_words, fdist)
                picked_sentences[dialogue] = score



    #     if sentlen >= min_thresh and sentlen <= max_thresh:
    #         count = count + 1
    #         # print lines[i]

    #         num_common_words = topwordcheck(sent_less_stop_words, topwordset)[1]
    #         if num_common_words < sentlen_less_stop_words-1:
    #             continue
    #         if not is_firstchar_upper(lines[i]):
    #             continue

    #         picked_sentences[punc_strippers(lines[i])] = score
    #         # print '\n'

    #     if sentlen > max_thresh:
    #         dialogues = re.findall(r'"(.*?)"', lines[i])
    #         # print dialogues
    #         for dialogue in dialogues:
    #             dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
    #             wrdcnt = len(dia_words)
    #             dia_less_stop_words = utility.less_stopwords(dia_words)
    #             dialen_less_stop_words = len(dia_less_stop_words)
    #             score = dialen_less_stop_words
    #             if wrdcnt >= 4:
    #                 # print dialogue.encode('utf-8')

    #                 num_common_words = topwordcheck(dia_less_stop_words, topwordset)[1]
    #                 if num_common_words < dialen_less_stop_words-1:
    #                     continue
    #                 if not is_firstchar_upper(dialogue):
    #                     continue

    #                 sent_dia.append(dialogue)
    #                 picked_sentences[dialogue] = score
                    
    #                 count_dialogue += 1
    #         count_above_max += 1
    #     if sentlen < min_thresh:
    #         count_below_min += 1


    return picked_sentences
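A hedged usage sketch for this version of sentence_picker, assuming the code above lives in a module importable as sentencepicker and that the input has already been split into unicode sentences upstream (the file name book.txt, the crude paragraph split, and the threshold values are all made up for illustration; example #3 below does the splitting itself with sentencesplitter.splitter):

# -*- coding: utf-8 -*-
import codecs

import sentencepicker  # hypothetical module name for the code above

with codecs.open('book.txt', encoding='utf-8') as f:   # illustrative input file
    lines = [part for part in f.read().split('\r\n\r\n') if part.strip()]

picked = sentencepicker.sentence_picker(min_thresh=4, max_thresh=20,
                                        uncommon_thresh=2, lines=lines,
                                        newlinechar='\r\n')

# picked maps each candidate sentence to its combined score
for sent, score in sorted(picked.iteritems(), key=lambda kv: kv[1], reverse=True)[:10]:
    print score, sent.encode('utf-8')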
Example #3
import os
import pickle
import re

import freqtoeba        # project-local modules
import sentencesplitter
import utility
import wordtokenizer


def sentence_picker(text, lang, min_thresh, max_thresh, uncommon_thresh, newlinechar, weight_iwf=1, weight_common=1, weight_firstchar=1, weight_len=1):
    '''Return the useful sentences picked out of the input text, as a
    sorted list of (sentence, SENTENCE) tuples.

    Arguments min_thresh and max_thresh are the minimum and maximum
    sentence lengths allowed.

    The weight_* keyword arguments weight the corresponding scoring
    functions.

    text is the full document as unicode; it is split into sentences here.

    newlinechar is the character the document uses for newlines. Project
    Gutenberg uses '\r\n'.
    '''

    print os.getcwd()
    print type(text)
    lines = sentencesplitter.splitter(text, lang)
    sent_dia = []
    picked_sentences = {}
    count = 0
    counttot = len(lines)
    count_above_max = 0
    count_below_min = 0
    count_dialogue = 0

    fdist = freqtoeba.fdist_loader(lang)
    if fdist == -1:
        stopwordpath = os.path.join(os.path.dirname(__file__), '..', '..', 'stopword', 'stop-words_english_1_en.txt')
        print 'sw path is: ', stopwordpath
        fdist = freqtoeba.fdist_gen('eng', 'sentences.csv')
        with open('fdist.pkl', 'wb') as f:
            pickle.dump(fdist, f)


    # Build the popular-word set once here so that we don't have to
    # recreate it for every sentence.

    file_name = 'wikifiction.txt'
    newfilepath = os.path.join(os.path.dirname(__file__), file_name)
    topwordset = pop_word_set_gen(newfilepath)

    for i in range(0, len(lines)):

        # normalizing wrapped text
        lines[i] = utility.unwrapper(lines[i], newlinechar)

        #tokenizing each sentence into words which are lowercase
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])

        sentlen = len(sent_words)
        # print lines[i].encode('utf-8')
        # print 'h\n'


        sent_less_stop_words = utility.less_stopwords(sent_words)


        sentlen_less_stop_words = len(sent_less_stop_words)

        # TODO: for the time being; later, compute a better score using missing vocabulary

        score = 0

        score = score + weight_len * score_length(min_thresh, max_thresh, sentlen, equally=True)
        score = score + weight_firstchar * score_firstchar_upper(lines[i])

        countinfo = topwordcheck(sent_less_stop_words, topwordset)

        score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)

        score = score + weight_iwf * score_iwf(sent_less_stop_words, fdist, counttot, True)
        # print lines[i], score_iwf(sent_less_stop_words, fdist)

        totscore = score

        score_length_num = score_length(min_thresh, max_thresh, sentlen, equally=True)
        score_length_val = score_length_num, score_length_num*weight_len

        score_firstchar_upper_num = score_firstchar_upper(lines[i])
        score_firstchar_upper_val = score_firstchar_upper_num, score_firstchar_upper_num*weight_firstchar


        score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
        score_common_words_val = score_common_words_num, weight_common*score_common_words_num


        score_iwf_num = score_iwf(sent_less_stop_words, fdist, counttot, True)
        score_iwf_val =  score_iwf_num, weight_iwf*score_iwf_num

        picked_sentences[lines[i]] = SENTENCE(
            text=lines[i],
            score_iwf = score_iwf_val,
            score_length = score_length_val,
            score_common_words = score_common_words_val,
            score_firstchar_upper = score_firstchar_upper_val,
            score = totscore
            )

        if sentlen > max_thresh:
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                score = 0
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                dialen_less_stop_words = len(dia_less_stop_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)

                score = score + weight_len * score_length(min_thresh, max_thresh, dialen, equally=True)
                score = score + weight_firstchar * score_firstchar_upper(dialogue)
                score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)
                score = score + weight_iwf * score_iwf(dia_less_stop_words, fdist, counttot, True)
                # print dialogue, score_iwf(dia_less_stop_words, fdist)

                totscore = score

                score_length_num = score_length(min_thresh, max_thresh, dialen, equally=True)
                score_length_val = score_length_num, weight_len * score_length_num

                score_firstchar_upper_num = score_firstchar_upper(dialogue)
                score_firstchar_upper_val = score_firstchar_upper_num, score_firstchar_upper_num * weight_firstchar


                score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
                score_common_words_val = score_common_words_num, weight_common * score_common_words_num


                score_iwf_num =  score_iwf(dia_less_stop_words, fdist, counttot, True)
                score_iwf_val =  score_iwf_num, weight_iwf * score_iwf_num

                picked_sentences[dialogue] = SENTENCE(
                    text=dialogue,
                    score_iwf = score_iwf_val,
                    score_length = score_length_val,
                    score_common_words = score_common_words_val,
                    score_firstchar_upper = score_firstchar_upper_val,
                    score = totscore
                    )

    #     if sentlen >= min_thresh and sentlen <= max_thresh:
    #         count = count + 1
    #         # print lines[i]

    #         num_common_words = topwordcheck(sent_less_stop_words, topwordset)[1]
    #         if num_common_words < sentlen_less_stop_words-1:
    #             continue
    #         if not is_firstchar_upper(lines[i]):
    #             continue

    #         picked_sentences[punc_strippers(lines[i])] = score
    #         # print '\n'

    #     if sentlen > max_thresh:
    #         dialogues = re.findall(r'"(.*?)"', lines[i])
    #         # print dialogues
    #         for dialogue in dialogues:
    #             dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
    #             wrdcnt = len(dia_words)
    #             dia_less_stop_words = utility.less_stopwords(dia_words)
    #             dialen_less_stop_words = len(dia_less_stop_words)
    #             score = dialen_less_stop_words
    #             if wrdcnt >= 4:
    #                 # print dialogue.encode('utf-8')

    #                 num_common_words = topwordcheck(dia_less_stop_words, topwordset)[1]
    #                 if num_common_words < dialen_less_stop_words-1:
    #                     continue
    #                 if not is_firstchar_upper(dialogue):
    #                     continue

    #                 sent_dia.append(dialogue)
    #                 picked_sentences[dialogue] = score

    #                 count_dialogue += 1
    #         count_above_max += 1
    #     if sentlen < min_thresh:
    #         count_below_min += 1


    sorted_picked = sorted(picked_sentences.iteritems(), key=dict_sort, reverse=True)
    print sorted_picked[-1]
    return sorted_picked
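Neither SENTENCE nor dict_sort is defined in this excerpt. A plausible sketch of both, under the assumption that SENTENCE is a namedtuple whose score_* fields each hold the (raw, weighted) pair built above, and that dict_sort keys the (text, SENTENCE) pairs by total score for the final sort:

from collections import namedtuple

# Assumed record stored per picked sentence; each score_* field is a
# (raw_score, weighted_score) tuple, matching how the function above builds it.
SENTENCE = namedtuple('SENTENCE', ['text', 'score_iwf', 'score_length',
                                   'score_common_words', 'score_firstchar_upper',
                                   'score'])


def dict_sort(item):
    """Sort key for (sentence_text, SENTENCE) pairs: order by total score."""
    return item[1].score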