Example #1

import re

# Project-local modules; the scoring helpers used below (pop_word_set_gen,
# score_length, score_firstchar_upper, score_common_words, score_iwf,
# topwordcheck) are assumed to be defined in or imported into this module.
import freqtoeba
import utility
import wordtokenizer

def sentence_picker(min_thresh, max_thresh, uncommon_thresh, lines, newlinechar):
    '''Return a dict mapping useful sentences (picked out of the dump of
    sentences given as input) to their scores.

    Arguments min_thresh and max_thresh are the minimum and maximum
    sentence lengths allowed, in words.

    lines is a list of sentences in unicode.

    newlinechar is the string the document uses for newlines. Project
    Gutenberg uses '\r\n'.
    '''
    picked_sentences = {}

    # Per-component weights for the total score.
    weight_iwf = 1
    weight_common = 1
    weight_len = 1
    weight_firstchar = 1

    counttot = len(lines)
    # TODO: fdist should be loaded from a pickle; regenerating it takes a long time.
    fdist = freqtoeba.fdist_gen('eng', 'sentences.csv', 'stopword/stop-words_english_1_en.txt')


    # Build the set of popular words once up front so that we don't have
    # to recreate it for every sentence.
    topwordset = pop_word_set_gen('wikifiction.txt')

    for i in range(len(lines)):

        # Normalize wrapped text back into a single line.
        lines[i] = utility.unwrapper(lines[i], newlinechar)
        # Tokenize the sentence into lowercase words.
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])
        sentlen = len(sent_words)

        sent_less_stop_words = utility.less_stopwords(sent_words)

        # TODO: for the time being; later, get a better score using missing vocab.
        countinfo = topwordcheck(sent_less_stop_words, topwordset)

        # The total score is a weighted sum of the component scores.
        score = weight_len * score_length(min_thresh, max_thresh, sentlen, equally=True)
        score += weight_firstchar * score_firstchar_upper(lines[i])
        score += weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)
        score += weight_iwf * score_iwf(sent_less_stop_words, fdist, counttot, True)

        picked_sentences[lines[i]] = score


        # Overlong sentences often contain usable quoted dialogue; score each
        # quoted span on its own.
        if sentlen > max_thresh:
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)

                score = weight_len * score_length(min_thresh, max_thresh, dialen, equally=True)
                score += weight_firstchar * score_firstchar_upper(dialogue)
                score += weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)
                score += weight_iwf * score_iwf(dia_less_stop_words, fdist, counttot, True)
                picked_sentences[dialogue] = score


    return picked_sentences
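
A minimal usage sketch for the function above (the file name 'book.txt' and
the threshold values are illustrative assumptions, not from the source; the
project-local modules must be importable):

if __name__ == '__main__':
    # Hypothetical driver: read a Project Gutenberg dump, split it into
    # rough sentence chunks, and print the ten highest-scored sentences.
    with open('book.txt') as f:
        text = f.read().decode('utf-8')  # Python 2: decode bytes to unicode
    lines = text.split('. ')  # crude split; Example #2 below uses a real sentence splitter
    scores = sentence_picker(4, 20, 2, lines, '\r\n')
    for sent, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:10]:
        print sent.encode('utf-8'), score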
Example #2

import os
import pickle
import re

# Project-local modules; SENTENCE, dict_sort and the scoring helpers used
# below are assumed to be defined in or imported into this module.
import freqtoeba
import sentencesplitter
import utility
import wordtokenizer

def sentence_picker(text, lang, min_thresh, max_thresh, uncommon_thresh, newlinechar, weight_iwf=1, weight_common=1, weight_firstchar=1, weight_len=1):
    '''Return a list of (sentence, SENTENCE) pairs picked out of the dump
    of text given as input, sorted by score in descending order.

    text is the document as a unicode string; lang selects the language
    for sentence splitting.

    Arguments min_thresh and max_thresh are the minimum and maximum
    sentence lengths allowed, in words.

    The weight_* arguments weight the corresponding scoring functions in
    the total score.

    newlinechar is the string the document uses for newlines. Project
    Gutenberg uses '\r\n'.
    '''

    lines = sentencesplitter.splitter(text, lang)
    picked_sentences = {}
    counttot = len(lines)

    # Load the cached frequency distribution; if no cache exists yet
    # (fdist_loader returns -1), regenerate it and cache it as a pickle.
    fdist = freqtoeba.fdist_loader(lang)
    if fdist == -1:
        stopwordpath = os.path.join(os.path.dirname(__file__), '..', '..',
                                    'stopword', 'stop-words_english_1_en.txt')
        fdist = freqtoeba.fdist_gen('eng', 'sentences.csv', stopwordpath)
        with open('fdist.pkl', 'wb') as f:
            pickle.dump(fdist, f)


    # Build the set of popular words once up front so that we don't have
    # to recreate it for every sentence.
    file_name = 'wikifiction.txt'
    newfilepath = os.path.join(os.path.dirname(__file__), file_name)
    topwordset = pop_word_set_gen(newfilepath)

    for i in range(len(lines)):

        # Normalize wrapped text back into a single line.
        lines[i] = utility.unwrapper(lines[i], newlinechar)
        # Tokenize the sentence into lowercase words.
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])
        sentlen = len(sent_words)

        sent_less_stop_words = utility.less_stopwords(sent_words)

        # TODO: for the time being; later, get a better score using missing vocab.
        countinfo = topwordcheck(sent_less_stop_words, topwordset)

        # Compute each component score once; keep the (unweighted, weighted)
        # pair for each component, and sum the weighted values into the total.
        score_length_num = score_length(min_thresh, max_thresh, sentlen, equally=True)
        score_firstchar_upper_num = score_firstchar_upper(lines[i])
        score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
        score_iwf_num = score_iwf(sent_less_stop_words, fdist, counttot, True)

        score_length_val = score_length_num, weight_len * score_length_num
        score_firstchar_upper_val = score_firstchar_upper_num, weight_firstchar * score_firstchar_upper_num
        score_common_words_val = score_common_words_num, weight_common * score_common_words_num
        score_iwf_val = score_iwf_num, weight_iwf * score_iwf_num

        totscore = (score_length_val[1] + score_firstchar_upper_val[1]
                    + score_common_words_val[1] + score_iwf_val[1])

        picked_sentences[lines[i]] = SENTENCE(
            text=lines[i],
            score_iwf=score_iwf_val,
            score_length=score_length_val,
            score_common_words=score_common_words_val,
            score_firstchar_upper=score_firstchar_upper_val,
            score=totscore,
            )
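        # Note: SENTENCE is assumed to be a project-defined record type (a
        # namedtuple with one field per component plus the total); a minimal
        # sketch consistent with this usage appears after this example.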




        # Overlong sentences often contain usable quoted dialogue; score each
        # quoted span on its own.
        if sentlen > max_thresh:
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)

                score_length_num = score_length(min_thresh, max_thresh, dialen, equally=True)
                score_firstchar_upper_num = score_firstchar_upper(dialogue)
                score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
                score_iwf_num = score_iwf(dia_less_stop_words, fdist, counttot, True)

                score_length_val = score_length_num, weight_len * score_length_num
                score_firstchar_upper_val = score_firstchar_upper_num, weight_firstchar * score_firstchar_upper_num
                score_common_words_val = score_common_words_num, weight_common * score_common_words_num
                score_iwf_val = score_iwf_num, weight_iwf * score_iwf_num

                totscore = (score_length_val[1] + score_firstchar_upper_val[1]
                            + score_common_words_val[1] + score_iwf_val[1])

                picked_sentences[dialogue] = SENTENCE(
                    text=dialogue,
                    score_iwf=score_iwf_val,
                    score_length=score_length_val,
                    score_common_words=score_common_words_val,
                    score_firstchar_upper=score_firstchar_upper_val,
                    score=totscore,
                    )


    # Sort by total score, highest first. dict_sort is assumed to be a key
    # function defined elsewhere in the project (see the sketch below).
    sorted_picked = sorted(picked_sentences.iteritems(), key=dict_sort, reverse=True)
    return sorted_picked
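
The code above assumes a SENTENCE record, a dict_sort key function and a
freqtoeba.fdist_loader helper defined elsewhere in the project. A minimal
sketch consistent with how they are used here (these definitions are
assumptions, not the project's actual implementations):

import pickle
from collections import namedtuple

# One field per component score, each holding an (unweighted, weighted)
# pair, plus the weighted total.
SENTENCE = namedtuple('SENTENCE', ['text', 'score_iwf', 'score_length',
                                   'score_common_words',
                                   'score_firstchar_upper', 'score'])

def dict_sort(item):
    '''Sort key for (text, SENTENCE) pairs: order by total score.'''
    return item[1].score

def fdist_loader(lang):
    '''Return the cached frequency distribution, or -1 if no cache exists
    yet (matching the -1 check in sentence_picker above). lang is accepted
    for API parity; this sketch caches a single file.'''
    try:
        with open('fdist.pkl', 'rb') as f:
            return pickle.load(f)
    except IOError:
        return -1

Example call, with illustrative threshold values:

picked = sentence_picker(text, 'eng', 4, 20, 2, '\r\n')
best_text, best_record = picked[0]  # highest-scored sentence and its score breakdown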