Example #1
def sentencesimilarity(lang1, sentence1, lang2, sentence2):
    '''sentence1 (in lang1) is translated word by word into lang2 and the
    similarity between the translated words and sentence2 is then calculated.
    We try to keep lang2 as English.
    '''
    print 'First sentence bag got is : '
    sentence1_words = wordtokenizer.wordtokenizer(lang1, sentence1.lower())
    print sentence1_words

    print 'Second sentence bag got is : '
    sentence2_words = wordtokenizer.wordtokenizer(lang2, sentence2.lower())
    print sentence2_words

    # return '1'
    translated_sentence = []
    for word in sentence1_words:
        translated_word = translation_lookup(word)
        print translated_word
        if translated_word:
            translated_sentence.extend(translated_word.split())

    print 'Translated bag for sentence1 is : '
    print translated_sentence

    #Now we have translated sentence in bag of words format. We need to check how many of these words exist in sentence2
    same_word_count = 0
    for word in translated_sentence:
        if word in sentence2_words:
            print word
            same_word_count = same_word_count + 1

    return same_word_count
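A minimal usage sketch for this variant, assuming the surrounding module provides the wordtokenizer module and the translation_lookup() helper used above; the Hindi/English sentence pair below is purely illustrative.

# Hypothetical call; wordtokenizer and translation_lookup are assumed to be
# available in the surrounding module and to handle the 'hin'/'eng' pair.
hindi_sentence = u'वह फुटबॉल खेलता है'
english_sentence = u'He plays football'
shared = sentencesimilarity('hin', hindi_sentence, 'eng', english_sentence)
print 'Words shared after translation:', shared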
Example #2
def sentencesimilarity(lang1, sentence1, lang2, sentence2):
    '''sentence1 (in lang1) is translated word by word into lang2 and the
    similarity between the translated words and sentence2 is then calculated,
    along with the difference in sentence lengths.
    We try to keep lang2 as English.
    '''
    print 'First sentence bag got is : '
    sentence1_words = wordtokenizer.wordtokenizer(lang1, sentence1.lower())
    for word in sentence1_words:
        print word.encode('utf-8') + ',',
    len_sent_one = len(sentence1_words)
    


    print '\nSecond sentence bag got is : '
    sentence2_words = wordtokenizer.wordtokenizer(lang2, sentence2.lower())
    print sentence2_words

    len_sent_two = len(sentence2_words)
    len_difference = len_sent_one - len_sent_two
    print 'Difference in sentence lengths is %s - %s = %s' % (len_sent_one, len_sent_two, len_difference)

    # return '1'
    translated_sentence = []
    for word in sentence1_words:
        translated_word = translation_lookup(word)
        #print translated_word
        if translated_word:
            translated_sentence.extend(translated_word.split())

    print 'Translated bag for sentence1 is : '
    print translated_sentence

    #Now we have translated sentence in bag of words format. We need to check how many of these words exist in sentence2
    same_word_count = 0
    for word in translated_sentence:
        if word in sentence2_words:
            print word
            same_word_count = same_word_count + 1

    return [same_word_count, len_difference]
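This variant also reports the difference in sentence lengths, so a caller can unpack both values; as above, the helper modules and the sentence pair are assumptions used only for illustration.

# Hypothetical call mirroring the sketch for example #1.
shared, length_gap = sentencesimilarity('hin', u'वह फुटबॉल खेलता है',
                                         'eng', u'He plays football')
print 'Shared words: %s, length difference: %s' % (shared, length_gap)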
Example #3
def fdist_gen(lang, corpusfile, stopwordfile=False):
    '''Returns the frequency distribution (nltk.probability.FreqDist)
    of a language based on the existing Tatoeba corpus. If a stopwordfile
    is given, removes the stop words listed in that file from the
    frequency distribution.
    '''

    sentences = corpusreader.corpusreader(lang, corpusfile)

    fdist = nltk.FreqDist()
    # STOPWORDS = set(nltk.corpus.stopwords.words('english'))
    if stopwordfile:
        stopwordset = load_set_stopwords(stopwordfile)
    else:
        stopwordset = False
    for sentence in sentences:
        words = wordtokenizer.wordtokenizer(lang, sentence)
        # print len(words)
        words = less_stopwords(words, stopwordset)
        # print len(words)
        # print '\n'
        # print words
        for word in words:
            # Note: FreqDist.inc() exists only in NLTK 2.x; in NLTK 3+ use
            # fdist[word.lower()] += 1 instead.
            fdist.inc(word.lower())

    # for key in fdist.keys()[:50]:
        # print key.encode('utf-8'), fdist[key]

    # print fdist['football']
    # print fdist['soccer']
    # print fdist['tom']


    # fdist.plot(500)
    print type(fdist)
    return fdist
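A sketch of how this generator might be driven and its result cached. The corpus file and stopword list paths are copied from example #4, and corpusreader/wordtokenizer are assumed to be importable from the surrounding module.

import pickle

# Build the distribution from the Tatoeba dump, filtering with the same
# stopword list example #4 uses (both paths are assumptions here).
fdist = fdist_gen('eng', 'sentences.csv', 'stopword/stop-words_english_1_en.txt')

# Example #4 notes that rebuilding the distribution is slow, so cache it.
with open('fdist.pkl', 'wb') as f:
    pickle.dump(fdist, f)

# Show the 50 most frequent tokens; NLTK 2.x FreqDist.keys() is sorted by
# decreasing frequency, matching the fdist.inc() call used above.
for word in fdist.keys()[:50]:
    print word.encode('utf-8'), fdist[word]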
Example #4
def sentence_picker(min_thresh, max_thresh, uncommon_thresh, lines, newlinechar):
    '''Return a dict mapping useful sentences, picked out of the dump of
    sentences given as input, to their scores.

    Arguments min_thresh and max_thresh are the minimum and maximum
    length of sentences allowed.

    lines is a list of sentences in unicode.

    newlinechar is the character sequence the document uses for newlines.
    Project Gutenberg uses '\r\n'.
    '''
    sent_dia = []
    picked_sentences = {}

    weight_iwf = 1
    weight_common = 1
    weight_len = 1
    weight_firstchar = 1

    count = 0
    counttot = len(lines)
    count_above_max = 0
    count_below_min = 0
    count_dialogue = 0
    #TODO fdist should be loaded from a pickle, this takes a lot of time
    fdist = freqtoeba.fdist_gen('eng', 'sentences.csv', 'stopword/stop-words_english_1_en.txt')


    #We create the set initially here so that we don't have to create
    # it time and again.
    topwordset = pop_word_set_gen('wikifiction.txt')

    for i in range(0, len(lines)):

        #normalizing wrapped text
        lines[i] = utility.unwrapper(lines[i], newlinechar)




        #tokenizing each sentence into words which are lowercase
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])

        sentlen = len(sent_words)
        # print lines[i].encode('utf-8')
        # print 'h\n'


        sent_less_stop_words = utility.less_stopwords(sent_words)


        sentlen_less_stop_words = len(sent_less_stop_words)





        #TODO For the time being , later get better score using missing vocab

        score = 0

        score = score + weight_len * score_length(min_thresh, max_thresh, sentlen, equally=True)
        score = score + weight_firstchar * score_firstchar_upper(lines[i])

        countinfo = topwordcheck(sent_less_stop_words, topwordset)

        score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)

        score = score + weight_iwf * score_iwf(sent_less_stop_words, fdist, counttot, True)
        # print lines[i], score_iwf(sent_less_stop_words, fdist)

        picked_sentences[lines[i]] = score


        if sentlen > max_thresh:
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                score = 0
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                dialen_less_stop_words = len(dia_less_stop_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)

                score = score + weight_len * score_length(min_thresh, max_thresh, dialen, equally=True)
                score = score + weight_firstchar * score_firstchar_upper(dialogue)
                score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)
                score = score + weight_iwf * score_iwf(dia_less_stop_words, fdist, counttot, True)
                # print dialogue, score_iwf(dia_less_stop_words, fdist)
                picked_sentences[dialogue] = score



    #     if sentlen >= min_thresh and sentlen <= max_thresh:
    #         count = count + 1
    #         # print lines[i]

    #         num_common_words = topwordcheck(sent_less_stop_words, topwordset)[1]
    #         if num_common_words < sentlen_less_stop_words-1:
    #             continue
    #         if not is_firstchar_upper(lines[i]):
    #             continue

    #         picked_sentences[punc_strippers(lines[i])] = score
    #         # print '\n'

    #     if sentlen > max_thresh:
    #         dialogues = re.findall(r'"(.*?)"', lines[i])
    #         # print dialogues
    #         for dialogue in dialogues:
    #             dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
    #             wrdcnt = len(dia_words)
    #             dia_less_stop_words = utility.less_stopwords(dia_words)
    #             dialen_less_stop_words = len(dia_less_stop_words)
    #             score = dialen_less_stop_words
    #             if wrdcnt >= 4:
    #                 # print dialogue.encode('utf-8')

    #                 num_common_words = topwordcheck(dia_less_stop_words, topwordset)[1]
    #                 if num_common_words < dialen_less_stop_words-1:
    #                     continue
    #                 if not is_firstchar_upper(dialogue):
    #                     continue

    #                 sent_dia.append(dialogue)
    #                 picked_sentences[dialogue] = score
                    
    #                 count_dialogue += 1
    #         count_above_max += 1
    #     if sentlen < min_thresh:
    #         count_below_min += 1


    return picked_sentences
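A hypothetical driver for this variant. The file name pg_book.txt, the blank-line split used to build lines, and the threshold values are illustrative assumptions; example #5 shows the project actually splits text with sentencesplitter.splitter.

import io

# Read a Project Gutenberg text (file name is a placeholder).
with io.open('pg_book.txt', encoding='utf-8') as f:
    raw = f.read()

# Crude stand-in for a real sentence splitter: split on blank lines.
lines = [chunk for chunk in raw.split(u'\r\n\r\n') if chunk.strip()]

scores = sentence_picker(min_thresh=4, max_thresh=20, uncommon_thresh=2,
                         lines=lines, newlinechar='\r\n')

# Print the ten highest scoring sentences.
for sentence, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print score, sentence.encode('utf-8')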
Example #5
def sentence_picker(text, lang, min_thresh, max_thresh, uncommon_thresh, newlinechar, weight_iwf=1, weight_common=1, weight_firstchar=1, weight_len=1):
    '''Return the useful sentences picked out of the text given as input,
    as a list of (sentence, SENTENCE) pairs sorted with dict_sort in
    descending order.

    Arguments min_thresh and max_thresh are the minimum and maximum
    length of sentences allowed.

    The weight_* keyword arguments weight the individual scoring
    functions (inverse word frequency, common words, uppercase first
    character, length).

    text is the raw document in unicode and lang is its language code.

    newlinechar is the character sequence the document uses for newlines.
    Project Gutenberg uses '\r\n'.
    '''

    import os
    print os.getcwd()
    print type(text)
    lines = sentencesplitter.splitter(text, lang)
    sent_dia = []
    picked_sentences = {}
    count = 0
    counttot = len(lines)
    count_above_max = 0
    count_below_min = 0
    count_dialogue = 0

    fdist = freqtoeba.fdist_loader(lang)
    if fdist == -1:
        stopwordpath = os.path.join(os.path.dirname(__file__), '..', '..', 'stopword', 'stop-words_english_1_en.txt')
        print 'sw path is: ', stopwordpath
        fdist = freqtoeba.fdist_gen('eng', 'sentences.csv', stopwordpath)
        # Cache the freshly built distribution so later runs can load it quickly.
        with open('fdist.pkl', 'wb') as f:
            pickle.dump(fdist, f)


    #We create the set initially here so that we don't have to create
    # it time and again.

    file_name = 'wikifiction.txt'
    newfilepath = os.path.join(os.path.dirname(__file__), file_name)
    topwordset = pop_word_set_gen(newfilepath)

    for i in range(0, len(lines)):

        #normalizing wrapped text
        lines[i] = utility.unwrapper(lines[i], newlinechar)




        #tokenizing each sentence into words which are lowercase
        sent_words = wordtokenizer.wordtokenizer('eng', lines[i])

        sentlen = len(sent_words)
        # print lines[i].encode('utf-8')
        # print 'h\n'


        sent_less_stop_words = utility.less_stopwords(sent_words)


        sentlen_less_stop_words = len(sent_less_stop_words)





        #TODO For the time being , later get better score using missing vocab

        score = 0

        score = score + weight_len * score_length(min_thresh, max_thresh, sentlen, equally=True)
        score = score + weight_firstchar * score_firstchar_upper(lines[i])

        countinfo = topwordcheck(sent_less_stop_words, topwordset)

        score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)

        score = score + weight_iwf * score_iwf(sent_less_stop_words, fdist, counttot, True)
        # print lines[i], score_iwf(sent_less_stop_words, fdist)

        totscore = score

        score_length_num = score_length(min_thresh, max_thresh, sentlen, equally=True)
        score_length_val = score_length_num, score_length_num*weight_len

        score_firstchar_upper_num = score_firstchar_upper(lines[i])
        score_firstchar_upper_val = score_firstchar_upper_num, score_firstchar_upper_num*weight_firstchar


        score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
        score_common_words_val = score_common_words_num, weight_common*score_common_words_num


        score_iwf_num = score_iwf(sent_less_stop_words, fdist, counttot, True)
        score_iwf_val =  score_iwf_num, weight_iwf*score_iwf_num

        picked_sentences[lines[i]] = SENTENCE(
            text=lines[i],
            score_iwf = score_iwf_val,
            score_length = score_length_val,
            score_common_words = score_common_words_val,
            score_firstchar_upper = score_firstchar_upper_val,
            score = totscore
            )




        if sentlen > max_thresh:
            dialogues = re.findall(r'"(.*?)"', lines[i])
            for dialogue in dialogues:
                score = 0
                dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
                dialen = len(dia_words)
                dia_less_stop_words = utility.less_stopwords(dia_words)
                dialen_less_stop_words = len(dia_less_stop_words)
                countinfo = topwordcheck(dia_less_stop_words, topwordset)

                score = score + weight_len * score_length(min_thresh, max_thresh, dialen, equally=True)
                score = score + weight_firstchar * score_firstchar_upper(dialogue)
                score = score + weight_common * score_common_words(uncommon_thresh, countinfo, equally=True)
                score = score + weight_iwf * score_iwf(dia_less_stop_words, fdist, counttot, True)
                # print dialogue, score_iwf(dia_less_stop_words, fdist)

                totscore = score

                score_length_num = score_length(min_thresh, max_thresh, dialen, equally=True)
                score_length_val = score_length_num, weight_len * score_length_num

                score_firstchar_upper_num = score_firstchar_upper(dialogue)
                score_firstchar_upper_val = score_firstchar_upper_num, score_firstchar_upper_num * weight_firstchar


                score_common_words_num = score_common_words(uncommon_thresh, countinfo, equally=True)
                score_common_words_val = score_common_words_num, weight_common * score_common_words_num


                score_iwf_num =  score_iwf(dia_less_stop_words, fdist, counttot, True)
                score_iwf_val =  score_iwf_num, weight_iwf * score_iwf_num

                picked_sentences[dialogue] = SENTENCE(
                    text=dialogue,
                    score_iwf = score_iwf_val,
                    score_length = score_length_val,
                    score_common_words = score_common_words_val,
                    score_firstchar_upper = score_firstchar_upper_val,
                    score = totscore
                    )




    #     if sentlen >= min_thresh and sentlen <= max_thresh:
    #         count = count + 1
    #         # print lines[i]

    #         num_common_words = topwordcheck(sent_less_stop_words, topwordset)[1]
    #         if num_common_words < sentlen_less_stop_words-1:
    #             continue
    #         if not is_firstchar_upper(lines[i]):
    #             continue

    #         picked_sentences[punc_strippers(lines[i])] = score
    #         # print '\n'

    #     if sentlen > max_thresh:
    #         dialogues = re.findall(r'"(.*?)"', lines[i])
    #         # print dialogues
    #         for dialogue in dialogues:
    #             dia_words = wordtokenizer.wordtokenizer('eng', dialogue)
    #             wrdcnt = len(dia_words)
    #             dia_less_stop_words = utility.less_stopwords(dia_words)
    #             dialen_less_stop_words = len(dia_less_stop_words)
    #             score = dialen_less_stop_words
    #             if wrdcnt >= 4:
    #                 # print dialogue.encode('utf-8')

    #                 num_common_words = topwordcheck(dia_less_stop_words, topwordset)[1]
    #                 if num_common_words < dialen_less_stop_words-1:
    #                     continue
    #                 if not is_firstchar_upper(dialogue):
    #                     continue

    #                 sent_dia.append(dialogue)
    #                 picked_sentences[dialogue] = score

    #                 count_dialogue += 1
    #         count_above_max += 1
    #     if sentlen < min_thresh:
    #         count_below_min += 1


    sorted_picked = sorted(picked_sentences.iteritems(), key=dict_sort, reverse=True)
    print sorted_picked[-1]
    return sorted_picked
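A similar sketch for this text-based variant. SENTENCE and dict_sort are defined elsewhere in the module; the file name, language code, and threshold values are illustrative assumptions, and the weights are left at their defaults.

import io

# Read a Project Gutenberg text (file name is a placeholder).
with io.open('pg_book.txt', encoding='utf-8') as f:
    raw = f.read()

ranked = sentence_picker(raw, 'eng', min_thresh=4, max_thresh=20,
                         uncommon_thresh=2, newlinechar='\r\n')

# Each entry pairs the sentence text with its SENTENCE score record
# (assumed to be a namedtuple, since it is built with keyword arguments).
for sentence, info in ranked[:10]:
    print info.score, sentence.encode('utf-8')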