def phrase_ext():
    phrase_list = []
    for i in range(length_dataset):
        # Build word alignments by looking up each English word's
        # translation in final_dict and locating it in the French sentence
        alignment_list = []
        list_fr = sentences_fr[i].split()
        list_en = sentences_en[i].split()
        for j, en_word in enumerate(list_en):
            word = final_dict[en_word]
            for k, fr_word in enumerate(list_fr):
                if fr_word == word:
                    alignment_list.append((j, k))

        phrases = phrase_extraction(sentences_en[i], sentences_fr[i],
                                    alignment_list)
        for phrase in sorted(phrases):
            phrase_list.append(list(phrase))

    return phrase_list
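For orientation, here is a minimal, self-contained sketch of calling NLTK's phrase_extraction directly, using the worked sentence pair from the function's own documentation:

from nltk.translate.phrase_based import phrase_extraction

# Worked example from the NLTK documentation
srctext = "michael assumes that he will stay in the house"
trgtext = "michael geht davon aus , dass er im haus bleibt"
alignment = [(0, 0), (1, 1), (1, 2), (1, 3), (2, 5), (3, 6), (4, 9),
             (5, 9), (6, 7), (7, 7), (8, 8)]

# Each pair is ((e_start, e_end), (f_start, f_end), src_phrase, trg_phrase)
phrases = phrase_extraction(srctext, trgtext, alignment)
for phrase in sorted(phrases):
    print(phrase)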
Example #2
def phrase_bases_extraction(filename, foreign):
    corpus = load(filename)
    srctext = [pair[foreign] for pair in corpus]
    trgtext = [pair['en'] for pair in corpus]
    # word alignments were precomputed and pickled per language
    with open(foreign + '.pickle', 'rb') as infile:
        aligned = pickle.load(infile)
    phrase_list = []
    for i in range(len(srctext)):
        phrases = pb.phrase_extraction(srctext[i], trgtext[i], aligned[i])
        phrase_list.append(phrases)

    # score(fr, en) = count(fr, en) / count(fr), counted over whole sentences
    ranks = {}
    for phrases in phrase_list:
        for phrase in phrases:
            count_num = 0
            count_den = 0
            fr = phrase[2]
            eng = phrase[3]
            for pair in corpus:
                if fr in pair[foreign]:
                    count_den += 1
                    if eng in pair['en']:
                        count_num += 1
            ranks[(fr, eng)] = count_num / count_den
    # print phrase pairs in descending order of score
    sorted_x = sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)
    pprint(sorted_x)
Example #3
def build_phrases(bitext):

    print("--- Building phrases")
    phrases = []

    for b in bitext:
        bitext_words = ' '.join(word for word in b.words if word != '')
        bitext_mots = ' '.join(mot for mot in b.mots if mot != '')
        phrase = phrase_based.phrase_extraction(bitext_words, bitext_mots, b.alignment, 2)
        phrases.append(phrase)

    return phrases
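The fourth positional argument above is NLTK's max_phrase_length, which caps the length of extracted phrases (0, the default, means no cap beyond sentence length). A small sketch of the effect; the expected counts in the comments are my own working, not from the original:

from nltk.translate.phrase_based import phrase_extraction

src = "small house"
trg = "kleines haus"
align = [(0, 0), (1, 1)]

# Default: includes the two-word pair ('small house', 'kleines haus')
print(len(phrase_extraction(src, trg, align)))     # expected: 3
# Capped at 1: only the single-word phrase pairs survive
print(len(phrase_extraction(src, trg, align, 1)))  # expected: 2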
Example #4
def task_3(parallel_corpus, phrase_extraction_corpus_en,
           phrase_extraction_corpus_fr, alignments_pred):
    """
    Task 3: Utility for calculating phrase based extraction scoring
    :param parallel_corpus: Processed Bitext
    :param phrase_extraction_corpus_en
    :param phrase_extraction_corpus_fr
    :param alignments_pred: Alignment list computed in task 1
    :return: execution time
    """

    start = time.process_time()
    print("Phrase Extraction")

    en_fr_phrases = []
    fr_phrases = []
    for i in range(len(phrase_extraction_corpus_en)):
        phrases = phrase_based.phrase_extraction(
            phrase_extraction_corpus_en[i], phrase_extraction_corpus_fr[i],
            alignments_pred[phrase_extraction_corpus_en[i]])
        for _, _, e_ph, f_ph in sorted(phrases):
            en_fr_phrases.append((e_ph, f_ph))
            fr_phrases.append(f_ph)

    # Score each (e, f) pair by relative frequency: count(e, f) / count(f)
    en_fr_phrases_count = Counter(en_fr_phrases)
    fr_phrases_count = Counter(fr_phrases)
    result = []
    for e, f in en_fr_phrases:
        result.append(
            ((en_fr_phrases_count[(e, f)] / fr_phrases_count[f]), (e, f)))

    for i in reversed(sorted(set(result))):
        print(i)

    end = time.process_time()
    exec_time = str(end - start)

    return exec_time
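The relative-frequency score used here is count(e, f) / count(f). A tiny self-contained illustration with made-up phrase pairs (all data below is hypothetical):

from collections import Counter

pairs = [("the house", "la maison"), ("the house", "la maison"),
         ("the house", "maison"), ("house", "maison")]
pair_counts = Counter(pairs)
f_counts = Counter(f for _, f in pairs)

for (e, f), c in pair_counts.items():
    # e.g. ("the house", "la maison") -> 2 / 2 = 1.0
    print(e, "|", f, "->", c / f_counts[f])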
Example #5
    def extract_phrase_table(self, bisent, alignment):
        PT = []
        srctext = bisent[0].tokenized_sentence()
        trgtext = bisent[1].tokenized_sentence()
        phrase_pairs = phrase_extraction(srctext, trgtext, alignment)
        # apply each side's sentence offset to the extracted phrase spans
        for p in phrase_pairs:
            PT.append((self.apply_offset(p[0], bisent[0].offset),
                       self.apply_offset(p[1], bisent[1].offset)))
        return PT
Example #6
    # examples=get_examples(settings)
    # ibm,corpus=use_IBM1(corpus,settings)
    # current_probs,corpus=drive(corpus)
    # foreign_eng maps (source phrase, target phrase) pairs to their counts
    # english maps source phrases to their counts
    with open(examples, 'r', encoding='utf-8') as f:
        corpus = json.load(f)
    prob_table, aligned_obj = drive(corpus)
    foreign_eng = {}
    english = {}
    for i in range(len(aligned_obj)):
        print(corpus[i][native_lang])
        print(corpus[i][foreign_lang])
        phrases = phrase_based.phrase_extraction(corpus[i][native_lang],
                                                 corpus[i][foreign_lang],
                                                 aligned_obj[i][2])
        # use a separate loop variable so the outer index i is not shadowed
        for phrase in sorted(phrases):
            t = (phrase[2], phrase[3])
            if t not in foreign_eng:  # first occurrence of this phrase pair
                foreign_eng[t] = 1
            else:  # phrase pair already seen, increase its count
                foreign_eng[t] += 1
            if phrase[2] not in english:  # first occurrence of the source phrase
                english[phrase[2]] = 1
            else:  # source phrase already seen, increase its count
                english[phrase[2]] += 1

    # phrase scores are given by the formula score(f, e) = count(f, e) / count(f)
    scores = {}
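The snippet is cut off before scores is filled in; a minimal sketch of the missing step, following the formula in the comment above and assuming the foreign_eng and english dictionaries built earlier:

    # score(f, e) = count(f, e) / count(f); pair[0] is the source phrase
    for pair, pair_count in foreign_eng.items():
        scores[pair] = pair_count / english[pair[0]]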
Example #7
def main():
    start = time.time()
    #parsing the json file
    with open(FILE, 'r') as f:
        corpus = json.load(f)
    Model1_table, aligned1 = IBM_Model_1(corpus)

    alignments_of_1 = []
    words_of_1 = []
    mots_of_1 = []
    #storing the information from IBM Model 1 in lists
    for test in aligned1:
        alignments_of_1.append(test.alignment)
        words_of_1.append(test.words)
        mots_of_1.append(test.mots)
    print("")
    c = 0
    #traversing the corpus
    for x in corpus:
        srctext = x[SOURCE_LANGUAGE]
        destext = x[DESTINATION_LANGUAGE]
        align = alignments_of_1[c]
        print("Source sentence:")
        print(words_of_1[c])
        print("Destination sentence:")
        print(mots_of_1[c])
        print("Alignment:")
        print(align)
        print("")
        c = c + 1
        sorted_phrase_score = []
        #calling the library function to extract phrases
        phrases = phrase_extraction(srctext, destext, align)
        for i in phrases:
            SOURCE_phrase = i[2]
            DESTINATION_phrase = i[3]
            count_numerator = 0.0
            count_denominator = 0.0
            for y in corpus:
                #counting sentences that contain both phrases
                if (SOURCE_phrase in y[SOURCE_LANGUAGE]
                        and DESTINATION_phrase in y[DESTINATION_LANGUAGE]):
                    count_numerator += 1
                #counting sentences that contain the source phrase
                if SOURCE_phrase in y[SOURCE_LANGUAGE]:
                    count_denominator += 1
            #calculating the phrase score: count(src, dst) / count(src)
            phrase_score = count_numerator / count_denominator
            #adding the phrase score to a list
            sorted_phrase_score.append(
                (phrase_score, SOURCE_phrase, DESTINATION_phrase))
        #printing the output in descending order of the phrase score
        for values in sorted(sorted_phrase_score, reverse=True):
            print("Source phrase:")
            print(values[1])
            print("Destination phrase:")
            print(values[2])
            print("Phrase Score:")
            print(values[0])
            print("")

    #printing runtime
    print("")
    print("Time:")
    print(time.time() - start)
Example #8
from nltk.translate import phrase_based
from part2 import use_IBM1, get_examples, get_data
import pprint
# from part1 import drive
if __name__ == '__main__':
    corpus, source_set, target_set, settings = get_data()
    examples = get_examples(settings)
    ibm, corpus = use_IBM1(corpus, settings)

    # foreign_eng maps (source phrase, target phrase) pairs to their counts
    # english maps source phrases to their counts
    foreign_eng = {}
    english = {}
    for i in range(len(examples)):
        phrases = phrase_based.phrase_extraction(
            examples[i][settings['source']], examples[i][settings['target']],
            corpus[i].alignment)
        # use a separate loop variable so the outer index i is not shadowed
        for phrase in sorted(phrases):
            # tuple containing the source phrase and its translation
            t = (phrase[2], phrase[3])
            if t not in foreign_eng:  # first occurrence of this phrase pair
                foreign_eng[t] = 1
            else:  # phrase pair already seen, increase its count
                foreign_eng[t] += 1
            if phrase[2] not in english:  # first occurrence of the source phrase
                english[phrase[2]] = 1
            else:  # source phrase already seen, increase its count
                english[phrase[2]] += 1

    # phrase scores are given by the formula score(t, s) = count(t, s) / count(s)
    scores = {}
Example #9
    for item in tuple(bitext[i].alignment):
        # only keep word pairings where neither of the words is None
        if None not in item:
            newAlignment.append(item)
    bitext[i].alignment = Alignment(newAlignment)


all_phrases = []
for pair in bitext:
    srctext = ' '.join(word for word in pair.words)
    trgtext = ' '.join(word for word in pair.mots)
    alignment = tuple(pair.alignment)

    phrases = phrase_extraction(srctext, trgtext, alignment)
    for phrase in phrases:
        all_phrases.append(phrase)

# build a dict mapping each English phrase to its Spanish translations
phrase_occ = {}
for row in all_phrases:
    src = row[2]
    trg = row[3]
    if src not in phrase_occ:
        phrase_occ[src] = {trg: 1}
    elif trg not in phrase_occ[src]:
        phrase_occ[src][trg] = 1
    else:
        phrase_occ[src][trg] += 1  # count repeated (src, trg) pairs
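Not part of the original snippet, but a natural follow-up once phrase_occ is built is picking the most frequent translation for each source phrase:

# hypothetical follow-up: report each source phrase's most frequent translation
for src, translations in phrase_occ.items():
    best = max(translations, key=translations.get)
    print(src, "->", best, "(seen", translations[best], "times)")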
Example #10
    # Construct the source and target texts
    srctext = ' '.join(e)
    trgtext = ' '.join(f)

    #Obtain phrase tuples from phrase_extraction module
    phrases = phrase_extraction(srctext, trgtext, align_ibm)
    for phrase in sorted(phrases):
        en_phrase = phrase[2]  #English phrase
        fr_phrase = phrase[3]  #French phrase
        #Increment count of the French phrase
        if (fr_phrase not in count_fr_phrase):
            count_fr_phrase[fr_phrase] = 1
        else:
            count_fr_phrase[fr_phrase] += 1

        #Increment count of the pair of English phrase, French phrase
        if ((en_phrase, fr_phrase) not in count_en_fr_phrase):
            count_en_fr_phrase[(en_phrase, fr_phrase)] = 1
        else:
            count_en_fr_phrase[(en_phrase, fr_phrase)] += 1
Example #11
    A_text.append(AlignedSent(l_4, l_3))
ib1 = i1.IBMModel1(A_text, 5)

# keep the full (e, f) index pairs from each alignment;
# phrase_extraction expects pairs, not bare indices
l_align = []
for sent in A_text:
    l_align.append([tuple(pair) for pair in sent.alignment])

x = 0
loaded_dictionary_it = {}
myDict = {}
for i in range(len(english)):
    phrases = pb.phrase_extraction(english[i], German[i], l_align[i])
    for j in sorted(phrases):
        # key: (source phrase, target phrase, source phrase length)
        k = (j[2], j[3], len(j[2].split()))
        l = j[3]
        if k in loaded_dictionary_it:
            loaded_dictionary_it[k] += 1
        else:
            loaded_dictionary_it[k] = 1
            x += 1
        if l in myDict:
            myDict[l] += 1
        else:
            myDict[l] = 1

# normalise each pair count by its target-phrase count
for k in loaded_dictionary_it:
    loaded_dictionary_it[k] /= myDict[k[1]]
Example #12
def main():
    '''
        This is the core logic of our program.
    '''
    test_corpus = True
    custom_corpus = False

    cwd = getcwd()
    # read data from the given dataset
    with open(cwd + '\\data1.json') as f:
        json_data1 = f.read()
    with open(cwd + '\\data2.json') as f:
        json_data2 = f.read()
    with open(cwd + '\\Alternative Corpus\\parallel.json') as f:
        json_data3 = f.read()
    # data is in JSON format and hence needs to be parsed
    data1 = json.loads(json_data1)
    data2 = json.loads(json_data2)
    data3 = json.loads(json_data3)
    # create an aligned corpus for phrase extraction
    bitext = []
    for sentence in data1:
        bitext.append(
            AlignedSent(word_tokenize(sentence['fr'], language='french'),
                        word_tokenize(sentence['en'], language='english')))

    # run the model (model 1)  for 10 iterations
    model1 = IBMModel1(bitext, 10)

    # print(bitext, '\n\n')

    # get the word translation table
    translate_table_1 = model1.translation_table

    # extract alignments and show them
    alignments_extracted_1 = []
    for temp in bitext:
        alignments_extracted_1.append(temp.alignment)
    print(alignments_extracted_1)

    # Similarly run the model2 and print the results
    bitext = []
    for sentence in data1:
        bitext.append(
            AlignedSent(word_tokenize(sentence['fr'], language='french'),
                        word_tokenize(sentence['en'], language='english')))
    model2 = IBMModel2(bitext, 10)
    translate_table_2 = model2.translation_table

    alignments_extracted_2 = []
    for temp in bitext:
        alignments_extracted_2.append(temp.alignment)
    # print('\n\n',alignments_extracted_2,'\n\n')
    print('finished\n')

    # if this is true, run the phrase translation model for data2.json
    if test_corpus:
        bitext_test = []
        # get the parallel sentences here
        for sentence in data2:
            bitext_test.append(
                AlignedSent(word_tokenize(sentence['fr'], language='french'),
                            word_tokenize(sentence['en'], language='english')))

        # model used to extract phrases (IBM Model 1):
        test_model = IBMModel1(bitext_test, 10)
        alignments_test = bitext_test[0].alignment

        # print(bitext_test[0], bitext_test[0].alignment)
        phrases = phrase_extraction(data2[0]['fr'], data2[0]['en'],
                                    alignments_test)
        for i in phrases:
            print(i, '\n\n')
        '''
        
        CHANDRAHAS ADD DESCRIPTION HERE. OPTIONALLY, SAVE THE TRANSLATIONS
        
        '''

        countef = defaultdict()
        countf = defaultdict()

        for sent in range(len(data2)):
            # use each sentence's own alignment, not just the first one's
            phrases = phrase_extraction(data2[sent]['fr'], data2[sent]['en'],
                                        bitext_test[sent].alignment)
            for phrase in phrases:
                pair = (phrase[2], phrase[3])
                print(pair)
                if pair not in countef:
                    countef[pair] = 1
                else:
                    countef[pair] = countef[pair] + 1

        for word in countef:
            fword = word[0]
            if fword not in countf:
                countf[fword] = countef[word]
            else:
                countf[fword] = countf[fword] + countef[word]
        print('ranks: \n\n')

        final = defaultdict(dict)

        for word in countef:
            val = countef[word] / countf[word[0]]
            # print(word,"    ", val)
            final[word[0]][word[1]] = val

        # print(final)

        for entity in final:
            # print(entity)
            current = final[entity]
            print(entity)
            # print(current)
            d_descending = sorted(current.items(),
                                  key=lambda kv: kv[1],
                                  reverse=True)
            for i in d_descending:
                print(i)
            print("\n")

        # print(final)

    # print("final")
    # for word in countef:
    # print(word,"    ",countef[word])

    # if this is set as true, do the same for the dataset that we generated.
    if custom_corpus:
        bitext_test = []
        for sentence in data3:
            bitext_test.append(
                AlignedSent(word_tokenize(sentence['gr'], language='german'),
                            word_tokenize(sentence['en'], language='english')))

        # test model for extracting phrases, MODEL 1:
        test_model = IBMModel1(bitext_test, 10)
        alignments_test = bitext_test[0].alignment
        '''
        
        CHANDRAHAS ADD DESCRIPTION HERE
        
        '''
        countef = defaultdict()
        countf = defaultdict()

        for sent in range(len(data3)):
            # use each sentence's own alignment, not just the first one's
            phrases = phrase_extraction(data3[sent]['gr'], data3[sent]['en'],
                                        bitext_test[sent].alignment)
            for phrase in phrases:
                pair = (phrase[2], phrase[3])
                # print(pair)
                if pair not in countef:
                    countef[pair] = 1
                else:
                    countef[pair] = countef[pair] + 1

        for word in countef:
            fword = word[0]
            if fword not in countf:
                countf[fword] = countef[word]
            else:
                countf[fword] = countf[fword] + countef[word]
        print('ranks: \n\n')

        final = defaultdict(dict)

        for word in countef:
            val = countef[word] / countf[word[0]]
            # print(word,"    ", val)
            final[word[0]][word[1]] = val

        # print(final)

        for entity in final:
            # print(entity)
            current = final[entity]
            print(entity)
            # print(current)
            d_descending = sorted(current.items(),
                                  key=lambda kv: kv[1],
                                  reverse=True)
            for i in d_descending:
                print(i)
            print("\n")

        # print(final)

    print("final")
    for word in countef:
        # print(word, "    ", countef[word])

        # for i in phrases:
        #     for j in phrases:
        #         for word in i:
        for sent in range(len(data3)):
            phrases = phrase_extraction(data3[sent]['gr'], data3[sent]['en'],
                                        alignments_test)
            for phrase in phrases:
                pair = (phrase[2], phrase[3])
                # print(pair)
                if pair not in countef:
                    countef[pair] = 1
                else:
                    countef[pair] = countef[pair] + 1

        for word in countef:
            fword = word[0]
            if fword not in countf:
                countf[fword] = countef[word]
            else:
                countf[fword] = countf[fword] + countef[word]
        print('ranks: \n\n')

        final = defaultdict(dict)

        for word in countef:
            val = countef[word] / countf[word[0]]
            # print(word,"    ", val)
            final[word[0]][word[1]] = val

    print(final)

    for entity in final:
        # print(entity)
        current = final[entity]
        print(entity)
        # print(current)
        d_descending = sorted(current.items(),
                              key=lambda kv: kv[1],
                              reverse=True)
        for i in d_descending:
            print(i)
        print("\n")