Example #1
def labelTopK(topK_sentence_indexes, url, topK=10):
    '''
    Return the top-K sentences of the case at `url`.

    `topK_sentence_indexes` is a ranked list of sentence indices,
    most relevant first (e.g. the output of retTopK below).
    '''
    # Fetch and parse the case, then pull out its sentence list.
    analyse_url_new.get_details(url, False)
    sentence_list = analyse_url_new.get_sentences()

    # Keep the first `topK` ranked sentences, in rank order.
    return [sentence_list[idx] for idx in topK_sentence_indexes[:topK]]
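
A hedged usage sketch, wiring labelTopK to retTopK from Example #3 (the URL and K are illustrative): retTopK returns sentence indices ordered from most to least relevant, which labelTopK then resolves back into sentence text.

url = 'http://www.austlii.edu.au/cgi-bin/viewdoc/au/cases/nsw/NSWCATCD/2017/3.html'
ranked_indexes = retTopK(url=url)  # sentence indices, most relevant first
for sentence in labelTopK(ranked_indexes, url, topK=5):
    print(sentence)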
Example #2
def catchphrase_and_detail_extractor(link):
    '''
    Analyse `link` and return its catchphrase plus detail sentences
    as a single list:
    list: doc --> doc --> doc
           |
           |-[0] catchphrase
           |-[1] sentence 1 in detail
           |-[2] sentence 2 in detail
           |- ...
           |-[N] sentence N in detail

    list_lineInDoc=['STRATA APPEAL – admission of new evidence – works to be undertaken by Owners Corporation, maintenance or new works – amendment of motions.',
              'The property known as [***] Avenue, Neutral Bay (SP 30995) comprises some 8 units with a total of 12 balconies. There are 4 units at the rear of the property being units 5, 6, 7 and 8. There are 6 balconies on the western side at the rear of the building and another 6 on the front and sides of the building. The balustrades along the western side are of glass and aluminium construction and they face west and offer views. The balustrades on the other sides (being 6 in total) are of masonry/brick construction.',
              'The appellants, who were also the applicants for the adjudication which includes balconies which were the subject of the resolution purportedly passed on 31 March 2015.']
    '''
    # Fetch and parse the case, then assemble [catchphrase, sentence 1, ...].
    analyse_url_new.get_details(link)
    catchphrase = analyse_url_new.get_catchwords()
    list_lineInDoc = [catchphrase] + analyse_url_new.get_sentences()
    return list_lineInDoc
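
For reference, a minimal call sketch (the URL is illustrative): index 0 of the returned list is the catchphrase and the rest are the detail sentences.

lines = catchphrase_and_detail_extractor(
    'http://www.austlii.edu.au/cgi-bin/viewdoc/au/cases/nsw/NSWCATCD/2017/3.html')
catchphrase, body_sentences = lines[0], lines[1:]
print('Catchphrase:', catchphrase)
print('Body sentences:', len(body_sentences))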
Example #3
# Assumed context: in the original file these imports, plus the project
# helpers used below (analyse_url_new, predictBayesianModel,
# rank_sentences_mod_wordnet, MaxMinNormalization), sit at module level.
import numpy as np
import pylab as pl


def retTopK(
        url='http://www.austlii.edu.au/cgi-bin/viewdoc/au/cases/nsw/NSWCATCD/2017/3.html',
        log=False,
        showGraph=False):
    if log:
        print(
            "\nFunction to return the top K (K is defined by the user) most relevant sentences.\n"
        )
        print(
            "An alpha of 0.25 is used to give less weighting to the Bayesian Model due to its relative inaccuracy."
        )

    # Mixture weight for the Bayesian Model; kept below 0.5 because that
    # model is the less accurate of the two.
    alpha = 0.25

    # For offline testing, the first case can instead be read from
    # train_data.txt:
    # input_path = "./model/train_data.txt"
    # list_lineInDoc, pos = readOneCase(file_pos, train_data_path=input_path)

    # Get the catchwords and sentences of the case at `url`.
    if log:
        print('\nRetrieving case sentences...')

    if url:
        analyse_url_new.get_details(url)

    catchphrase = analyse_url_new.get_catchwords()
    sentences = analyse_url_new.get_sentences()

    # list_lineInDoc[0] is the catchphrase; list_lineInDoc[1:] are the
    # body sentences.
    list_lineInDoc = [catchphrase] + sentences

    print("************************************************")
    print("Bayesian Model Prediction")
    print("************************************************")
    print("")
    s1 = predictBayesianModel(list_lineInDoc)

    print("************************************************")
    print("Wordnet Model Prediction")
    print("************************************************")
    print("")
    s2 = rank_sentences_mod_wordnet(list_lineInDoc)

    # Min-max normalise both score lists onto [0, 1] so they can be mixed.
    s1_norm = []
    s2_norm = []

    for score in s1:
        s1_norm.append(MaxMinNormalization(score, max(s1), min(s1)))
    for score in s2:
        s2_norm.append(MaxMinNormalization(score, max(s2), min(s2)))

    print("************************************************")
    print("Scores for the Bayesian Model and Wordnet Model")
    print("************************************************")

    print('\nBayesian Model Score List:')
    print([float("%.2f" % elem) for elem in s1])
    print('\nWordnet Model Score List:')
    print([float("%.2f" % elem) for elem in s2])

    if log:
        print('\nBayesian Model Normalised Score List:')
        print([float("%.2f" % elem) for elem in s1_norm])
        print('\nWordnet Model Normalised Score List:')
        print([float("%.2f" % elem) for elem in s2_norm])

    # Combine the scores of the two methods with the alpha weighting,
    # then rank the combined scores.
    combined = [s1_norm[i] * alpha + s2_norm[i] * (1 - alpha)
                for i in range(len(s1_norm))]
    combined_np = np.array(combined)
    # argsort()[::-1].argsort() maps each sentence to its rank under a
    # descending sort: 0 for the highest combined score, 1 for the next.
    combined_sorted = combined_np.argsort()[::-1].argsort()
    if log:
        print(
            "\nRanks of Combined Bayesian and Wordnet Model (the i-th number is the 1-based rank of sentence i):"
        )
        print([int(1 + elem) for elem in list(combined_sorted)])

    # Invert the rank array so the output lists sentence numbers from most
    # to least important.
    combined_rank_ordered = combined_sorted.argsort()

    print(
        "\nFinal sentence rank (sentence # shown): [more important --> less important]"
    )
    print([int(1 + elem) for elem in list(combined_rank_ordered)])
    print()

    # Series for plotting: final_score recomputes the combined scores as a
    # NumPy array; the x axes are 1-based sentence numbers.
    final_score = np.array(combined)

    x = np.arange(1, len(final_score) + 1)
    y = final_score

    x1 = np.arange(1, len(s1_norm) + 1)
    y1 = s1_norm

    x2 = np.arange(1, len(s2_norm) + 1)
    y2 = s2_norm

    if showGraph:
        pl.plot(x, y, 'r', x1, y1, 'b', x2, y2, 'g')

        pl.xlabel('Sentence')
        pl.ylabel('Normalised Score')
        pl.title('Red = Final Score, Blue = Bayesian, Green = Wordnet')
        pl.show(block=False)
    return list(combined_rank_ordered)
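
MaxMinNormalization is defined elsewhere in the project; judging from its call sites above, it is presumably the standard min-max rescale. A minimal sketch under that assumption:

def MaxMinNormalization(x, x_max, x_min):
    # Rescale x into [0, 1]; guard against a constant score list,
    # where x_max == x_min would otherwise divide by zero.
    if x_max == x_min:
        return 0.0
    return (x - x_min) / (x_max - x_min)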
Example #4
# Assumed context: `numpy` and the project helpers used below are imported
# at module level in the original file; `separator_len` is a module-level
# constant there (width assumed here).
import numpy as np

separator_len = 48


def retTopK(
        url='http://www.austlii.edu.au/cgi-bin/viewdoc/au/cases/nsw/NSWCATCD/2017/9.html',
        log=False):

    # For offline testing, the first case can instead be read from
    # train_data.txt:
    # input_path = "./model/train_data.txt"
    # list_lineInDoc, pos = readOneCase(file_pos, train_data_path=input_path)

    # Fetch and parse the case at `url`, then get its catchwords and
    # sentences.
    analyse_url_new.get_details(url)
    catchphrase = analyse_url_new.get_catchwords()
    sentences = analyse_url_new.get_sentences()

    # Collapse newlines, tabs and runs of whitespace into single spaces so
    # the text is readable.
    catchphrase = " ".join(catchphrase.replace("\n", " ").split())
    for idx in range(len(sentences)):
        sentences[idx] = " ".join(sentences[idx].replace("\n", " ").split())

    # list_lineInDoc[0] is the catchphrase; list_lineInDoc[1:] are the
    # body sentences.
    list_lineInDoc = [catchphrase] + sentences

    print("#" * separator_len)
    print("Bayesian Model Prediction")
    print("#" * separator_len)
    print("")
    print("New law case:", url)
    print("")
    print("Extract catchwords and sentences.")

    print("Pre-processing law case.")
    print("Pre-processing catchwords and sentences.")
    print("")
    s1 = predictBayesianModel(list_lineInDoc, logPrint=log)

    print("\nScore list for each sentence:")
    print([float("%.2f" % elem) for elem in s1])
    print("")

    print("#" * separator_len)
    print("Wordnet Model Prediction")
    print("#" * separator_len)
    print("")
    print("New law case:", url)
    print("")
    print("Extract catchwords and sentences.")

    print("Pre-processing law case.")
    print("Pre-processing catchwords and sentences.")
    print("")
    s2 = rank_sentences_mod_wordnet(list_lineInDoc, logPrint=log)

    print("\nScore list for each sentence:")
    print([float("%.2f" % elem) for elem in s2])
    print("")

    print("")
    print("#" * separator_len)
    print("Mixture Model: alpha * Bayesian Model + (1-alpha) * Wordnet Model")
    print("#" * separator_len)

    # Min-max normalise both score lists onto [0, 1] so they can be mixed.
    s1_norm = []
    s2_norm = []

    for score in s1:
        s1_norm.append(MaxMinNormalization(score, max(s1), min(s1)))
    for score in s2:
        s2_norm.append(MaxMinNormalization(score, max(s2), min(s2)))

    if log:
        print(
            "\nFunction to return a ranked list of the most relevant sentences.\n"
        )
        print("\tAlpha = 0.25")
        print(
            "\tBayesian Model given less weighting due to its relative inaccuracy."
        )
        print()

    # Mixture weight for the Bayesian Model; kept below 0.5 because that
    # model is the less accurate of the two.
    alpha = 0.25

    if log:
        print("************************************************")
        print("Normalised Scores for Bayesian and Wordnet Model")
        print("************************************************")
        print('\nBayesian Model Result Normalised:')
        print([float("%.2f" % elem) for elem in s1_norm])
        print('\nWordnet Model Result Normalised:')
        print([float("%.2f" % elem) for elem in s2_norm])

    # Combine the scores of the two methods with the alpha weighting,
    # then rank the combined scores.
    combined = [s1_norm[i] * alpha + s2_norm[i] * (1 - alpha)
                for i in range(len(s1_norm))]
    combined_np = np.array(combined)
    # argsort()[::-1].argsort() maps each sentence to its rank under a
    # descending sort: 0 for the highest combined score, 1 for the next.
    combined_sorted = combined_np.argsort()[::-1].argsort()
    if log:
        print(
            "\nRanks of Combined Bayesian and Wordnet Model (the i-th number is the 1-based rank of sentence i):"
        )
        print([int(1 + elem) for elem in list(combined_sorted)])
        print()
    '''
    Previous method: combine by ranks instead of scores.

    # Sort the ranks for sentences so that the array stays ordered by
    # sentence #, and is populated by the rank of each sentence.
    s1_np = np.array(s1)
    s1_sorted = s1_np.argsort()[::-1].argsort()
    s2_np = np.array(s2)
    s2_sorted = s2_np.argsort()[::-1].argsort()
    if log:
        print("\nRanks of Bayesian Model (first # is sentence 0 etc):")
        print(list(s1_sorted))
        print("\nRanks of Wordnet Model (first # is sentence 0 etc):")
        print(list(s2_sorted))

    # Alpha-weighted combination of the two rank arrays.
    combined = list()
    for i in range(0, len(s1_sorted)):
        combined.append(s1_sorted[i] * alpha + s2_sorted[i] * (1 - alpha))

    # Rank the combined ranks (a lower combined value means more important).
    combined_np = np.array(combined)
    combined_sorted = combined_np.argsort().argsort()
    if log:
        print("\nFinal ranks - (first # is sentence 0 etc):")
        print(list(combined_sorted))
    '''
    # Finally, invert the rank array so the output lists sentence numbers
    # from most to least important.
    combined_rank_ordered = combined_sorted.argsort()
    if log:
        print("************************************************")
        print("Final ranks (sentence # shown):")
        print("************************************************")
        print([int(1 + elem) for elem in list(combined_rank_ordered)])
        print()

    return list(combined_rank_ordered)
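
The double-argsort idiom used in both versions of retTopK is compact but easy to misread; a small worked example with illustrative scores:

import numpy as np

scores = np.array([0.2, 0.9, 0.5])
ranks = scores.argsort()[::-1].argsort()  # [2, 0, 1]: sentence 1 gets rank 0 (best)
order = ranks.argsort()                   # [1, 2, 0]: sentence #s, best to worst
print(list(ranks), list(order))

The first argsort-reverse-argsort turns scores into descending ranks per sentence; the final argsort inverts that permutation, giving exactly the best-to-worst index list that retTopK returns.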