示例#1
0
def nucleus_find_cand(dict_aword, nucleus_threshold):
    """Collect the 't'-flagged occurrences of every shortshape that passes a threshold.

    For each shortshape, every pair of distinct windows is classified by
    comparing item 1 of their ``ana_useful.which_linkword`` results and item 2
    of their ``ana_useful.which_cand`` results. The shortshape is kept as soon
    as one of the four pair counts reaches its threshold.

    Args:
        dict_aword: Mapping of shortshape -> collection of windows. Each window
            is an iterable of occurrences, and item 2 of an occurrence is
            compared to ``'t'``.
        nucleus_threshold: Indexable read at positions 0-3, holding the minimum
            number of window pairs for, in order: same linkword / same CAND,
            same linkword / different CAND, different linkword / same CAND,
            different linkword / different CAND.

    Returns:
        A dict mapping each retained shortshape to the list of its occurrences
        whose item 2 equals ``'t'``, in window order. A shortshape with no such
        occurrence gets no entry.
    """
    from itertools import combinations

    dict_occ_cand = {}
    for shortshape, windows in dict_aword.items():
        # Extract the linkword and the CAND once per window instead of once
        # per pair of windows.
        described = [
            (window,
             ana_useful.which_linkword(window),
             ana_useful.which_cand(window))
            for window in windows
        ]

        same_link_same_cand = 0  # same schema word (linkword) and same CAND
        same_link_diff_cand = 0  # same schema word and different CANDs
        diff_link_same_cand = 0  # different schema word and same CAND
        diff_link_diff_cand = 0  # different schema word and different CAND

        # Each unordered pair is visited exactly once, so the counters are
        # pair counts and are compared to the thresholds without halving.
        for first, second in combinations(described, 2):
            window, linkword, cand = first
            window1, linkword1, cand1 = second
            # Windows that compare equal are not counted against each other.
            if window1 != window:
                same_link = linkword[1] == linkword1[1]
                same_cand = cand[2] == cand1[2]
                if same_link and same_cand:
                    same_link_same_cand += 1
                elif same_link:
                    same_link_diff_cand += 1
                elif same_cand:
                    diff_link_same_cand += 1
                else:
                    diff_link_diff_cand += 1

        # Explicit indexing with short-circuit `or`: a threshold is only read
        # when every previous test failed.
        reaches_threshold = (
            same_link_same_cand >= nucleus_threshold[0]
            or same_link_diff_cand >= nucleus_threshold[1]
            or diff_link_same_cand >= nucleus_threshold[2]
            or diff_link_diff_cand >= nucleus_threshold[3]
        )
        if reaches_threshold:
            for window in windows:
                for occurrence in window:
                    # NOTE(review): the meaning of the 't' flag is not visible
                    # in this block -- confirm against ana_useful.
                    if occurrence[2] == 't':
                        dict_occ_cand.setdefault(shortshape,
                                                 []).append(occurrence)
    return dict_occ_cand
示例#2
0
def nucleus_find_cand(dict_aword, nucleus_threshold):
    """Collect the 't'-flagged occurrences of every shortshape that passes a threshold.

    For each shortshape, every pair of distinct windows is classified by
    comparing item 1 of their ``ana_useful.which_linkword`` results and item 2
    of their ``ana_useful.which_cand`` results. The shortshape is kept as soon
    as one of the four pair counts reaches its threshold.

    Args:
        dict_aword: Mapping of shortshape -> collection of windows. Each window
            is an iterable of occurrences, and item 2 of an occurrence is
            compared to ``'t'``.
        nucleus_threshold: Indexable read at positions 0-3, holding the minimum
            number of window pairs for, in order: same linkword / same CAND,
            same linkword / different CAND, different linkword / same CAND,
            different linkword / different CAND.

    Returns:
        A dict mapping each retained shortshape to the list of its occurrences
        whose item 2 equals ``'t'``, in window order. A shortshape with no such
        occurrence gets no entry.
    """
    from itertools import combinations

    dict_occ_cand = {}
    for shortshape, windows in dict_aword.items():
        # Extract the linkword and the CAND once per window instead of once
        # per pair of windows.
        described = [
            (window,
             ana_useful.which_linkword(window),
             ana_useful.which_cand(window))
            for window in windows
        ]

        same_link_same_cand = 0  # same schema word (linkword) and same CAND
        same_link_diff_cand = 0  # same schema word and different CANDs
        diff_link_same_cand = 0  # different schema word and same CAND
        diff_link_diff_cand = 0  # different schema word and different CAND

        # Each unordered pair is visited exactly once, so the counters are
        # pair counts and are compared to the thresholds without halving.
        for first, second in combinations(described, 2):
            window, linkword, cand = first
            window1, linkword1, cand1 = second
            # Windows that compare equal are not counted against each other.
            if window1 != window:
                same_link = linkword[1] == linkword1[1]
                same_cand = cand[2] == cand1[2]
                if same_link and same_cand:
                    same_link_same_cand += 1
                elif same_link:
                    same_link_diff_cand += 1
                elif same_cand:
                    diff_link_same_cand += 1
                else:
                    diff_link_diff_cand += 1

        # Explicit indexing with short-circuit `or`: a threshold is only read
        # when every previous test failed.
        reaches_threshold = (
            same_link_same_cand >= nucleus_threshold[0]
            or same_link_diff_cand >= nucleus_threshold[1]
            or diff_link_same_cand >= nucleus_threshold[2]
            or diff_link_diff_cand >= nucleus_threshold[3]
        )
        if reaches_threshold:
            for window in windows:
                for occurrence in window:
                    # NOTE(review): the meaning of the 't' flag is not visible
                    # in this block -- confirm against ana_useful.
                    if occurrence[2] == 't':
                        dict_occ_cand.setdefault(shortshape, []).append(occurrence)
    return dict_occ_cand
示例#3
0
def expression_valid_windows(windows, candidate):
    """Select the windows from which an expression can be built.

    A window is considered only when ``ana_useful.exists_linkword`` accepts it
    and ``ana_useful.count_cand`` reports exactly two CANDs. Two shapes are
    then kept:

    * the last item is a CAND: the whole window is buffered, unless item 2 of
      that CAND is one of the words of ``candidate``; the buffer is filtered
      by ``not_expa_inside_expre`` before being returned;
    * the last item is not a CAND: the window is cut with
      ``ana_useful.cut_window(window, 2)`` and kept if the shortened window
      still contains a linkword.

    Args:
        windows: Iterable of windows; each window must support ``window[-1]``.
        candidate: String whose whitespace-separated words are excluded as a
            second CAND.

    Returns:
        A list holding the shortened windows first (in input order), followed
        by whatever ``not_expa_inside_expre`` returns for the buffered windows.
    """
    valid_windows = []
    buff = []
    # Loop-invariant: the words of the candidate do not depend on the window.
    candidate_words = candidate.split()

    for window in windows:
        # NOTE(review): `== True` is kept because the return type of
        # exists_linkword is not visible here -- confirm it returns a bool,
        # then drop the comparison.
        if not (ana_useful.exists_linkword(window) == True
                and ana_useful.count_cand(window) == 2):
            continue

        if ana_useful.is_cand(window[-1]):
            cand2 = ana_useful.which_cand([window[-1]])
            # Avoid building an expression such as "bâtiment de cet ensemble
            # de bâtiments" -> "batiment de bâtiment".
            if cand2[2] not in candidate_words:
                # Here the valid window has the shape
                # (CAND1 + "aword" + CAND2) with a schema word somewhere.
                # It goes to the buffer because we still need to know whether
                # the aword in the centre is the same 3 times or more (in that
                # case it is better to build an expansion first, then an
                # expression).
                buff.append(window)
        else:
            # Since there are 2 CANDs, the window is 3 words long and the last
            # word is not a CAND, the window had the shape
            # CAND + CAND + any word.
            short_window = ana_useful.cut_window(window, 2)
            if ana_useful.exists_linkword(short_window) == True:
                # Here the valid window has the shape (CAND1 + CAND2) with a
                # schema word between them.
                valid_windows.append(short_window)

    # Check that the aword in the centre is not the same 3 times or more (in
    # that case it is better to build an expansion first, then an expression).
    valid_windows.extend(not_expa_inside_expre(buff))
    return valid_windows
示例#4
0
def expression_valid_windows(windows, candidate):
    """Select the windows from which an expression can be built.

    A window is considered only when ``ana_useful.exists_linkword`` accepts it
    and ``ana_useful.count_cand`` reports exactly two CANDs. Two shapes are
    then kept:

    * the last item is a CAND: the whole window is buffered, unless item 2 of
      that CAND is one of the words of ``candidate``; the buffer is filtered
      by ``not_expa_inside_expre`` before being returned;
    * the last item is not a CAND: the window is cut with
      ``ana_useful.cut_window(window, 2)`` and kept if the shortened window
      still contains a linkword.

    Args:
        windows: Iterable of windows; each window must support ``window[-1]``.
        candidate: String whose whitespace-separated words are excluded as a
            second CAND.

    Returns:
        A list holding the shortened windows first (in input order), followed
        by whatever ``not_expa_inside_expre`` returns for the buffered windows.
    """
    valid_windows = []
    buff = []
    # Loop-invariant: the words of the candidate do not depend on the window.
    candidate_words = candidate.split()

    for window in windows:
        # NOTE(review): `== True` is kept because the return type of
        # exists_linkword is not visible here -- confirm it returns a bool,
        # then drop the comparison.
        if not (ana_useful.exists_linkword(window) == True
                and ana_useful.count_cand(window) == 2):
            continue

        if ana_useful.is_cand(window[-1]):
            cand2 = ana_useful.which_cand([window[-1]])
            # Avoid building an expression such as "bâtiment de cet ensemble
            # de bâtiments" -> "batiment de bâtiment".
            if cand2[2] not in candidate_words:
                # Here the valid window has the shape
                # (CAND1 + "aword" + CAND2) with a schema word somewhere.
                # It goes to the buffer because we still need to know whether
                # the aword in the centre is the same 3 times or more (in that
                # case it is better to build an expansion first, then an
                # expression).
                buff.append(window)
        else:
            # Since there are 2 CANDs, the window is 3 words long and the last
            # word is not a CAND, the window had the shape
            # CAND + CAND + any word.
            short_window = ana_useful.cut_window(window, 2)
            if ana_useful.exists_linkword(short_window) == True:
                # Here the valid window has the shape (CAND1 + CAND2) with a
                # schema word between them.
                valid_windows.append(short_window)

    # Check that the aword in the centre is not the same 3 times or more (in
    # that case it is better to build an expansion first, then an expression).
    valid_windows.extend(not_expa_inside_expre(buff))
    return valid_windows