Exemplo n.º 1
0
print("\nPOS tags")
print(pos_tags)

listchunked = strchunked.split()
# print(len(listchunked))
# print(len(pos_tags))
print("\n\n")
ph = ""
sentence = []

k = 0
while k < len(listchunked):
    if listchunked[k + 1] == '<S-NP>' or listchunked[
            k + 1] == '<S-VP>' or listchunked[k + 1] == '<S-PP>':
        ph = listchunked[k]
        sentence.append([(ph), (listchunked[k + 1][-3:-1])])
        ph = ""
        k += 2
        # print("S")
    elif listchunked[k + 1] == '<B-NP>':
        # ph = ph + listchunked[k]
        while listchunked[k + 1] != '<E-NP>':
            ph = ph.strip() + " " + listchunked[k]
            k += 2
        ph = ph.strip() + " " + listchunked[k]
        sentence.append([(ph), (listchunked[k + 1][-3:-1])])
        ph = ""
        k += 2
        # print("BNP")
    elif listchunked[k + 1] == '<B-VP>':
        while listchunked[k + 1] != '<E-VP>':
def getTagged(ex, show, main_triplet):
    """Chunk ``ex``, annotate every chunk word with its POS tag and return
    the last multi-word chunk.

    Steps:
      1. Run the chunk tagger on ``ex`` and turn its tagged-string output
         (``word <TAG> word <TAG> ...``) into ``[text, label]`` chunks.
      2. Align the chunk words with ``nltk.pos_tag`` output and rewrite each
         chunk as space-separated ``word^TAG`` tokens.  Words tagged ``RB``,
         ``DT``, ``.`` or `` `` `` are dropped and a chunk is split at commas.
      3. Glue a possessive marker (``'s`` / ``'``) onto the preceding word of
         the same chunk and drop every chunk that contains a token whose POS
         tag starts with ``W``.
      4. For every chunk with more than one token, build ``(word, tag)`` pairs
         and the plain chunk text, and print diagnostics.

    NOTE(review): relies on the names ``Sentence``, ``tagger``, ``nltk`` and
    ``re`` from the enclosing module; none of them is defined in this block.

    Args:
        ex: The sentence text to analyse.
        show: When truthy, print the chunk list produced by step 3.
        main_triplet: Iterable of triplets (each an iterable of strings).  It
            is only used for the diagnostic printout in step 4.

    Returns:
        ``(list_fin, sen)`` for the LAST chunk that has more than one token:
        ``list_fin`` is a list of ``(word, tag)`` tuples and ``sen`` the
        chunk's words joined by single spaces.  ``([], "")`` when no chunk
        has more than one token.
    """

    def is_tag(token):
        # Chunk labels in the tagged string look like "<B-NP>".
        return token[0] == '<' and token[-1] == '>'

    tagged = Sentence(ex)
    tagger.predict(tagged)
    listchunked = tagged.to_tagged_string().split()

    pos_tags = nltk.pos_tag(nltk.word_tokenize(ex))

    # ---- Step 1: tagged string -> [text, label] chunks -------------------
    single_tags = ('<S-NP>', '<S-VP>', '<S-PP>')
    multi_tags = {'<B-NP>': ('<E-NP>', 'NP'), '<B-VP>': ('<E-VP>', 'VP')}

    chunks = []
    k = 0
    while k + 1 < len(listchunked):
        following = listchunked[k + 1]
        if following in single_tags:
            # "<S-NP>"[-3:-1] == "NP": the label is embedded in the tag.
            chunks.append([listchunked[k], following[-3:-1]])
            k += 2
        elif following in multi_tags:
            end_tag, label = multi_tags[following]
            ph = ""
            # Collect words (skipping the interleaved tags) up to the word
            # that is followed by the end tag, or to the end of the input.
            while k + 1 < len(listchunked) and listchunked[k + 1] != end_tag:
                if not is_tag(listchunked[k]):
                    ph = ph.strip() + " " + listchunked[k]
                k += 1
            ph = ph.strip() + " " + listchunked[k]
            chunks.append([ph, label])
            k += 2
        elif not is_tag(following):
            # A word directly followed by another word carries no chunk tag
            # (happens with conjunctions); keep it as a 'CC' chunk.
            chunks.append([listchunked[k], 'CC'])
            k += 1
        else:
            # Any other tag: skip the word together with its tag.
            k += 2

    # ---- Step 2: attach POS tags to the chunk words ----------------------
    tagged_chunks = []
    m = 0  # read position in pos_tags, shared across chunks
    for text, label in chunks:
        ph = ""
        for x in nltk.word_tokenize(text):
            m_start = m
            while m < len(pos_tags):
                token, tag = pos_tags[m]
                if x.strip('.') == token.strip('.'):
                    if tag in ("RB", "DT", ".", "``"):
                        # Dropped word; the position is deliberately not
                        # advanced here.
                        break
                    if tag == ",":
                        # A comma closes the phrase collected so far.
                        if ph:
                            tagged_chunks.append([ph.strip(), label])
                            ph = ""
                    else:
                        ph = ph.strip() + " " + x + "^" + tag
                    m += 1
                    break
                m += 1
            # Word not found: rewind so the following words can still match.
            if m == len(pos_tags) and m_start != len(pos_tags):
                m = m_start
        if ph:
            tagged_chunks.append([ph.strip(), label])
        elif text[-1] != ',' and ' ' not in text:
            # Single word without a usable POS tag: fall back to NN.
            tagged_chunks.append([text + "^NN", label])

    sentence = tagged_chunks

    # ---- Step 3: merge possessives, drop chunks containing W* tags -------
    # Every token has the form word^TAG by construction in step 2, so both
    # regular expressions below always match.
    k = 0
    while k < len(sentence):
        ph = sentence[k][0].split()
        p = 0
        while p < len(ph):
            word = re.search(r'.*\^', ph[p]).group()[:-1]
            # p > 0: a marker that opens the chunk has no preceding word in
            # this chunk; ph[p - 1] would wrap around to the LAST token and
            # attach the marker to the wrong word.
            if word in ("'s", "'") and p > 0:
                previous_word = re.search(r'.*\^', ph[p - 1]).group()[:-1]
                merged = (' '.join(ph[:p - 1]) + " " + previous_word +
                          "'s^POS " + ' '.join(ph[p + 1:]))
                sentence[k] = (merged.strip(), sentence[k][1])
                ph = sentence[k][0].split()
                p -= 1  # continue from the merged token
            if re.search(r'\^.*', ph[p]).group()[1:][0] == 'W':
                # POS tag starting with 'W': remove the whole chunk and
                # re-examine the chunk that moves into slot k.
                sentence = sentence[:k] + sentence[k + 1:]
                k -= 1
                break
            p += 1
        k += 1

    if show:
        print("\n\n\tINTIAL CHUNK AND POS SENTENCE")
        for x in sentence:
            print(x)

    # ---- Step 4: (word, tag) pairs for multi-word chunks -----------------
    # Defaults keep the return statement valid when no chunk has more than
    # one token (previously this raised UnboundLocalError).
    list_fin = []
    sen = ""
    for x in sentence:
        tokens = x[0].split()
        if len(tokens) > 1:
            list_fin = []
            words = []
            for token in tokens:
                parts = token.split('^')
                words.append(parts[0])
                list_fin.append((parts[0], parts[1]))
            sen = " ".join(words).lstrip()
            print("SEN", sen)

            # Per triplet, collect its elements up to (not including) the
            # first one equal to the chunk text.
            triplet_sentence = []
            for tr in main_triplet:
                for t in tr:
                    if sen == t:
                        break
                    triplet_sentence.append(t)
            print(triplet_sentence)

            print("INPUT REQUIRED:", list_fin)

    return list_fin, sen
def getPhrases(ex, tagger):
    """Split ``ex`` into a flat list of ``[text, label]`` phrases.

    The sentence is normalised (surrounding whitespace, '.' and '!' removed,
    curly quotes replaced by straight ones), chunked with ``tagger`` and then
    post-processed: chunks are split at commas, consecutive PP chunks and
    consecutive NP chunks are merged, a leading "and " and any "it's " are
    removed from NP chunks, placeholder/empty chunks are dropped and one
    trailing 'CC' chunk is removed.

    NOTE(review): ``Sentence`` comes from the enclosing module and is not
    defined in this block; it must provide ``to_tagged_string()``.

    Args:
        ex: The sentence text.
        tagger: Object whose ``predict(sentence)`` annotates the sentence
            with chunk tags.

    Returns:
        A list of ``[text, label]`` lists with ``label`` one of ``'NP'``,
        ``'VP'``, ``'PP'`` or ``'CC'``.
    """
    ex = ex.strip().strip('.').strip('!').replace('‘', '\'').replace(
        '’', '\'').replace('“', '"').replace('”', '"')
    tagged = Sentence(ex)
    tagger.predict(tagged)

    chunks = _phrase_chunks_from_tokens(tagged.to_tagged_string().split())
    chunks = _split_phrase_chunks_on_commas(chunks)
    chunks = _merge_adjacent_phrase_chunks(chunks)

    for chunk in chunks:
        if chunk[1] == "NP":
            if chunk[0].startswith("and "):
                # Strip only the leading conjunction; replacing every
                # "and " would also damage text such as "band played".
                chunk[0] = chunk[0][len("and "):]
            if "it's " in chunk[0]:
                chunk[0] = chunk[0].replace("it's ", "")

    # Filter in a single pass so consecutive removable entries are all
    # dropped (delete-while-indexing skipped the element after a removal).
    chunks = [c for c in chunks if len(c[0]) != 0 and c[1] != "XX"]

    if chunks and chunks[-1][1] == "CC":
        chunks = chunks[:-1]
    return chunks


def _phrase_chunks_from_tokens(listchunked):
    """Turn the ``word <TAG> word <TAG> ...`` token list into chunks."""

    def is_tag(token):
        return token[0] == '<' and token[-1] == '>'

    # Single-word chunks; ADJP is reported as NP and PRT as PP.
    single_tags = {
        '<S-NP>': 'NP',
        '<S-VP>': 'VP',
        '<S-PP>': 'PP',
        '<S-ADJP>': 'NP',
        '<S-PRT>': 'PP',
    }
    # Multi-word chunks: begin tag -> (end tag, label).
    multi_tags = {
        '<B-NP>': ('<E-NP>', 'NP'),
        '<B-VP>': ('<E-VP>', 'VP'),
        '<B-ADJP>': ('<E-ADJP>', 'NP'),
    }

    chunks = []
    k = 0
    while k + 1 < len(listchunked):
        following = listchunked[k + 1]
        if following in single_tags:
            chunks.append([listchunked[k], single_tags[following]])
            k += 2
        elif following in multi_tags:
            end_tag, label = multi_tags[following]
            ph = ""
            # Collect words (skipping the interleaved tags) up to the word
            # that is followed by the end tag, or to the end of the input.
            while k + 1 < len(listchunked) and listchunked[k + 1] != end_tag:
                if not is_tag(listchunked[k]):
                    ph = ph.strip() + " " + listchunked[k]
                k += 1
            ph = ph.strip() + " " + listchunked[k]
            chunks.append([ph, label])
            k += 2
        elif not is_tag(following):
            # A word directly followed by another word carries no chunk tag
            # (happens with conjunctions); keep it as a 'CC' chunk.
            chunks.append([listchunked[k], 'CC'])
            k += 1
        else:
            # Unsupported tag: placeholder that getPhrases filters out.
            chunks.append(['REMOVE THIS', 'XX'])
            k += 2
    return chunks


def _split_phrase_chunks_on_commas(chunks):
    """Split chunks at commas, inserting a ``[",", "CC"]`` separator."""
    k = 0
    while k < len(chunks):
        while k < len(chunks) and ',' in chunks[k][0]:
            text, label = chunks[k]
            index = text.find(',')
            # A comma between two ASCII digits ("1,000") belongs to a number
            # and ends comma handling for this chunk.  index > 0 prevents
            # text[index - 1] from wrapping around to the last character.
            if (index > 0 and '0' <= text[index - 1] <= '9'
                    and index + 1 < len(text)
                    and '0' <= text[index + 1] <= '9'):
                break
            chunks[k][0] = text[:index]
            remainder = text[index + 1:].strip()
            pieces = [[",", "CC"]]
            if remainder != "":
                pieces.append([remainder, label])
            chunks[k + 1:k + 1] = pieces
            # Skip the separator; the remainder (or the next chunk) is
            # examined by the next iteration of this inner loop.
            k += 2
        k += 1
    return chunks


def _merge_adjacent_phrase_chunks(chunks):
    """Merge PP+PP (e.g. "than in") and NP+NP / NP+"era" neighbours."""
    k = 0
    while k + 1 < len(chunks):
        current, following = chunks[k], chunks[k + 1]
        both_pp = current[1] == "PP" and following[1] == "PP"
        np_run = current[1] == "NP" and (following[1] == "NP"
                                         or following[0] == "era")
        if both_pp or np_run:
            # The merged text lives in the second chunk and keeps its label;
            # k is not advanced so the merged chunk can absorb further ones.
            following[0] = current[0] + " " + following[0]
            del chunks[k]
        else:
            k += 1
    return chunks