示例#1
0
def SVOS(corpus):
    """Extract subject-verb-object triples from every sentence in *corpus*.

    Each sentence is parsed with its last character dropped (``sent[:-1]``)
    -- presumably a trailing newline or full stop; confirm against the caller.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list with one entry per sentence for which ``findSVOs`` returned a
        non-empty result; each entry is that result as returned.
    """
    svos = []
    for sent in corpus:
        # Parse and extract once per sentence; the result is reused for both
        # the emptiness check and the append.
        found = findSVOs(nlp(sent[:-1]))
        if len(found) > 0:
            svos.append(found)
    return svos
示例#2
0
 def clever_translate(self, target_phrase):
     """Translate *target_phrase* one subject-verb-object triple at a time.

     The phrase is parsed with ``subject_verb_object_extract``.  Every triple
     found is turned into a ``{'(S)', '(V)', '(O)'}`` tree and handed to
     ``self.expand_phrase``.  Words of the phrase that occur in no triple are
     brute-force translated and appended at the end.  When no triple is found
     at all, the whole phrase is brute-force translated instead.
     """
     parsed = subject_verb_object_extract.nlp(target_phrase)
     triples = subject_verb_object_extract.findSVOs(parsed)
     vbprint(triples)

     # Nothing structured to work with: translate the phrase wholesale.
     if not triples:
         return self.brute_force_translate(target_phrase)

     slots = ('(S)', '(V)', '(O)')
     translated_parts = []
     consumed = []
     for triple in triples:
         for part in triple:
             consumed.extend(part.lower().split())
         tree = dict.fromkeys(slots, '')
         # Fill as many slots as the triple provides; missing ones stay ''.
         for slot, part in zip(slots, triple):
             tree[slot] = part.split()
         translated_parts.append(self.expand_phrase(tree))

     # Words not covered by any triple ride along at the end.
     leftovers = [
         word for word in target_phrase.split()
         if word.lower() not in consumed
     ]
     sentence = ' '.join(translated_parts)
     return sentence + ' ' + self.brute_force_translate(' '.join(leftovers))
示例#3
0
 def test_svo_14(self):
     """Check the triples extracted from a sentence with chained relative clauses."""
     tok = nlp("the boy raced the girl who had a hat that had spots.")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     expected = {
         ('the boy', 'raced', 'the girl'),
         ('who', 'had', 'a hat'),
         ('a hat', 'had', 'spots'),
     }
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#4
0
 def test_svo_11(self):
     """Check the triples extracted from a long causal sentence ending in an infinitive clause."""
     tok = nlp(
         "because he hit me and also made me so angry I wanted to kill him with a hammer."
     )
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     expected = {('he', 'hit', 'me'), ('I', 'kill', 'him')}
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#5
0
 def test_svo_1(self):
     """Check the triples for a subject that carries a relative clause."""
     tok = nlp("the annoying person that was my boyfriend hit me")
     svos = findSVOs(tok)
     printDeps(tok)  # just show what printDeps() does
     expected = {
         ('the annoying person', 'was', 'my boyfriend'),
         ('the annoying person', 'hit', 'me'),
     }
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#6
0
def NER_all(text):
    """Return the distinct named-entity strings found in *text*.

    Args:
        text: Raw text to run through the ``en_core_web_sm`` pipeline.

    Returns:
        list: The ``text`` of each entity in ``doc.ents`` with duplicates
        removed, in order of first appearance.
    """
    # Load the model on first use only and keep it on the function object,
    # so repeated calls do not reload it every time.
    nlp = getattr(NER_all, "_nlp", None)
    if nlp is None:
        nlp = NER_all._nlp = en_core_web_sm.load()
    doc = nlp(text)
    # dict.fromkeys de-duplicates while keeping a stable order, whereas
    # list(set(...)) can order the strings differently from run to run.
    return list(dict.fromkeys(ent.text for ent in doc.ents))
示例#7
0
 def test_svo_13(self):
     """Check that conjoined subjects and conjoined objects yield every pairing."""
     tok = nlp("he and his brother shot me and my sister")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     expected = {
         ('he', 'shot', 'me'),
         ('he', 'shot', 'my sister'),
         ('his brother', 'shot', 'me'),
         ('his brother', 'shot', 'my sister'),
     }
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#8
0
 def test_svo_2(self):
     """Check negated triples ('!' verb prefix) in a multi-sentence text."""
     tok = nlp(
         "making $12 an hour? where am i going to go? I have no other financial assistance available and he certainly won't provide support."
     )
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     expected = {
         ('I', '!have', 'other financial assistance available'),
         ('he', '!provide', 'support'),
     }
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#9
0
 def test_svo_8(self):
     """Check a copular clause followed by a relative clause with conjoined objects."""
     tok = nlp("he is an evil man that hurt my child and sister")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     expected = {
         ('he', 'is', 'an evil man'),
         ('an evil man', 'hurt', 'my child'),
         ('an evil man', 'hurt', 'sister'),
     }
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#10
0
 def test_svo_6(self):
     """Check negated triples ('!' verb prefix) in two comma-joined clauses."""
     tok = nlp(
         "I have no other financial assistance available, and he certainly won't provide support."
     )
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     expected = {
         ('I', '!have', 'other financial assistance available'),
         ('he', '!provide', 'support'),
     }
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#11
0
def extractSVO(dir_path):
    """Extract SVO triples from every ``.txt`` file directly inside *dir_path*.

    Each matching file is read in full, parsed with ``nlp``, run through
    ``findSVOs``, and the result is handed to ``tuplesToFile``.  If
    *dir_path* is not a directory, a message is printed and nothing else
    happens.

    Args:
        dir_path: Directory to scan (its sub-directories are not walked).

    Returns:
        None.
    """
    if not os.path.isdir(dir_path):
        # str() keeps this branch working for path-like arguments, matching
        # the str(dir_path) already used in the success message.
        print(str(dir_path) + " is not a directory")
        return

    print("Extracting Files from " + str(dir_path))
    for file in os.listdir(dir_path):
        if not file.endswith(".txt"):
            continue
        # The context manager closes the handle as soon as the text is read,
        # instead of leaving it open until garbage collection.
        with open(os.path.join(dir_path, file)) as handle:
            file_string = handle.read()
        tokens = nlp(file_string)
        svos = findSVOs(tokens)
        tuplesToFile(svos)
示例#12
0
 def test_svo_25(self):
     """Check the triples for a long sentence with prepositional-phrase objects."""
     tok = nlp(
         "Seated in Mission Control, Chris Kraft neared the end of a tedious Friday afternoon as he monitored a seemingly interminable ground test of the Apollo 1 spacecraft."
     )
     # Call printDeps(tok) here to inspect the parse when debugging.
     svos = findSVOs(tok)
     expected = {
         ('Chris Kraft', 'neared', 'the end of a tedious Friday afternoon'),
         ('he', 'monitored',
          'a interminable ground test of the Apollo spacecraft'),
     }
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#13
0
def extractSVO(dir_path):
    """Extract SVO triples from every ``.txt`` file directly inside *dir_path*.

    Each matching file is read in full, parsed with ``nlp``, run through
    ``findSVOs``, and the result is handed to ``tuplesToFile``.  Progress is
    reported through the root ``logging`` logger: files without a ``.txt``
    suffix are logged as skipped, and a non-directory *dir_path* is logged
    at the fatal/critical level.

    Args:
        dir_path: Directory to scan (its sub-directories are not walked).

    Returns:
        None.
    """
    if not os.path.isdir(dir_path):
        # str() keeps this branch working for path-like arguments, matching
        # the str(dir_path) already used in the success message.
        logging.fatal(str(dir_path) + " is not a directory")
        return

    logging.info("Extracting Files from " + str(dir_path))
    for file in os.listdir(dir_path):
        if not file.endswith(".txt"):
            logging.info("Skipping file {}".format(file))
            continue
        logging.info("Parsing file {}".format(file))
        # The context manager closes the handle as soon as the text is read,
        # instead of leaving it open until garbage collection.
        with open(os.path.join(dir_path, file)) as handle:
            file_string = handle.read()
        tokens = nlp(file_string)
        svos = findSVOs(tokens)
        tuplesToFile(svos)
示例#14
0
 def process_word(self, word):
     """Run *word* through ``subject_verb_object_extract.nlp`` and return the result."""
     parsed = subject_verb_object_extract.nlp(word)
     return parsed
示例#15
0
 def test_svo_10(self):
     """Check that the infinitive verb, not 'wanted', heads the extracted triple."""
     tok = nlp("I wanted to kill him with a hammer.")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), {('I', 'kill', 'him')})
示例#16
0
 def test_svo_9(self):
     """Check that only the main-clause triple is extracted from reported speech."""
     tok = nlp(
         "he told me i would die alone with nothing but my career someday")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), {('he', 'told', 'me')})
示例#17
0
 def test_svo_22(self):
     """Check that conjoined verbs each produce a triple with the shared subject and object."""
     tok = nlp("he beat and hurt me")
     # Call printDeps(tok) here to inspect the parse when debugging.
     svos = findSVOs(tok)
     expected = {('he', 'beat', 'me'), ('he', 'hurt', 'me')}
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#18
0
import sys
from subject_verb_object_extract import findSVOs, nlp

# Rebuild the sentence from the command-line words; every word, including the
# last one, is followed by a single space.
str1 = ''.join(word + ' ' for word in sys.argv[1:])

# Sample input for manual runs:
# str1 = "Then there’s a development setback on top of that that pushes you even further back."

# Parse the sentence, extract its triples and show them.
svos1 = findSVOs(nlp(str1))
print(svos1)
示例#19
0
 def test_svo_4(self):
     """Check that a trailing 'with ...' phrase is left out of the object."""
     tok = nlp("They ate the pizza with anchovies.")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), {('They', 'ate', 'the pizza')})
示例#20
0
from subject_verb_object_extract import findSVOs, printDeps, nlp

# Demo: parse one sentence, extract its triples, dump the dependency parse,
# then show the triples.
sentence = "expert spacy users are very kind to dogs"
tok = nlp(sentence)
svos = findSVOs(tok)
printDeps(tok)
print(svos)

示例#21
0
 def test_svo_16(self):
     """Check that a contracted negation marks the verb with '!'."""
     tok = nlp("he didn't spit on me")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), {('he', '!spit', 'me')})
示例#22
0
 def test_svo_24(self):
     """Check that a passive sentence is reported agent-first with the base verb form."""
     tok = nlp("lessons were taken by me")
     # Call printDeps(tok) here to inspect the parse when debugging.
     svos = findSVOs(tok)
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), {('me', 'take', 'lessons')})
示例#23
0
 def test_svo_23(self):
     """Check that a passive sentence is reported agent-first with the base verb form."""
     tok = nlp("I was beaten by him")
     # Call printDeps(tok) here to inspect the parse when debugging.
     svos = findSVOs(tok)
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), {('him', 'beat', 'I')})
示例#24
0
from subject_verb_object_extract import findSVOs, nlp

# Sample sentences for the demo.
str1 = "Then there’s a development setback on top of that that pushes you even further back."
str2 = "And that goes with that we’re going to do things differently, but we haven’t done that yet."
str3 = ("Seated in Mission Control, Chris Kraft neared the end of a tedious Friday afternoon as he monitored a "
        "seemingly interminable ground test of the Apollo 1 spacecraft.")

# For each sample: parse, extract, then print a numbered heading, the
# sentence itself and the triples found in it.
for heading, sample in (("\n1", str1), ("\n2", str2), ("\n3", str3)):
    extracted = findSVOs(nlp(sample))
    print(heading)
    print(sample)
    print(extracted)
示例#25
0
from subject_verb_object_extract import findSVOs, printDeps, nlp

# Show the dependency parse and the extracted triples for each demo sentence.
for sentence in ("expert spacy users are very kind to dogs",
                 "both sides should understand that"):
    tok = nlp(sentence)
    svos = findSVOs(tok)
    printDeps(tok)
    print(svos)
示例#26
0
 def test_svo_7(self):
     """Check that 'did not' marks the verb with '!'."""
     tok = nlp("he did not kill me")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), {('he', '!kill', 'me')})
示例#27
0
def roles(sentences):
    """Build labelled role lists ("nodes") for every sentence in *sentences*.

    For each sentence the function:

    1. Runs a semantic-role-labelling ``Predictor`` loaded from
       ``srl-model.tar.gz`` and turns every verb description containing more
       than one ``[...]`` span into a list of ``"LABEL: text"`` strings.
    2. If that yields nothing, or no list with at least three items, adds
       fallback nodes from ``findSVOs`` (dependency parse) and from a naive
       split of the sentence around each token tagged ``VERB``.
    3. De-duplicates items inside each node, keeps nodes with more than two
       items, lemmatizes ``V`` items, drops selected modifier items whose
       text is in a fixed word list, and rewrites known verbs to a canonical
       name (``unlink``, ``write``, ``read``, ``exec``, ...).

    Relies on names defined outside this block: ``Predictor``, ``re``,
    ``main_verbs``, ``iocs``, ``nlp``, ``findSVOs`` and ``WordNetLemmatizer``.

    Args:
        sentences: Indexable sequence of sentence strings.

    Returns:
        list: The processed nodes of all sentences concatenated; each node
        is a list of strings.  Sentences that produce no nodes contribute
        nothing.
    """
    # NOTE(review): my_svo_triplet is never appended to in this function, so
    # the "if my_svo_triplet" branch near the end never runs.
    my_svo_triplet = []
    all_nodes = []
    for i in range(len(sentences)):
        # public SRL model https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz
        # NOTE(review): the predictor is re-loaded from disk for every
        # sentence; it looks loop-invariant and could be hoisted -- confirm.
        predictor = Predictor.from_path("srl-model.tar.gz")
        predictions = predictor.predict(sentences[i])
        lst = []
        nodes = []
        # Keep only verb descriptions that contain at least two bracketed spans.
        for k in predictions['verbs']:
            if k['description'].count('[') > 1:
                lst.append(k['description'])
        # One node per kept description: the contents of each [...] span.
        for jj in range(len(lst)):
            nodes.append([])
            for j in re.findall(r"[^[]*\[([^]]*)\]", lst[jj]):
                nodes[jj].append(j)
        print("*****sentence:", sentences[i], '*****nodes: ', nodes)

        # When a node starts with a "V" item whose verb is in main_verbs and a
        # later non-"V" item contains IOCs (iocs.list_of_iocs is non-empty),
        # a placeholder " ARG-NEW: *" item is inserted at the front.
        # NOTE(review): the insert mutates lis_ while it is being indexed; after
        # the first insert lis_[0] is the placeholder, so the outer condition
        # no longer matches on later iterations.  `n` is assigned but unused.
        for lis_ in nodes:
            for indx in range(len(lis_)):
                if lis_[0].split(
                        ":", 1)[0].lower().strip() == "v" and lis_[0].split(
                            ":", 1)[1].lower().strip() in main_verbs:
                    n = len(lis_)
                    for j in range(1, len(lis_)):
                        if lis_[j].split(":", 1)[0].lower() != "v":
                            if len(iocs.list_of_iocs(lis_[j].split(":",
                                                                   1)[1])) > 0:
                                lis_.insert(0, " ARG-NEW: *")

        # Length of the longest node (0 when there are none).
        maxlength = 0
        if nodes:
            maxlength = max((len(i) for i in nodes))
        # Fallbacks, used when SRL produced no node with at least 3 items.
        if nodes == [] or maxlength < 3:
            print("****DP SVO****")
            tokens = nlp(sentences[i])
            svos = findSVOs(tokens)
            if svos:
                for sv in range(len(svos)):
                    # Only full (subject, verb, object) triples become nodes.
                    if len(svos[sv]) == 3:
                        print('Dependency SVO(s):', [
                            "ARG0: " + svos[sv][0], "V: " + svos[sv][1],
                            "ARG1: " + svos[sv][2]
                        ])
                        nodes.append([
                            "ARG0: " + svos[sv][0], "V: " + svos[sv][1],
                            "ARG1: " + svos[sv][2]
                        ])
            print("Dependency-SVO added nodes: ", nodes)

            print("****Naive SVO****")
            # Naive fallback: split the raw sentence around each VERB token;
            # the text before is the "subj" and the text after is the "obj".
            breakers = []
            subj, obj = '', ''
            doc = nlp(sentences[i])
            for token in doc:
                if token.pos_ == 'VERB':
                    breakers.append(str(token))
            if len(breakers) != 0:
                for vb in breakers:
                    subj = "subj: " + sentences[i].split(vb)[0]
                    obj = "obj: " + sentences[i].split(vb)[1]
                    # These labels are lower-case ("v: "), so the later checks
                    # for an upper-case 'V' label do not match naive nodes.
                    vrb = "v: " + vb
                    lst = []
                    lst.append(subj)
                    lst.append(vrb)
                    lst.append(obj)
                    nodes.append(lst)
            print("Naive Nodes: ", nodes)

        if nodes != []:
            # From here on `i` is reused as a loop variable and no longer
            # holds the sentence index; sentences[i] is not read again below.
            zero_dunplicate_removed = []
            for i in nodes:
                # dict.fromkeys drops repeated items while keeping their order.
                zero_dunplicate_removed.append(list(dict.fromkeys(i)))
            no_zero_nodes = []
            # Remove a bare '.' item from nodes that contain one.
            # NOTE(review): the else branch rebinds no_zero_nodes to the whole
            # de-duplicated list whenever a node has no '.', discarding what
            # was collected so far -- looks unintended; confirm.
            for i in zero_dunplicate_removed:
                if '.' in i:
                    i.remove('.')
                    no_zero_nodes.append(i)
                else:
                    no_zero_nodes = zero_dunplicate_removed

            # Keep only nodes with more than two items.
            no_zero_nodes_plus_3 = []
            for i in no_zero_nodes:
                if len(i) > 2:
                    no_zero_nodes_plus_3.append(i)

            # Modifier texts that may be dropped from a node (compared after
            # lower-casing, so the capitalised 'However' entry never matches).
            removeable_items_list = [
                'both', 'also', 'that', 'would', 'could', 'immediately',
                'usually', 'for', 'when', 'then', 'will', 'which', 'first',
                'second', 'third', 'forth', 'fifth', 'internally', 'where',
                'while', 'either', 'nither', 'when', 'sever', 'successfully',
                'also', 'to', 'above', 'already', 'recently', 'may', 'however',
                'can', 'once loaded', 'in fact', 'in this way', 'all',
                'actually', 'inadvertently', 'instead',
                'when copying themselves', 'automatically', 'should', 'can',
                'could', 'necessarily', 'if found', 'randomly', 'again',
                'still', 'generally', 'slowly', 'ever', 'shall', 'newly',
                'However', 'when executed', 'subsequently'
            ]

            # Lemmatizer: replace every "V: ..." item with the lower-cased
            # verb lemma.  `word` is assigned but not used.
            for i in range(len(no_zero_nodes_plus_3)):
                for index, item in enumerate(no_zero_nodes_plus_3[i]):
                    if item.split(': ')[0] == 'V':
                        word = item.split(': ')[1]
                        no_zero_nodes_plus_3[i][
                            index] = "V: " + WordNetLemmatizer().lemmatize(
                                item.split(": ")[1].lower(), 'v')

            # First removal pass: for each listed modifier label, delete the
            # item when its text is in removeable_items_list, otherwise report
            # it.
            # NOTE(review): items are deleted while the list is being
            # enumerated, so the item right after a deleted one is skipped.
            for i in range(len(no_zero_nodes_plus_3)):

                if no_zero_nodes_plus_3[i]:
                    for index, item in enumerate(no_zero_nodes_plus_3[i]):

                        if 'ARGM-MOD:' in item:
                            if item.split(
                                    ': ',
                                    1)[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'ARGM-ADV:' in item:
                            if item.split(
                                    ': ',
                                    1)[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'ARGM-TMP:' in item:
                            if item.split(
                                    ': ',
                                    1)[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'ARGM-MNR:' in item:
                            if item.split(
                                    ': ',
                                    1)[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'R-ARG1:' in item:
                            if item.split(
                                    ': ',
                                    1)[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'R-ARG0:' in item:
                            if item.split(
                                    ': ',
                                    1)[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'ARGM-DIS:' in item:
                            if item.split(
                                    ': ',
                                    1)[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'ARGM-PRP:' in item:
                            if item.split(
                                    ': ',
                                    1)[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

            # Second removal pass over the first six labels only (no ARGM-DIS
            # or ARGM-PRP), this time splitting without a maxsplit limit.
            # Presumably it exists to catch items the first pass skipped --
            # TODO confirm.
            for i in range(len(no_zero_nodes_plus_3)):
                if no_zero_nodes_plus_3[i]:
                    for index, item in enumerate(no_zero_nodes_plus_3[i]):

                        if 'ARGM-MOD:' in item:
                            if item.split(
                                    ': ')[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'ARGM-ADV:' in item:
                            if item.split(
                                    ': ')[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'ARGM-TMP:' in item:
                            if item.split(
                                    ': ')[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'ARGM-MNR:' in item:
                            if item.split(
                                    ': ')[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'R-ARG1:' in item:
                            if item.split(
                                    ': ')[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

                        if 'R-ARG0:' in item:
                            if item.split(
                                    ': ')[1].lower() in removeable_items_list:
                                del no_zero_nodes_plus_3[i][index]
                            else:
                                print("##### NEW Exception: ", item)

            # Verb groups: any verb found in a group is rewritten below to the
            # group's canonical name.
            v_unlink = [
                'delete', 'clear', 'remove', 'erase', 'wipe', 'purge',
                'expunge'
            ]
            v_write = [
                'entrench', 'exfiltrate', 'store', 'drop', 'drops', 'install',
                'place', 'deploy', 'implant', 'write', 'putfile', 'compose',
                'create', 'creates', 'copy', 'copies', 'save', 'saved',
                'saves', 'add', 'adds', 'modify', 'modifies', 'append',
                'appends'
            ]
            v_read = [
                'survey', 'download', 'navigate', 'locate', 'read', 'gather',
                'extract', 'extracts', 'obtain', 'acquire', 'check', 'checks',
                'detect', 'detects', 'record', 'records'
            ]
            v_exec = [
                'use', 'execute', 'executed', 'run', 'ran', 'launch', 'call',
                'perform', 'list', 'invoke', 'inject', 'open', 'opened',
                'target', 'resume', 'exec'
            ]
            v_mmap = ['allocate', 'assign']
            v_fork = [
                'clone', 'clones', 'spawned', 'spawn', 'spawns', 'issue', 'set'
            ]
            v_setuid = ['elevate', 'impersonated']
            v_send = [
                'send', 'sent', 'transfer', 'post', 'postsinformation',
                'postsinformations', 'move', 'transmit', 'deliver', 'push',
                'redirect', 'redirects'
            ]
            v_receive = ['receive', 'accept', 'take', 'get', 'gets', 'collect']
            v_connect = [
                'click', 'browse', 'browses', 'connect', 'connected',
                'portscan', 'connects', 'alerts', 'communicates', 'communicate'
            ]
            v_chmod = [
                'chmod', 'change permission', 'changes permission',
                'permision-modifies', 'modifies permission',
                'modify permission'
            ]
            v_load = ['load', 'loads']
            v_exit = [
                'terminate', 'terminates', 'stop', 'stops', 'end', 'finish',
                'break off', 'abort', 'conclude'
            ]
            # v_2D is defined but not referenced anywhere in this function.
            v_2D = {'collect': ('read', 'receive'), 'open': ('exec', 'fork')}

            # Rewrite each "V: <verb>" item to its canonical name; the first
            # matching group in the elif chain wins, and verbs in no group are
            # left unchanged.
            for i in range(len(no_zero_nodes_plus_3)):
                for index, item in enumerate(no_zero_nodes_plus_3[i]):
                    if item.split(': ')[0] == 'V':
                        if item.split(': ')[1] in v_unlink:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'unlink'
                        elif item.split(': ')[1] in v_write:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'write'
                        elif item.split(': ')[1] in v_read:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'read'
                        elif item.split(': ')[1] in v_exec:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'exec'
                        elif item.split(': ')[1] in v_mmap:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'mmap'
                        elif item.split(': ')[1] in v_fork:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'fork'
                        elif item.split(': ')[1] in v_setuid:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'setuid'
                        elif item.split(': ')[1] in v_send:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'send'
                        elif item.split(': ')[1] in v_receive:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'receive'
                        elif item.split(': ')[1] in v_connect:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'connect'
                        elif item.split(': ')[1] in v_chmod:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'chmod'
                        elif item.split(': ')[1] in v_load:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'load'
                        elif item.split(': ')[1] in v_exit:
                            no_zero_nodes_plus_3[i][index] = 'V: ' + 'exit'
        else:
            # No nodes at all for this sentence: nothing to accumulate.
            continue
        # Accumulate this sentence's processed nodes into the overall result.
        all_nodes += no_zero_nodes_plus_3
        if my_svo_triplet:
            all_nodes += my_svo_triplet
    print('*****all_nodes:::', all_nodes)
    return all_nodes
示例#28
0
 def test_svo_21(self):
     """Check that negation carries over to both objects joined by 'nor'."""
     tok = nlp("he didn't spit on me nor my child")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     expected = {('he', '!spit', 'me'), ('he', '!spit', 'my child')}
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), expected)
示例#29
0
        return True
    return False
 
# Fetch a page and collect the text of every <p> inside the <div> whose class
# is `className` (a name defined outside this snippet).
# NOTE(review): urlopen() is called without a URL here, which raises
# TypeError as written -- the address appears to have been removed.
html = urllib.request.urlopen()
# NOTE(review): no parser is named, so BeautifulSoup picks one itself.
soup = BeautifulSoup(html)
data = soup.find("div", {"class": className})
paras = data.findAll("p")
paras = [o.text for o in paras]

nlp = spacy.load('en_core_web_lg')

# Load NeuralCoref and add it to the spaCy pipeline.
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

# Replace every paragraph with its coreference-resolved text.
paras = [nlp(para)._.coref_resolved for para in paras]


# Disabled alternative: SVO extraction (noted as less accurate).
# NOTE(review): the disabled code parses `sent`, which is not defined in this
# snippet; the loop variable is `para`.
# allsvos = []
# for para in paras:
#     tokens = nlp(sent)
#     svos = findSVOs(tokens)
#     allsvos.extend(svos)


# NOTE(review): testData is never filled in the visible code, and `ls` is
# overwritten on every iteration -- the loop body looks truncated.
testData = []
for para in paras:
    instances = getInstances(para)
    # Pair each distinct lower-cased instance with its getSim score against
    # thresholdWord (spaces in the instance are replaced by underscores).
    ls = [(word, getSim("_".join(word.split(" ")), thresholdWord)) for word in list(set([a.lower() for a in instances]))]
    # All distinct 2-combinations of whatever filter_dissimilar returns.
    ls = list(set(list(itertools.combinations(filter_dissimilar(ls), 2))))
示例#30
0
 def test_svo_3(self):
     """Check that a contracted negation marks the verb with '!'."""
     tok = nlp("I don't have other assistance")
     svos = findSVOs(tok)
     # Call printDeps(tok) here to inspect the parse when debugging.
     # assertEqual reports which triples differ; assertTrue(a == b) would
     # only report "False is not true".
     self.assertEqual(set(svos), {('I', '!have', 'other assistance')})