def coref_the_following_colon(stri):
    final_txt = ''
    c = fl = 0
    list1 = load_lists(fpath)['TFCL']
    list1 = list1.replace("'", "").strip('][').split(', ')
    sentences = sent_tokenize(stri)
    for sentence in sentences:
        c += 1
        for value in list1:
            if value in sentence:
                sentence = sentence.strip()  # drop possible whitespace at the end of the sentence
                if sentence[-1] == ".":
                    sentence = sentence[:-1]  # remove the trailing period
                if ":" in sentence:
                    # drop the trigger phrase so the clause after the colon
                    # attaches directly to the subject
                    sentence2 = sentence.replace(value, " ") + ". "
                    final_txt += " " + sentence2
                    fl += 1
                break
        if c > fl:
            final_txt += " " + sentence
            fl += 1
    return final_txt
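# Usage sketch (hypothetical input; the real output depends on the 'TFCL'
# patterns loaded from fpath). A trigger sentence such as
#   coref_the_following_colon("It performs the following actions: it deletes shadow copies.")
# comes back with the trigger phrase blanked out, so the clause after the
# colon reads as a direct continuation of the subject.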
def colon_seprator_multiplication(stri):
    coref_reference_list = load_lists(fpath)['TFL']
    coref_reference_list = coref_reference_list.replace(
        "'", "").strip('][').split(', ')
    stri = stri.rstrip()
    stri = stri.rstrip('.')
    result = ""
    for item in sent_tokenize(stri):
        flag = False
        for reference in coref_reference_list:
            if reference in item and ":" in item:
                sentence_splits = item.split(":", 1)
                y = iocs.list_of_iocs(sentence_splits[1])
                if y:
                    # one copy of the pre-colon clause per IOC found after the colon
                    sentence_replicas = [sentence_splits[0].rstrip(":")] * len(y)
                    for i in range(len(sentence_replicas)):
                        result += sentence_replicas[i].replace(reference, y[i]) + " . "
                else:
                    item = sentence_splits[0].replace(reference, sentence_splits[1])
                    result += item
                flag = True
                break
        if not flag:
            result += item
        result += " "
    if result.rstrip() and result.rstrip()[-1] != ".":
        result += "."
    return result
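# Usage sketch (hypothetical input; assumes iocs.list_of_iocs() recognizes the
# file names). A sentence listing IOCs after a colon is multiplied into one
# sentence per IOC:
#   colon_seprator_multiplication("It drops the following files: a.exe, b.exe.")
# yields roughly "It drops a.exe . It drops b.exe ." with "the following
# files" replaced by each IOC in turn.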
def astriks(lis):
    apps_process = load_lists(fpath)['APPs-PROCESS']
    apps_process = apps_process.replace("'", "").strip('][').split(' , ')
    updated_list = [[] for _ in range(len(lis))]
    for jj, lst in enumerate(lis):
        for i in range(len(lst)):
            if ":" not in lst[i]:
                lst[i] = "TMP: " + lst[i]
            leftnode = lst[i].split(":", 1)[0]
            rightnode = lst[i].split(":", 1)[1]
            lOFioc = iocs.list_of_iocs(rightnode)
            found_app = [app for app in apps_process if app in rightnode.lower()]
            if len(found_app) > 1:
                found_app = list(group_partials(found_app))
            found_app = process_convert(found_app)
            if not lOFioc:
                if leftnode.lower() != "v" and not found_app:
                    updated_list[jj].append(leftnode + ": *")
                elif leftnode.lower() == "v":
                    updated_list[jj].append(lst[i])
                elif leftnode.lower() != "v" and found_app:
                    for process in found_app:
                        updated_list[jj].append(leftnode + ": " + process)
            elif len(lOFioc) == 1:
                updated_list[jj].append(leftnode + ": " + lOFioc[0])
            else:
                # two or more IOCs: keep the first, emit the rest as new arguments
                updated_list[jj].append(leftnode + ": " + lOFioc[0])
                for index in range(1, len(lOFioc)):
                    updated_list[jj].append("ARG-NEW: " + lOFioc[index])
    return updated_list
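# Usage sketch (hypothetical SRL nodes). Each "LABEL: text" node keeps an IOC
# or a known process name on its right-hand side, or falls back to the "*"
# wildcard; any extra IOCs become "ARG-NEW" nodes:
#   astriks([["ARG0: the trojan", "V: drops", "ARG1: x.exe and y.exe"]])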
def on_the_windows_x_only():
    on_the_windows_x_list = load_lists(fpath)['MS_OTW']
    on_the_windows_x_list = on_the_windows_x_list.replace(
        "'", "").strip('][').split(', ')
    lst = perform_following_action()
    # build a new list rather than calling lst.remove() while iterating,
    # which would skip elements
    return [i for i in lst if i not in on_the_windows_x_list]
def perform_following_action():
    # e.g. "When Virus:Win32/Funlove.4099 runs, it performs the following actions:"
    perform_following_action_list = load_lists(fpath)['MS_PFA']
    perform_following_action_list = perform_following_action_list.replace(
        "'", "").strip('][').split(', ')
    lst = remove_analysis_by()
    # drop every sentence that contains one of the MS_PFA patterns; filtering
    # into a new list avoids mutating lst while iterating over it
    return [i for i in lst if not any(j in i for j in perform_following_action_list)]
def zero_word_verb(stri):
    doc = nlp(stri.strip())
    main_verbs = load_lists(fpath)['verbs']
    main_verbs = main_verbs.replace("'", "").strip('][').split(', ')
    # True when the sentence opens with a verb: a modal (MD), a bare verb (VB),
    # or any word from the configured main-verbs list
    return (doc[0].tag_ == "MD"
            or doc[0].tag_ == "VB"
            or str(doc[0]).lower() in main_verbs)
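# Usage sketch: zero_word_verb("Creates a mutex named xyz.") should return True
# (imperative, verb-initial) provided "creates" appears in the configured verbs
# list, while zero_word_verb("The worm creates a mutex.") returns False.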
def verb_and_verb(txt):
    verbs_list = load_lists(fpath)['verbs']
    verbs_list = verbs_list.replace("'", "").strip('][').split(', ')
    doc = nlp(txt)
    result = ""
    # look for a "VERB CCONJ VERB" window, e.g. "downloads and executes";
    # stop two tokens early so doc[i + 2] stays in range
    for i in range(len(doc) - 2):
        if doc[i].pos_ == "VERB" and doc[i + 1].pos_ == "CCONJ" and doc[i + 2].pos_ == "VERB":
            if doc[i].text in verbs_list and doc[i + 2].text in verbs_list:
                candidate = doc[i].text + " " + doc[i + 1].text + " " + doc[i + 2].text
                # split the coordinated verbs into two copies of the sentence
                result += txt.replace(candidate, doc[i].text) + " "
                result += txt.replace(candidate, doc[i + 2].text)
                break
    return result
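# Usage sketch (hypothetical input; both verbs must be in the configured verbs
# list). A coordinated pair is split into one sentence per verb:
#   verb_and_verb("The dropper downloads and executes the payload.")
# returns roughly "The dropper downloads the payload. The dropper executes
# the payload."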
def removable_token():
    removable_token_list = load_lists(fpath)['RTL']
    removable_token_list = removable_token_list.replace(
        "'", "").strip('][').split(', ')
    lst = on_the_windows_x_only()
    for idx, value in enumerate(lst):
        for j in removable_token_list:
            # use startswith() only, so the token is stripped solely when it
            # opens the sentence
            if value.strip().startswith(j):
                lst[idx] = value.replace(j, " ")
    return lst
def following_subject(txt):
    following_subject_list = load_lists(fpath)['TFSL']
    following_subject_list = following_subject_list.replace(
        "'", "").strip('][').split(', ')
    txt = txt.rstrip()
    txt = txt.rstrip('.')
    result = ""
    for sent in sent_tokenize(txt):
        for item in following_subject_list:
            if item in sent and ":" in sent:
                old_subj = item
                y = iocs.list_of_iocs(sent)
                if y:
                    # replicate the clause before the colon once per IOC,
                    # with the IOC promoted to subject position
                    head = sent.split(":", 1)[0]
                    body = head.replace(old_subj, " ")
                    for ioc in y:
                        result += ioc + body + " . "
                break
    return result
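# Usage sketch (hypothetical input; assumes 'TFSL' contains "The following
# files" and that iocs.list_of_iocs() finds the names). Each IOC after the
# colon is promoted to subject position of the clause before it:
#   following_subject("The following files are dropped: a.exe, b.exe.")
# yields roughly "a.exe are dropped . b.exe are dropped ."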
def findSVOs(tokens):
    svos = []
    is_pas = _is_passive(tokens)
    verbs = [tok for tok in tokens if _is_non_aux_verb(tok)]
    # @kia: fall back to the main-verbs list when no non-auxiliary verb is found
    if not verbs:
        main_verbs = load_lists(fpath)['verbs']
        main_verbs = main_verbs.replace("'", "").strip('][').split(', ')
        verbs = [tok for tok in tokens if str(tok) in main_verbs]
    visited = set()  # recursion detection
    for v in verbs:
        subs, verbNegated = _get_all_subs(v)
        # hopefully there are subs; if not, don't examine this verb any longer
        if len(subs) > 0:
            isConjVerb, conjV = _right_of_verb_is_conj_verb(v)
            if isConjVerb:
                v2, objs = _get_all_objs(conjV, is_pas)
                for sub in subs:
                    for obj in objs:
                        objNegated = _is_negated(obj)
                        if is_pas:  # reverse object/subject for passive voice
                            svos.append((to_str(expand(obj, tokens, visited)),
                                         "!" + v.lemma_ if verbNegated or objNegated else v.lemma_,
                                         to_str(expand(sub, tokens, visited))))
                            svos.append((to_str(expand(obj, tokens, visited)),
                                         "!" + v2.lemma_ if verbNegated or objNegated else v2.lemma_,
                                         to_str(expand(sub, tokens, visited))))
                        else:
                            svos.append((to_str(expand(sub, tokens, visited)),
                                         "!" + v.lower_ if verbNegated or objNegated else v.lower_,
                                         to_str(expand(obj, tokens, visited))))
                            svos.append((to_str(expand(sub, tokens, visited)),
                                         "!" + v2.lower_ if verbNegated or objNegated else v2.lower_,
                                         to_str(expand(obj, tokens, visited))))
            else:
                v, objs = _get_all_objs(v, is_pas)
                for sub in subs:
                    for obj in objs:
                        objNegated = _is_negated(obj)
                        if is_pas:  # reverse object/subject for passive voice
                            svos.append((to_str(expand(obj, tokens, visited)),
                                         "!" + v.lemma_ if verbNegated or objNegated else v.lemma_,
                                         to_str(expand(sub, tokens, visited))))
                        else:
                            svos.append((to_str(expand(sub, tokens, visited)),
                                         "!" + v.lower_ if verbNegated or objNegated else v.lower_,
                                         to_str(expand(obj, tokens, visited))))
    return svos
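# Usage sketch (assumes a spaCy Doc as input; _get_all_subs, _get_all_objs and
# the other helpers come from the surrounding SVO-extraction module):
#   findSVOs(nlp("The trojan injects code into explorer.exe"))
# returns (subject, verb, object) triples along the lines of
# ("the trojan", "injects", "code").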
def modification_(stri):
    final_txt = ''
    c = fl = 0
    pattern = load_lists(fpath)['MDF']
    pattern = pattern.replace("'", "").strip('][').split(', ')
    sentences = sent_tokenize(stri)
    for sentence in sentences:
        c += 1
        for value in pattern:
            if value in sentence:
                # normalize the matched phrase to the single verb "modifies"
                sentence1 = sentence.split(
                    value)[0] + ' modifies ' + sentence.split(value)[1]
                final_txt += " " + sentence1 + " "
                fl += 1
                break
        if c > fl:
            final_txt += " " + sentence
            fl += 1
    return final_txt.strip()
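# Usage sketch (hypothetical input; assumes 'MDF' holds phrases such as "makes
# changes to"). The matched phrase is normalized to the single verb "modifies":
#   modification_("The virus makes changes to the registry.")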
def communicate_to_sr(stri):
    final_txt = ''
    c = fl = 0
    pattern = load_lists(fpath)['COMU']
    pattern = pattern.replace("'", "").strip('][').split(', ')
    sentences = sent_tokenize(stri)
    for sentence in sentences:
        c += 1
        for value in pattern:
            if value in sentence:
                # split one ambiguous communication sentence into the two
                # possible directions
                sentence1 = sentence.split(
                    value)[0] + ' receives from' + sentence.split(value)[1]
                sentence2 = sentence.split(
                    value)[0] + ' sends to' + sentence.split(value)[1]
                final_txt += " " + sentence1 + " " + sentence2
                fl += 2
                c += 1
                break
        if c > fl:
            final_txt += " " + sentence
            fl += 1
    return final_txt
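# Usage sketch (hypothetical input; assumes 'COMU' holds phrases such as
# "communicates with"). One direction-neutral sentence becomes two directional
# ones:
#   communicate_to_sr("The bot communicates with a remote server.")
# yields roughly "The bot receives from a remote server. The bot sends to a
# remote server."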
def ellipsis_subject(stri):
    ellipsis_verbs = load_lists(fpath)['verbs']
    ellipsis_verbs = ellipsis_verbs.replace("'", "").strip('][').split(', ')
    sent_text = nltk.sent_tokenize(stri)
    result = ""
    for sentence in sent_text:
        token = nltk.word_tokenize(sentence)
        doc = nlp(sentence)
        first_word, first_tag = nltk.pos_tag(token)[0]
        if (first_tag in ("VB", "VBZ") or doc[0].pos_ == "VERB"
                or doc[0].text.lower() in ellipsis_verbs):
            # verb-initial sentence: restore the elided subject "It"
            new_sentence = " It " + first_word.lower() + " " + " ".join(
                sentence.split(" ")[1:])
            result += " " + new_sentence
        elif doc[0].dep_ == "ROOT":
            if doc[0].text.lower() in ellipsis_verbs:
                new_sentence = " It " + doc[0].text.lower() + " " + " ".join(
                    sentence.split(" ")[1:])
                result += " " + new_sentence
        elif doc[0].text.lower() in ellipsis_verbs and doc[0].dep_ != "ROOT":
            result += " " + doc.text
        else:
            result += " " + sentence
    return result
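# Usage sketch: verb-initial sentences get the elided subject "It" restored,
# e.g. ellipsis_subject("Connects to the remote host.") comes back as
# " It connects to the remote host.", while subject-initial sentences pass
# through unchanged.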
def coref_the_following_middle(stri):
    final_txt = ''
    list2 = load_lists(fpath)['TFL']
    list2 = list2.replace("'", "").strip('][').split(', ')
    sentences = sent_tokenize(stri)
    c = fl = 0
    for sentence in sentences:
        c += 1
        for value in list2:
            if value in sentence:
                sentence = sentence.strip()
                if sentence[-1] == "." and ":" in sentence:
                    sentence = sentence[:-1]
                    # substitute the clause after the colon back into the
                    # position of the trigger phrase
                    sentence2 = sentence.split(":")[0].replace(
                        value, sentence.split(":", 1)[1]) + ". "
                    final_txt += ' ' + sentence2
                    fl += 1
                break
        if c > fl:
            final_txt += ' ' + sentence
            fl += 1
    return final_txt
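# Usage sketch (hypothetical input; assumes 'TFL' holds a phrase such as "the
# following file"). The clause after the colon replaces the trigger phrase in
# the middle of the sentence:
#   coref_the_following_middle("It deletes the following file: svchost.exe.")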
import nltk
import spacy
from nltk import sent_tokenize

from lists_patterns import load_lists, fpath
import main

nlp = spacy.load("en_core_web_lg")

if not main.args.input_file:
    raise ValueError(
        "usage: main.py [-h] [--asterisk ASTERISK] [--crf CRF] [--rmdup RMDUP] "
        "[--gname GNAME] [--input_file INPUT_FILE]"
    )

with open(main.args.input_file, encoding='iso-8859-1') as f:
    txt = " ".join(f.readlines())
    txt = txt.replace('\n', ' ')

titles_list = load_lists(fpath)['MS_TITLES']
titles_list = titles_list.replace("'", "").strip('][').split(', ')
main_verbs = load_lists(fpath)['verbs']
main_verbs = main_verbs.replace("'", "").strip('][').split(', ')


def delete_brackets(stri):
    # strip bracket characters that confuse the downstream parsers
    stri = stri.replace("[", "")
    stri = stri.replace("]", "")
    stri = stri.replace("<", "")
    stri = stri.replace(">", "")
    return stri


txt = delete_brackets(txt)
txt = txt.strip(" ")
import re
from list_iocs import iocs
from allennlp.predictors.predictor import Predictor
from lists_patterns import load_lists, fpath
from nltk import sent_tokenize

my_svo_triplet, all_nodes = [], []
main_verbs = load_lists(fpath)['verbs']
main_verbs = main_verbs.replace("'", "").strip('][').split(', ')
sentences = r''' '''


# Abstractive summarization: to be added
def ats():
    # load the AllenNLP SRL predictor once, not once per sentence
    predictor = Predictor.from_path("srl-model.tar.gz")
    for sentence in sent_tokenize(sentences):
        predictions = predictor.predict(sentence)
        lst = []
        nodes = []
        for k in predictions['verbs']:
            if k['description'].count('[') > 1:
                lst.append(k['description'])
        for jj in range(len(lst)):
            nodes.append([])
            # pull every "[LABEL: text]" span out of the SRL description
            for j in re.findall(r"[^[]*\[([^]]*)\]", lst[jj]):
                nodes[jj].append(j)
        print("*****sentence:", sentence, '*****nodes: ', nodes)
        for lis_ in nodes:
            for indx in range(len(lis_)):
                if (lis_[0].split(":", 1)[0].lower().strip() == "v"
                        and lis_[0].split(":", 1)[1].lower().strip() in main_verbs):
def C_C(txt):
    pattern = load_lists(fpath)['C_C']
    pattern = pattern.replace("'", "").strip('][').split(', ')
    # case-insensitively collapse every C2 phrase to the token 'remote ip:*'
    big_regex = re.compile('|'.join(map(re.escape, pattern)), re.IGNORECASE)
    sentence = big_regex.sub('remote ip:*', str(txt))
    return sentence
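# Usage sketch (hypothetical input; assumes 'C_C' holds C2 phrases such as
# "command and control server"). Every match collapses to the canonical token
# 'remote ip:*':
#   C_C("The malware contacts its command and control server.")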