def extractKalimat(filesOfBerita, keyword, method):
    """Scan every news file for `keyword` and collect matching sentences.

    Clears the module-level accumulators, then runs the matcher selected by
    `method` ("kmp" or "bm") over each sentence of each file under ../test/,
    forwarding every hit to prosesKalimat().
    """
    global source, datesResult, numsResult, sntcResult
    for bucket in (source, datesResult, numsResult, sntcResult):
        bucket.clear()
    # NOTE(review): the original's fallback branch is labelled "regex" but
    # actually runs KMP — that behavior is preserved here; confirm intent.
    if method == "bm":
        matcher = BM.bmAlgorithm
    else:
        matcher = KMP.kmpAlgorithm
    for berita in filesOfBerita:
        for kalimat in extractBerita("../test/" + berita):
            foundIndex = matcher(kalimat, keyword)
            if foundIndex != -1:
                prosesKalimat(kalimat, berita, foundIndex)
def matchWithMethod(method, text, pattern):
    """Dispatch `pattern`-in-`text` matching to the algorithm named by `method`.

    'bm' -> Boyer-Moore, 'kmp' -> Knuth-Morris-Pratt, anything else -> regex.
    """
    dispatch = {
        'bm': BM.bmAlgorithm,
        'kmp': KMP.kmpAlgorithm,
    }
    return dispatch.get(method, regexp.regex)(text, pattern)
def timeAnalysis(pat, txt):
    """Benchmark three string-matching algorithms on (pat, txt).

    Runs each of naive matching, Rabin-Karp and KMP ten times and prints
    the mean process time of each.
    """
    RUNS = 10
    t1_avg = t2_avg = t3_avg = 0.0
    for _ in range(RUNS):
        start = time.process_time()
        nv.naiveMatching(pat, txt)
        t1_avg += time.process_time() - start

        start = time.process_time()
        rk.rabin_karp_search(pat, txt)
        t2_avg += time.process_time() - start

        start = time.process_time()
        kmp.KMPSearch(pat, txt)
        t3_avg += time.process_time() - start
    # Bug fix: the sums cover 10 runs but were divided by 100, which
    # reported averages 10x smaller than reality.
    t1_avg /= RUNS
    t2_avg /= RUNS
    t3_avg /= RUNS
    print("Naive:" + str(t1_avg))
    print("Rabin:" + str(t2_avg))
    print("Knuth:" + str(t3_avg))
def test_overlap(self):
    """Test the overlap function with an easy case."""
    seq1 = "abdc sdf "
    seq2 = "sdf sabd"
    seq3 = "cc"
    # (left, right, expected overlap length, expected overlapping prefix)
    cases = [
        (seq1, seq2, 4, "sdf "),
        (seq2, seq1, 3, "abd"),
        (seq2, seq3, 0, None),
    ]
    for left, right, expected_n, expected_prefix in cases:
        n_overlaps = KMP.characters_overlapping(left, right)
        self.assertEqual(n_overlaps, expected_n)
        if expected_prefix is not None:
            self.assertEqual(right[0:n_overlaps], expected_prefix)
def create_1000_case_test():
    """Encode test texts with BERT into fixed-height vectors and save them.

    Texts longer than 502 tokens are split at the raw-text position of
    tokens 500/501 and both halves are encoded and stacked; all other texts
    are encoded once and zero-padded. Writes the stacked result to
    test_case_1000.npy.
    """
    encoded = []
    bc = BertClient(ip='222.19.197.230', port=5555, port_out=5556,
                    check_version=False)
    test_text = pre_deal.get_test_textVector()
    zero_vector = np.zeros((500, 768))

    def _encode_whole(text):
        # Single-segment encoding, zero-padded to a fixed height.
        vector = bc.encode([text])
        return np.concatenate((vector[0], zero_vector), axis=0)

    for text in test_text:
        tokens = tokenize.word_tokenize(text)
        split_at = -1
        if len(tokens) > 502:
            # Locate the raw-character split point for tokens 500/501.
            split_at = KMP.KMP_algorithm(text, tokens[500] + " " + tokens[501])
        if split_at != -1:
            # Fix: the original shadowed the builtin `list` here; it also
            # duplicated the zero-pad branch twice — merged into one path.
            segments = [text[0:split_at], text[split_at:]]
            vector = bc.encode(segments)
            ve = np.concatenate((vector[0], vector[1]), axis=0)
        else:
            ve = _encode_whole(text)
        encoded.append(ve.tolist())
    np.save("test_case_1000.npy", np.array(encoded))
def SpamDetector(tweet, keywords, chosenAlgo):
    """Flag a tweet dict as spam by matching `keywords` against its text.

    chosenAlgo 1 -> KMP, 2 -> Boyer-Moore, anything else -> regex.
    Mutates `tweet` by adding an 'IS_SPAM' boolean and returns it.
    """
    text = tweet["full_text"]
    if chosenAlgo == 1:
        verdict = KMP.KMPMatching(text, keywords) != -1
    elif chosenAlgo == 2:
        verdict = BM.BMMatching(text, keywords) != -1
    else:
        verdict = RE.regex(text, keywords) != None
    tweet['IS_SPAM'] = verdict
    return tweet
def test_find_overlap(self):
    """Test the find/overlap function with an easy case."""
    seq1 = "abdc sdf"
    seq2 = "dfke "
    seq3 = "dfs abd"
    seq4 = "dc s"
    seq5 = "pp"
    # (left, right, expected overlap count, expected position)
    cases = [
        (seq1, seq2, 2, 6),
        (seq3, seq1, 3, 4),
        (seq1, seq4, 4, 2),
        (seq1, seq5, 0, len(seq1)),
    ]
    for left, right, want_overlaps, want_position in cases:
        position, n_overlaps = KMP.find_or_overlap(left, right)
        self.assertEqual(n_overlaps, want_overlaps)
        self.assertEqual(position, want_position)
def search(files, keyword):
    """Count `keyword` occurrences in each file, most frequent first.

    @param files    iterable of file paths
    @param keyword  pattern passed to KMP.count
    @return list of {"file": path, "num": count} dicts sorted by
            descending count
    """
    result = []
    for path in files:
        # `with` guarantees the handle is closed even if reading or
        # counting raises (the original leaked it on error).
        with open(path, "r") as fh:
            result.append({"file": path,
                           "num": KMP.count(fh.read(), keyword)})
    result.sort(key=lambda entry: entry["num"], reverse=True)
    return result
def doKMP():
    """Handle a KMP spam-detection request posted in the 'hasil' form field.

    Builds a KMP matcher from the decoded spam indicator, fetches tweets by
    username or by region, and annotates every tweet text with a spam verdict.
    """
    payload = json.JSONDecoder().decode(request.form['hasil'])
    pattern = KMP(payload['spam_indicator'])
    count = payload['count']
    if payload['search_type'] == "1":
        tweets = api.search_timeline(payload['username'], count)
    else:
        tweets = api.search_region(payload['region'], count)
    tweets['is_spam'] = [pattern.is_match(text)
                         for text in tweets['full_text']]
    return json.dumps(tweets)
def cal_words_positions(files):
    """Map each distinct word found in `files` to its match positions.

    @param files iterable of file paths
    @return list of {"word": w, "pos": [...]} dicts. Positions are computed
            against the content of the first file the word appears in,
            as in the original.
    """
    positions = {}
    for path in files:
        # Context manager closes the handle even if KMP.positions raises
        # (the original leaked it on error).
        with open(path, "r") as fh:
            content = fh.read()
        for word in content.split(" "):
            if word not in positions:
                positions[word] = KMP.positions(content, word)
    return [{"word": word, "pos": pos} for word, pos in positions.items()]
def cal_words_freq(files, reverse=True):
    """Count occurrences of each distinct (case-folded) word across `files`.

    @param files    iterable of file paths
    @param reverse  sort descending by count when True (default)
    @return list of {"word": w, "num": count} dicts sorted by count
    """
    counts = {}
    for path in files:
        # `with` replaces the unprotected open/close pair.
        with open(path, "r") as fh:
            content = fh.read()
        for word in content.split(" "):
            key = word.lower()
            # Bug fix: the original assigned result[word.lower] — the bound
            # method object (a distinct key per string object), not its
            # value — so the dedup check against word.lower() never matched
            # the stored entries.
            if key not in counts:
                # Count uses the original-cased word, as before.
                counts[key] = KMP.count(content, word)
    result = [{"word": key, "num": num} for key, num in counts.items()]
    result.sort(key=lambda entry: entry["num"], reverse=reverse)
    return result
def compute_overlaps(fragments_dict, mat):
    """Compute the overlaps between the fragments and store them in a MatchManager

    @param fragments_dict A dictionary of fragments with their ID as key
    @param mat A MatchManager
    """
    log.debug("Computing overlaps")
    # Descending key order: during the first iteration a greater key roughly
    # corresponds to a longer fragment (only guaranteed then).
    keys = sorted(fragments_dict.keys(), reverse=True)
    for left, right in itertools.product(keys, keys):
        if left == right:
            continue
        left_frag = fragments_dict[left]
        right_frag = fragments_dict[right]
        if not mat.needs_calculation(left, right,
                                     len(left_frag), len(right_frag)):
            continue
        position, n_chars = KMP.find_or_overlap(left_frag, right_frag)
        log.debug("Overlap between sequences %s (left) and %s (right): %s",
                  left, right, n_chars)
        mat.store(Match(left, right, position, n_chars))
def word_result(freqs, word):
    """Collect per-file counts and positions of `word`, sorted by count.

    `freqs` is a sequence of (filename, per-word frequency dicts) tuples.
    Single words are looked up in the precomputed frequency data; phrases
    containing a space are searched directly in the file content.
    """
    entries = []
    for word_tuple in freqs:
        filename = word_tuple[0]
        entry = {"file": filename, "num": 0, "pos": []}
        if " " not in word:
            # Precomputed data; the last matching record wins, as before.
            for record in word_tuple[1]:
                if record["word"] == word:
                    entry["num"] = len(record["pos"])
                    entry["pos"] = record["pos"]
        else:
            found = KMP.positions(File.File(filename).get_content(), word)
            entry["num"] = len(found)
            entry["pos"] = found
        entries.append(entry)
    entries.sort(key=lambda e: e["num"], reverse=True)
    return entries
# Input configuration: the text file to scan and the pattern to look for.
fileName = 'book.txt'
# reader.readFile("Paragraphs/para0.txt")
# reader.readFile("Paragraphs/para1.txt")
# reader.readFile("Paragraphs/para2.txt")
# "of"
# pattern = reader.readFile("Paragraphs/para2.txt")
pattern = "his"

# Get content's file
s = reader.readFile(fileName)
# print(" ==> Text :: %s" % s)
print(" ==> Pattern :: %s" % pattern)
print(" ==> Pattern Length :: %d" % len(pattern))

# Run every matching algorithm on the same (text, pattern) pair.
Bruteforce.bruteForce(s, pattern)  # Bruteforce Algorithm
Sunday.sunday(s, pattern)          # Sunday Algorithm
KMP.KMP(s, pattern)                # KMP Algorithm
FSM.FSM(s, pattern)                # FSM Algorithm
RabinKarp.rabinKarp(s, pattern)    # Rabin-Karp Algorithm
import BF
import KMP

if __name__ == "__main__":
    # `with` replaces the original try/finally open/close pair and still
    # guarantees the handle is released on any error while reading.
    with open("test.txt") as file:
        text_string = file.read()
    pat_string = "bacbababadababacambabacaddababacasdsd"
    print(KMP.kmp_search(text_string, pat_string))
    print("*" * 50)
    print(BF.bf_search(text_string, pat_string))
def match(word, algo, filename):
    """Search `word` in the sentences of `filename` with the chosen algorithm.

    @param word     keyword to look for (lower-cased before matching)
    @param algo     "Regex", "KMP" or "BM"; any other value yields None,
                    matching the original's implicit fall-through
    @param filename source text file, converted to sentences by ConvertText
    @return list of ["Jumlah :...", "Waktu :...", "<sentence>(<filename>)"]
            triples, one per matching sentence, or None for unknown algo
    """
    sentL = ConvertText(filename)
    word = word.lower()
    # First sentence carries header lines: title / (blank) / file timestamp.
    awalan = re.split('\n', sentL[0])
    judul = awalan[0]
    waktufile = awalan[2]
    del sentL[0]
    sentL.insert(0, awalan[4])
    sentL.insert(0, judul)

    # The original's three branches were identical except for the matcher
    # call, so they are collapsed into a single pass over a selected matcher.
    if algo == "Regex":
        matcher = lambda s: REexact(word, s)
    elif algo == "KMP":
        matcher = lambda s: KMP(s, word)
    elif algo == "BM":
        matcher = lambda s: BM(s, word)
    else:
        return None  # unchanged: unknown algorithms produce no result

    hasil = []
    for s in sentL:
        t = s  # keep the untouched sentence for the report line
        awal = matcher(s)
        if awal != -1:
            waktu = REWaktu(s)
            if waktu == "None":
                # No time expression found: fall back to the file timestamp.
                waktu = waktufile
            else:
                # Strip the time expression before extracting the amount.
                s = re.sub(waktu, "", s)
            jumlah = REjumlah(awal, word, s)
            hasil.append([
                "Jumlah :" + jumlah,
                "Waktu :" + waktu,
                t + "(" + filename + ")",
            ])
    return hasil
# Encode texts with BERT into fixed-size stacked vectors.
from nltk import tokenize
import KMP
from bert_serving.client import BertClient
import numpy as np

# NOTE(review): `pre_deal` is used below but not imported in this view —
# presumably imported elsewhere or missing; confirm.
li = []  # accumulates one stacked vector (as nested lists) per text
bc = BertClient(ip='222.19.197.230', port=5555, port_out=5556, check_version=False)
labels_vector_dict, test_text = pre_deal.get_labels_vector()
zero_vector = np.zeros((500, 768))  # padding for single-segment encodings
for i in range(0, len(test_text)):
    x = tokenize.word_tokenize(test_text[i])
    if (len(x) > 502):
        # Find where tokens 500/501 occur in the raw text to split there.
        index = KMP.KMP_algorithm(test_text[i], x[500] + " " + x[501])
        if (index != -1):
            list = []  # NOTE(review): shadows the builtin `list`
            sentence_1 = test_text[i][0:index]
            sentence_2 = test_text[i][index:]
            list.append(sentence_1)
            list.append(sentence_2)
            vector = bc.encode(list)
            # Stack the two halves' encodings vertically.
            ve = np.concatenate((vector[0], vector[1]), axis=0)
            li.append(ve.tolist())
        else:
            # Split point not found in the raw text: encode whole, zero-pad.
            list = []
            list.append(test_text[i])
            vector = bc.encode(list)
            ve = np.concatenate((vector[0], zero_vector), axis=0)
            li.append(ve.tolist())
# NOTE(review): texts of <= 502 tokens are not appended in this view — the
# chunk likely continues past the visible end (compare the sibling
# create_1000_case_test, which has an outer else); confirm before relying
# on li's length.
def get_apperance_num(self, word):
    """Return how many times `word` occurs in this file's content."""
    content = self.get_content()
    return KMP.count(content, word)
# NOTE(review): this chunk starts mid-loop — `j`, `l1`, `labels_tag`,
# `list_labels`, `texts_token`, `test_text` and the output handle `f` are
# defined outside the visible region. The nesting below is reconstructed
# from the collapsed source; confirm against the original file.
labels_tag[list_labels[j][7:16]].append(min(l1))
labels_tag[list_labels[j][7:16]].append(max(l1))
print("----------------------")
if (min(l1) == max(l1) and min(l1) > 400):
    # Degenerate single-index span far into the text: take the tail.
    li = texts_token[j][min(l1):]
    print(texts_token[j][min(l1):])
else:
    li = texts_token[j][min(l1):max(l1)]
    print(texts_token[j][min(l1):max(l1)])
list2 = [str(i) for i in li]  # convert every element of the span to str
list3 = ' '.join(list2)  # join the elements into one space-separated string
if (list3 == ''):
    pass
else:
    # Locate the reconstructed span inside the original text.
    a = KMP.KMP_algorithm(test_text[j], list3)
    if (a == -1):
        # Full span not found: fall back to searching the first token only.
        list_gai = str(texts_token[j][min(l1)])
        a = KMP.KMP_algorithm(test_text[j], list_gai)
    # start position
    print(list_labels[j][7:16])
    print("值为:" + str(a))
    b = a + len(list3)  # end offset = start + span length
    print("结束值为:" + str(b))
    # str_1 = random.choice(list_tc)
    if (a == -1):
        # Search still failed: write a window ending at b, at most 100 wide.
        if (b > 100):
            f.write(list_labels[j][7:16] + '\t' + str(b - 100) + '\t' + str(b) + '\n')
        else:
            f.write(list_labels[j][7:16] + '\t' + str(0) + '\t' + str(b) + '\n')