def extractKalimat(filesOfBerita, keyword, method):
    """Collect every sentence that contains ``keyword`` from the given files.

    The module-level containers ``source``, ``datesResult``, ``numsResult``
    and ``sntcResult`` are emptied first.  Each file name is prefixed with
    ``"../test/"`` and read through ``extractBerita``; every sentence for
    which the matcher returns something other than -1 is passed to
    ``prosesKalimat`` together with the file name and the returned index.

    Args:
        filesOfBerita: iterable of file names relative to ``../test/``.
        keyword: pattern to look for in each sentence.
        method: ``"bm"`` selects ``BM.bmAlgorithm``; ``"kmp"`` and any other
            value select ``KMP.kmpAlgorithm``.
    """
    global source
    global datesResult
    global numsResult
    global sntcResult

    source.clear()
    datesResult.clear()
    numsResult.clear()
    sntcResult.clear()

    # NOTE(review): the fallback is meant for the "regex" method, yet it runs
    # the KMP matcher -- confirm whether a regex matcher should be used here.
    cari = BM.bmAlgorithm if method == "bm" else KMP.kmpAlgorithm

    for berita in filesOfBerita:
        for kalimat in extractBerita("../test/" + berita):
            posisi = cari(kalimat, keyword)
            if posisi != -1:
                prosesKalimat(kalimat, berita, posisi)
def matchWithMethod(method, text, pattern):
    """Search ``text`` for ``pattern`` with the matcher named by ``method``.

    ``'bm'`` dispatches to ``BM.bmAlgorithm``, ``'kmp'`` to
    ``KMP.kmpAlgorithm`` and every other value to ``regexp.regex``.  The
    chosen matcher's return value is passed through unchanged.
    """
    if method == 'bm':
        matcher = BM.bmAlgorithm
    elif method == 'kmp':
        matcher = KMP.kmpAlgorithm
    else:
        matcher = regexp.regex
    return matcher(text, pattern)
Exemplo n.º 3
0
def timeAnalysis(pat, txt, runs=10):
    """Print the average CPU time of three string-matching routines.

    On every run the naive, Rabin-Karp and KMP matchers are called one after
    another on the same ``(pat, txt)`` pair and timed with
    ``time.process_time``.  The accumulated time of each matcher is then
    divided by the number of runs, so the printed value is a true per-call
    average.

    Args:
        pat: pattern handed to each matcher.
        txt: text handed to each matcher.
        runs: number of timed repetitions (default 10).

    Raises:
        ValueError: if ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    matchers = (nv.naiveMatching, rk.rabin_karp_search, kmp.KMPSearch)
    totals = [0.0, 0.0, 0.0]

    for _ in range(runs):
        # Interleave the matchers inside each run so that all three are
        # measured under the same conditions.
        for slot, matcher in enumerate(matchers):
            start = time.process_time()
            matcher(pat, txt)
            totals[slot] += time.process_time() - start

    for label, total in zip(("Naive:", "Rabin:", "Knuth:"), totals):
        print(label + str(total / runs))
Exemplo n.º 4
0
 def test_overlap(self):
     """ Test the overlap function with an easy case
     """
     first = "abdc sdf "
     second = "sdf sabd"
     unrelated = "cc"

     # first followed by second: the shared part is read from the start of second
     shared = KMP.characters_overlapping(first, second)
     self.assertEqual(shared, 4)
     self.assertEqual(second[:shared], "sdf ")

     # second followed by first: the shared part is read from the start of first
     shared = KMP.characters_overlapping(second, first)
     self.assertEqual(shared, 3)
     self.assertEqual(first[:shared], "abd")

     # no characters in common at the junction
     shared = KMP.characters_overlapping(second, unrelated)
     self.assertEqual(shared, 0)
Exemplo n.º 5
0
def create_1000_case_test():
    """Encode every test text with the BERT service and save the vectors.

    Each text from ``pre_deal.get_test_textVector()`` is tokenized.  When it
    has more than 502 tokens, the text is searched for tokens 500 and 501
    joined by a space; if that is found, the text is cut at the returned
    index and both halves are encoded and concatenated.  In every other case
    the whole text is encoded and padded with a ``(500, 768)`` block of
    zeros.  The collected vectors are written to ``test_case_1000.npy``.
    """
    li = []
    bc = BertClient(ip='222.19.197.230', port=5555, port_out=5556, check_version=False)
    test_text = pre_deal.get_test_textVector()
    zero_vector = np.zeros((500, 768))

    for text in test_text:
        tokens = tokenize.word_tokenize(text)

        # -1 means "do not split": either the text is short enough or the
        # token pair marking the cut could not be located in the raw text.
        index = -1
        if len(tokens) > 502:
            index = KMP.KMP_algorithm(text, tokens[500] + " " + tokens[501])

        if index != -1:
            vector = bc.encode([text[0:index], text[index:]])
            ve = np.concatenate((vector[0], vector[1]), axis=0)
        else:
            vector = bc.encode([text])
            ve = np.concatenate((vector[0], zero_vector), axis=0)
        li.append(ve.tolist())

    li_vector = np.array(li)
    np.save("test_case_1000.npy", li_vector)
Exemplo n.º 6
0
def SpamDetector(tweet, keywords, chosenAlgo):
    """Flag ``tweet`` as spam when ``keywords`` is found in its full text.

    Args:
        tweet: mapping with a ``"full_text"`` entry; it is updated in place.
        keywords: pattern handed to the selected matcher.
        chosenAlgo: ``1`` uses ``KMP.KMPMatching``, ``2`` uses
            ``BM.BMMatching``, anything else uses ``RE.regex``.

    Returns:
        The same ``tweet`` object with its ``'IS_SPAM'`` entry set to a bool.
    """
    text = tweet["full_text"]
    if chosenAlgo == 1:
        is_spam = KMP.KMPMatching(text, keywords) != -1
    elif chosenAlgo == 2:
        is_spam = BM.BMMatching(text, keywords) != -1
    else:
        # The regex matcher signals "no match" with None, so test identity.
        is_spam = RE.regex(text, keywords) is not None
    tweet['IS_SPAM'] = is_spam
    return tweet
Exemplo n.º 7
0
 def test_find_overlap(self):
     """ Test the find/overlap function with an easy case
     """
     seq1 = "abdc sdf"
     seq2 = "dfke "
     seq3 = "dfs abd"
     seq4 = "dc s"
     seq5 = "pp"

     # (left, right, expected overlap count, expected position)
     cases = (
         (seq1, seq2, 2, 6),
         (seq3, seq1, 3, 4),
         (seq1, seq4, 4, 2),
         (seq1, seq5, 0, len(seq1)),
     )
     for left, right, expected_overlap, expected_position in cases:
         position, n_overlaps = KMP.find_or_overlap(left, right)
         self.assertEqual(n_overlaps, expected_overlap)
         self.assertEqual(position, expected_position)
Exemplo n.º 8
0
def search(files, keyword):
    """Count ``keyword`` in every file and rank the files by that count.

    Args:
        files: iterable of file paths to read.
        keyword: pattern handed to ``KMP.count``.

    Returns:
        A list of ``{"file": path, "num": count}`` dicts, highest count
        first; files with equal counts keep their input order.
    """
    result = []
    for path in files:
        # The context manager closes the file even if reading raises.
        with open(path, "r") as handle:
            content = handle.read()
        result.append({"file": path, "num": KMP.count(content, keyword)})

    result.sort(key=lambda entry: entry["num"], reverse=True)
    return result
Exemplo n.º 9
0
def doKMP():
    """Fetch tweets as described by the posted form and tag each one as spam.

    The ``'hasil'`` form field holds a JSON document with the spam indicator,
    the search type and the search arguments.  Search type ``"1"`` queries a
    user's timeline, anything else queries a region.  The fetched tweets get
    an ``'is_spam'`` entry with one ``pattern.is_match`` result per item of
    ``'full_text'`` and are returned as a JSON string.
    """
    decoder = json.JSONDecoder()
    payload = decoder.decode(request.form['hasil'])
    pattern = KMP(payload['spam_indicator'])

    if payload['search_type'] == "1":
        username = payload['username']
        tweets = api.search_timeline(username, payload['count'])
    else:
        region = payload['region']
        tweets = api.search_region(region, payload['count'])

    tweets['is_spam'] = [pattern.is_match(text) for text in tweets['full_text']]

    return json.dumps(tweets)
Exemplo n.º 10
0
def cal_words_positions(files):
    """Map every space-separated word of the given files to its positions.

    A word is looked up only the first time it is encountered, so its
    positions come from the first file that contains it.

    Args:
        files: iterable of file paths to read.

    Returns:
        A list of ``{"word": word, "pos": positions}`` dicts in order of first
        appearance, where ``positions`` is whatever ``KMP.positions`` returned.
    """
    result = {}
    for path in files:
        # The context manager closes the file even if reading raises.
        with open(path, "r") as handle:
            content = handle.read()
        for word in content.split(" "):
            if word not in result:
                result[word] = KMP.positions(content, word)

    return [{"word": word, "pos": pos} for word, pos in result.items()]
Exemplo n.º 11
0
def cal_words_freq(files, reverse=True):
    """Count the space-separated words of the given files and rank them.

    Words are de-duplicated by their lower-cased form: the first spelling
    encountered is the one counted with ``KMP.count`` and the result is stored
    under the lower-cased word.

    Args:
        files: iterable of file paths to read.
        reverse: sort from highest to lowest count when true (the default).

    Returns:
        A list of ``{"word": lower_cased_word, "num": count}`` dicts sorted by
        ``"num"``.
    """
    result = {}
    for path in files:
        # The context manager closes the file even if reading raises.
        with open(path, "r") as handle:
            content = handle.read()
        for word in content.split(" "):
            # Key on the lower-cased string itself (calling lower()), so the
            # membership test above the assignment can actually succeed.
            key = word.lower()
            if key not in result:
                result[key] = KMP.count(content, word)

    result2 = [{"word": key, "num": num} for key, num in result.items()]
    result2.sort(key=lambda entry: entry["num"], reverse=reverse)
    return result2
Exemplo n.º 12
0
def compute_overlaps(fragments_dict, mat):
    """ Compute the overlaps between the fragments and store them in a MatchManager
        @param fragments_dict A dictionary of fragments with their ID as key
        @param mat A MatchManager
    """
    log.debug("Computing overlaps")
    # Visit the keys from greatest to smallest. A greater key corresponds
    # (roughly) to a longer fragment; that is only guaranteed during the
    # first iteration.
    ordered_ids = sorted(fragments_dict)[::-1]
    for left_id, right_id in itertools.product(ordered_ids, ordered_ids):
        if left_id == right_id:
            # a fragment is never matched against itself
            continue
        left = fragments_dict[left_id]
        right = fragments_dict[right_id]
        if not mat.needs_calculation(left_id, right_id, len(left), len(right)):
            continue
        position, n_chars = KMP.find_or_overlap(left, right)
        log.debug("Overlap between sequences %s (left) and %s (right): %s", left_id, right_id, n_chars)
        mat.store(Match(left_id, right_id, position, n_chars))
Exemplo n.º 13
0
        def word_result(freqs, word):
            """Report, per file, how often and where ``word`` occurs.

            ``freqs`` is a sequence of ``(file, word_entries)`` pairs where each
            entry is a dict with ``"word"`` and ``"pos"`` keys.  A ``word``
            without spaces is looked up in those entries (the last matching
            entry wins); a ``word`` containing a space is searched for in the
            file's content with ``KMP.positions``.

            Returns a list of ``{"file", "num", "pos"}`` dicts sorted by
            ``"num"``, largest first.
            """
            is_phrase = " " in word
            rows = []
            for entry in freqs:
                hits = []
                if is_phrase:
                    hits = KMP.positions(
                        File.File(entry[0]).get_content(), word)
                else:
                    for indexed in entry[1]:
                        if indexed["word"] == word:
                            hits = indexed["pos"]
                rows.append({"file": entry[0], "num": len(hits), "pos": hits})

            rows.sort(key=lambda row: row["num"], reverse=True)
            return rows
Exemplo n.º 14
0
# Text file that every algorithm below searches.
fileName = 'book.txt'

# Pattern to look for.  Disabled alternatives: the literal "of", or the
# contents of a paragraph file:
#pattern = reader.readFile("Paragraphs/para2.txt")
# Other paragraph files that can be read the same way:
#reader.readFile("Paragraphs/para0.txt")
#reader.readFile("Paragraphs/para1.txt")
#reader.readFile("Paragraphs/para2.txt")
pattern = "his"

# Load the file's content once; all searches share the same string.
s = reader.readFile(fileName)

#print("\t==> Text :: %s" %s)
print("\t==> Pattern :: %s" % pattern)
print("\t==> Pattern Length :: %d" % len(pattern))

# Run each matcher in turn on the same text and pattern.
Bruteforce.bruteForce(s, pattern)    # brute force
Sunday.sunday(s, pattern)            # Sunday
KMP.KMP(s, pattern)                  # Knuth-Morris-Pratt
FSM.FSM(s, pattern)                  # finite state machine
RabinKarp.rabinKarp(s, pattern)      # Rabin-Karp
Exemplo n.º 15
0
import BF
import KMP

if __name__ == "__main__":
    # Read the whole text up front; the file is closed as soon as the
    # ``with`` block ends, whether or not reading succeeded.
    with open("test.txt") as text_file:
        text_string = text_file.read()
    pat_string = "bacbababadababacambabacaddababacasdsd"

    # Print the KMP result, a separator line, then the brute-force result.
    kmp_result = KMP.kmp_search(text_string, pat_string)
    print(kmp_result)

    print("*" * 50)

    bf_result = BF.bf_search(text_string, pat_string)
    print(bf_result)
Exemplo n.º 16
0
def match(word, algo, filename):
    """Find the sentences of a converted text that contain ``word``.

    ``ConvertText(filename)`` supplies the sentence list.  Its first entry is
    split on newlines: line 0 is taken as the title, line 2 as the file-level
    time and line 4 as the first sentence.  The title and that sentence
    replace the first entry before the search starts.

    For every sentence in which the selected matcher reports a hit, the time
    found by ``REWaktu`` (or the file-level time when it returns ``"None"``)
    and the amount found by ``REjumlah`` are reported.  The time text is
    removed from the sentence before ``REjumlah`` runs.

    Args:
        word: search term; lower-cased before matching.
        algo: ``"Regex"``, ``"KMP"`` or ``"BM"``.
        filename: passed to ``ConvertText`` and appended to each reported
            sentence.

    Returns:
        A list with one ``[amount_line, time_line, sentence_line]`` entry per
        matching sentence, or ``None`` when ``algo`` is none of the three
        supported names.
    """
    sentL = ConvertText(filename)
    word = word.lower()
    awalan = re.split('\n', sentL[0])
    judul = awalan[0]
    waktufile = awalan[2]
    del sentL[0]
    sentL.insert(0, awalan[4])
    sentL.insert(0, judul)

    # The three algorithms differ only in how the hit index is computed.
    finders = {
        "Regex": lambda kalimat: REexact(word, kalimat),
        "KMP": lambda kalimat: KMP(kalimat, word),
        "BM": lambda kalimat: BM(kalimat, word),
    }
    find = finders.get(algo)
    if find is None:
        return None

    hasil = []
    for kalimat in sentL:
        awal = find(kalimat)
        if awal == -1:
            continue

        waktu = REWaktu(kalimat)
        if waktu == "None":
            waktu = waktufile
            tanpa_waktu = kalimat
        else:
            # Escape the time text so characters such as "." or "(" are
            # removed literally instead of being read as regex syntax.
            tanpa_waktu = re.sub(re.escape(waktu), "", kalimat)
        jumlah = REjumlah(awal, word, tanpa_waktu)

        hasil.append([
            "Jumlah    :" + jumlah,
            "Waktu     :" + waktu,
            kalimat + "(" + filename + ")",
        ])
    return hasil
Exemplo n.º 17
0
from nltk import tokenize
import KMP
from bert_serving.client import BertClient
import numpy as np

# Encoded vectors collected for every processed text.
li = []
bc = BertClient(ip='222.19.197.230',
                port=5555,
                port_out=5556,
                check_version=False)
labels_vector_dict, test_text = pre_deal.get_labels_vector()
# Padding block concatenated to texts that are encoded as a single piece.
zero_vector = np.zeros((500, 768))
for i in range(0, len(test_text)):
    x = tokenize.word_tokenize(test_text[i])
    # Only texts with more than 502 tokens are handled in this loop.
    if len(x) > 502:
        # Locate tokens 500 and 501 (joined by a space) in the raw text to
        # decide where to cut it.
        index = KMP.KMP_algorithm(test_text[i], x[500] + " " + x[501])
        if index != -1:
            # Cut the text at the hit and encode both halves.
            sentences = [test_text[i][0:index], test_text[i][index:]]
            vector = bc.encode(sentences)
            ve = np.concatenate((vector[0], vector[1]), axis=0)
        else:
            # No cut point found: encode the whole text and pad with zeros.
            sentences = [test_text[i]]
            vector = bc.encode(sentences)
            ve = np.concatenate((vector[0], zero_vector), axis=0)
        li.append(ve.tolist())
Exemplo n.º 18
0
 def get_apperance_num(self, word):
     """Return the result of ``KMP.count`` for ``word`` over this object's content."""
     content = self.get_content()
     return KMP.count(content, word)
Exemplo n.º 19
0
        labels_tag[list_labels[j][7:16]].append(min(l1))
        labels_tag[list_labels[j][7:16]].append(max(l1))
        print("----------------------")
        if (min(l1) == max(l1) and min(l1) > 400):
            li = texts_token[j][min(l1):]
            print(texts_token[j][min(l1):])
        else:
            li = texts_token[j][min(l1):max(l1)]
            print(texts_token[j][min(l1):max(l1)])

        list2 = [str(i) for i in li]  # 使用列表推导式把列表中的单个元素全部转化为str类型
        list3 = ' '.join(list2)  # 把列表中的元素放在空串中,元素间用空格隔开
        if (list3 == ''):
            pass
        else:
            a = KMP.KMP_algorithm(test_text[j], list3)
            if (a == -1):
                list_gai = str(texts_token[j][min(l1)])
                a = KMP.KMP_algorithm(test_text[j], list_gai)  # 开始位置
                print(list_labels[j][7:16])
                print("值为:" + str(a))
                b = a + len(list3)
                print("结束值为:" + str(b))
                # str_1 = random.choice(list_tc)
                if (a == -1):
                    if (b > 100):
                        f.write(list_labels[j][7:16] + '\t' + str(b - 100) +
                                '\t' + str(b) + '\n')
                    else:
                        f.write(list_labels[j][7:16] + '\t' + str(0) + '\t' +
                                str(b) + '\n')