Example #1
    def ngram_mro(self):
        graph = NNCommonManager().get_nn_node_name(self.nn_id)
        for net in graph:
            if net['fields']['graph_node'] == 'netconf_data':
                data_node = net['fields']['graph_node_name']
            elif net['fields']['graph_node'] == 'netconf_node':
                net_node = net['fields']['graph_node_name']

        netconf = NNCommonManager().get_nn_node_info(self.nn_id, str(
            self.ver), net_node)[0]['fields']['node_config_data']

        self.param['list'] = []
        self.param['standard'] = float(netconf['standard'])
        self.param['datatype'] = netconf['datatype']
        self.param['conninfo'] = netconf['conninfo']

        if self.param['datatype'] == 'file':
            self.get_file_data(data_node)
        elif self.param['datatype'] == 'db':
            self.get_db_data()

        item = []
        for val in self.param['list']:
            try:
                item_tuple = (val['item_code'].strip(),
                              val['item_leaf'].strip(),
                              val['item_desc'].strip())
                item.append(item_tuple)
            except Exception:
                logging.info('Error Data: ' + str(val.get('item_code', '')))

        dataset = ngram.NGram(item, key=lambda x: x[2])
        dataset = sorted(dataset, key=lambda x: x[0])
        findset = ngram.NGram(item, key=lambda x: x[2])

        logging.info(
            '================================================================================================'
        )
        return_data = {}
        for data in dataset:
            findset.remove(data)
            result = findset.search(data[2], self.param['standard'])

            for r in range(len(result)):
                if return_data.get(data[0]) is None:
                    return_data[data[0]] = {}
                    return_data[data[0]]['desc'] = data[2]
                    # logging.info(str(data[0]) + ':' + str(data[2]))
                return_data[data[0]][result[r][0][0]] = {
                    'item_desc': result[r][0][2],
                    'item_perc': result[r][1]
                }
                # logging.info(' - '+str(result[r][0][0])+'('+str(result[r][1])+')' + ':' + str(result[r][0][2]))
                logging.info(
                    str(data[0]) + '-' + str(result[r][0][0]) + '(' +
                    str(result[r][1]) + ')')

        return return_data
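
For reference, a minimal standalone sketch of the ngram-package calls this method relies on (the item tuples below are made up): an index built with a key= extractor compares items by their description, and search() returns (item, similarity) pairs sorted by decreasing similarity, keeping only sufficiently similar items.

import ngram

# hypothetical (item_code, item_leaf, item_desc) tuples, mirroring the ones built above
items = [('A1', 'Y', 'stainless steel bolt'),
         ('A2', 'Y', 'stainless steel nut'),
         ('B7', 'N', 'copper washer')]

index = ngram.NGram(items, key=lambda x: x[2])   # compare items by their description field
for item, score in index.search('stainless bolt', 0.2):
    print(item[0], round(score, 3))              # most similar item codes first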
Example #2
def main():
    words = get_words('E:/DataSet/NLP/中文分词语料(山西大学提供)/' +
                      '训练语料(528250词,Unicode格式).txt')
    seg_train = get_seg_sentences('E:/DataSet/NLP/中文分词语料(山西大学提供)/' +
                                  '训练语料(528250词,Unicode格式).txt')
    seg_test = get_sentences('E:/DataSet/NLP/中文分词语料(山西大学提供)/' +
                             '测试语料(Unicode格式).txt')
    seg_test_answer = get_seg_sentences('E:/DataSet/NLP/中文分词语料(山西大学提供)/' +
                                        '测试语料答案(Unicode格式).txt')
    pos_train = get_pos_sentences(
        'E:/DataSet/NLP/人民日报语料199801/' + '199801.txt', 'ansi')
    pos_test = []

    # sentences in the POS-tagging corpus also serve as training data for segmentation
    for data in pos_train:
        pos_test.append([dat[0] for dat in data])
        seg_train.append([dat[0] for dat in data])
        for dat in data:
            words.add(dat[0])

    print('forward maximum match:')  # forward maximum matching segmentation
    print_ratio(seg_test_answer, mm.fmm(words, seg_test))

    print('\nbackward maximum match:')  # backward maximum matching segmentation
    print_ratio(seg_test_answer, mm.bmm(words, seg_test))

    print('\nshortest path segmentation:')  # shortest-path segmentation
    print_ratio(seg_test_answer, sp.divide(words, seg_test))

    print('\n2-gram:')  # bigram-model segmentation and POS tagging
    gram2 = ngram.NGram(seg_train, pos_train, n=2)
    print_ratio(seg_test_answer, gram2.seg(seg_test))
    print('pos correct ratio:\t',
          get_pos_correct_ratio(pos_train[:200], gram2.pos(pos_test[:300])))

    print('\n3-gram:')  # trigram-model segmentation and POS tagging
    gram3 = ngram.NGram(seg_train, pos_train, n=3)
    print_ratio(seg_test_answer, gram3.seg(seg_test))
    print('pos correct ratio:\t',
          get_pos_correct_ratio(pos_train[:200], gram3.pos(pos_test[:300])))

    print('\n4-gram:')  # 4-gram-model segmentation and POS tagging
    gram4 = ngram.NGram(seg_train, pos_train, n=4)
    print_ratio(seg_test_answer, gram4.seg(seg_test))
    print('pos correct ratio:\t',
          get_pos_correct_ratio(pos_train[:200], gram4.pos(pos_test[:300])))

    # a few test cases
    print('a few test cases:')
    print(gram3.seg(['大连港年吞吐量超七千万吨', '今天同事问了我一道面试题']))
    print(
        gram3.pos([[
            '迈向', '充满', '希望', '的', '新', '世纪', '——', '一九九八年', '新年', '讲话', '(',
            '附', '图片', '1', '张', ')'
        ], ['希望', '是', '什么', '东西']]))
Example #3
def learn():

    Dict = []
    num = 1000
    #In the future, a language detector can be used to reduce the amount of data load in memory
    Dict.append(ngram.NGram(readRealNames('english')))
    Dict.append(ngram.NGram(readForeignRealNames('spanish')))
    Dict.append(ngram.NGram(readSyntheticNames('english')))
    Dict.append(ngram.NGram(readForeignSyntheticNames('spanish')))
    #Dict.append(ngram.NGram(readForeignScrappedNames('english')))
    #Dict.append(ngram.NGram(readForeignScrappedNames('spanish')))

    return Dict
Example #4
    def getNGrams(self, domain):

        uni_index = ngram.NGram(N=1)
        bi_index = ngram.NGram(N=2)
        tri_index = ngram.NGram(N=3)
        quad_index = ngram.NGram(N=4)

        unigrams = list(uni_index.ngrams(domain))
        bigrams = list(bi_index.ngrams(domain))
        trigrams = list(tri_index.ngrams(domain))
        quadgrams = list(quad_index.ngrams(domain))

        return unigrams, bigrams, trigrams, quadgrams
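
A quick illustration of what ngrams() yields here (the domain string is made up); the n-grams are successive character slices with no padding applied, which is consistent with how example #15 below counts them:

import ngram

bi_index = ngram.NGram(N=2)
print(list(bi_index.ngrams('paypal')))
# ['pa', 'ay', 'yp', 'pa', 'al']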
Example #5
    def __init__(self, dictionary, N, match_threshold = 1.0):
        """ match threshold from 0.0 - 1.0, where higher value indicates closer n gram distance
            to be considered as a possible match
        """
        GenericAlgorithm.__init__(self, dictionary)
        #self.nGram = ngram.NGram([], key=lambda x:x.lower(), N=N)
        self.n = N
        self.nGramDict = {}
        self.threshold = match_threshold
        self.name = "NGram Distance"

        # save words in a dictionary of NGram lists based on length of the word
        for word in dictionary:
            key = str(len(word))

            if key not in self.nGramDict:
                self.nGramDict[key] = ngram.NGram(key=lambda x: x.lower(), N=self.n)
            self.nGramDict[key].add(word)

        self.match_threshold = match_threshold
        tFinish = int(time.time() * 1000)
        self.stats['runtime'] += tFinish - self.timeStart
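
A rough standalone sketch of how this length-bucketed structure behaves (the variable names and words here are hypothetical, not part of the original class): only candidates with the same length as the query are searched.

import ngram

words = ['apple', 'ample', 'maple', 'apply', 'banana']

# one NGram index per word length, as built in __init__ above
buckets = {}
for word in words:
    key = str(len(word))
    if key not in buckets:
        buckets[key] = ngram.NGram(key=lambda x: x.lower(), N=2)
    buckets[key].add(word)

query = 'appel'
bucket = buckets.get(str(len(query)))
if bucket is not None:
    for candidate, score in bucket.search(query, 0.25):
        print(candidate, round(score, 3))   # matches drawn only from the 5-letter bucket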
Example #6
def bleu(answer, references, theta = 0.0001):
    
    references_grams = []
    result = []
    for reff in references :
        G = ngram.NGram(tokenizer(reff))
        _, expp = M_bleu(answer, [G])
        result.append(expp)
        
    maxScore = max(result)
    maxIndex =  np.argmax(result)
    refMax = []
    for index, refs in enumerate(result) :
        if abs(refs - maxScore) < theta :
            refLength = len(tokenizer(references[index]))
            candLength = len(tokenizer(answer))
            refMax.append((refs, index, brevity_penalty(candLength, refLength)))
            
    # Find the selected reference answer 
    SRA_m_bleu = 0
    SRA_bp = 0  
    SRA_index = 0      
    for (m_bleu, index, bp) in refMax:
        if bp > SRA_bp :
            SRA_bp = bp
            SRA_index = index
            SRA_m_bleu = m_bleu 
    #print((SRA_m_bleu, SRA_bp, SRA_index))
    
    #return maxScore, maxIndex, refMax
    return SRA_m_bleu, SRA_bp, SRA_index
Example #7
def match_levenshtein(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    bestMatch = ""
    minDistance = 3

    for item in dictSet:
        distance = Levenshtein.distance(token.lower(), item.lower())
        if distance == 0:
            return item, [], []
        elif distance < minDistance:
            minDistance = distance
            candidates = []
        if distance == minDistance:
            candidates.append(item.lower())

    if len(candidates) > 1:
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]

    return bestMatch, candidates, candidatesG
Example #8
def generate_file(file_name, output_file_name):
    sheet_input = open_excel(file_name=file_name)
    end_pos = 10000
    faq_data = dict()
    standard_question_idx = 0
    standard_question_to_id = dict()
    for i in range(2, end_pos):
        if sheet_input.cell(i, 1).value is None and sheet_input.cell(i, 2).value is None:
            break

        standard_question = replace_str(sheet_input.cell(i, 4).value)
        if standard_question is None or sheet_input.cell(i, 5).value is None:
            continue
        if standard_question not in standard_question_to_id:
            standard_question_to_id[standard_question] = standard_question_idx
            standard_question_idx += 1
        if standard_question not in faq_data:
            faq_data[standard_question] = []
        sim_questions = sheet_input.cell(i, 5).value.split('\n')
        sim_questions.append(standard_question)
        faq_data[standard_question].extend(
            list(set(
                map(replace_str, filter(lambda x: x != "", sim_questions)))))

    print(len(faq_data))
    faq_keys = dict()
    for i, (k, vs) in enumerate(faq_data.items()):
        for v in vs:
            faq_keys[v] = standard_question_to_id[k]
    k = 30
    total_groups = len(faq_data)
    training_data = []
    for i, key in enumerate(list(faq_data.keys())):
        if i % 1 == 0:
            print("{}/{}".format(i, total_groups))
        label = standard_question_to_id[key]
        questions = faq_data[key]
        corpus_list = []
        for items in filter(lambda x: x[0] != key, faq_data.items()):
            corpus_list.extend(items[1])
        if questions[0] in corpus_list:
            print("impossible")
        G = ngram.NGram(corpus_list, N=2)
        cache = dict()
        for positive_question, anchor_question in combinations(questions, 2):
            if anchor_question in cache:
                negative_questions = cache[anchor_question]
            else:
                negative_questions = G.search(anchor_question, threshold=0.1)
                cache[anchor_question] = negative_questions
            ans_len = min(len(negative_questions), k)
            # use a separate loop variable so the outer progress index `i` is not clobbered
            for j in range(ans_len):
                negative_question = negative_questions[j][0]
                training_data.append(
                    "{} {} {}\t{}\t{}\t{}".format(label, label, faq_keys[negative_question], positive_question,
                                                  anchor_question, negative_question))
    with open(output_file_name, "w", encoding="utf-8") as fw:
        for value in training_data:
            fw.write(value)
            fw.write("\r\n")
Example #9
def query_search_test():
    print 'generate queries in train set...'
    sys.stdout.flush()

    train_queries = []
    with open(train_click_log, 'r') as f:
        for line in f:
            arr = line.strip().split('\t')
            query = arr[1].strip()
            train_queries.append(query)

    print 'generate queries in dev set...'
    sys.stdout.flush()

    dev_queries = []
    with open(dev_label, 'r') as f:
        for line in f:
            arr = line.strip().split('\t')
            query = arr[0].strip()
            dev_queries.append(query)

    print 'conduct a search...'
    sys.stdout.flush()

    G = ngram.NGram(train_queries)
    print G.search(dev_queries[0], threshold=0.3)
Example #10
def partial_words_overlap(answer, thresh=0.7):
    import ngram
    # inspiration -> semsim : a multi-feature approach to semantic
    # text similarity Adebayo Guidi Boella
    # avg 24
    facit = "know amount vinegar container type sort cup material long time minute temperature experiment sample rinse distilled water size surface area drying method"
    len_answer = len(tokenizer(answer))
    answer = remove_shit(answer)
    answer = remove_stops(tokenizer(answer))

    answer = tokenizer(answer)
    #print("Clean answer : ", answer)
    facit = tokenizer(facit)

    len_answer_tokenized = len(answer)
    len_facit = len(facit)

    G = ngram.NGram(answer)
    cand = []
    match_count = 0  # answer-key words that have at least one n-gram match in the answer
    for word in facit:
        candidate = G.search(word, threshold=thresh)
        if len(candidate) > 0:
            cand.append(candidate)
            match_count = match_count + 1

    #return cand, match_count, match_count / (24 + len_facit), match_count / (len_answer_tokenized + len_facit)
    return match_count / (24 + len_facit), match_count / (len_answer_tokenized + len_facit)
Example #11
    def __init__(self, robot_code, version, interpreter, _nlu_data_path=None):
        self.interpreter = interpreter
        self.version = version
        self.robot_code = robot_code
        if not _nlu_data_path:
            nlu_data_path = get_nlu_data_path(robot_code, version)
        else:
            nlu_data_path = _nlu_data_path
        with open(nlu_data_path, "r") as f:
            raw_training_data = json.load(f)
        regx = raw_training_data['regex_features']
        examples = raw_training_data["rasa_nlu_data"]["common_examples"]
        intent = {}
        for example in examples:
            if example["intent"] not in intent:
                intent[example["intent"]] = [example["text"]]
            else:
                intent[example["intent"]].append(example["text"])
        self.intent = intent
        self.intent_matcher = {intent_id: ngram.NGram(
            examples) for intent_id, examples in self.intent.items()}

        self.regx = {key: [re.compile(item) for item in value]
                     for key, value in regx.items()}
        self.key_words = raw_training_data['key_words']
        self.intent_rules = raw_training_data['intent_rules']
        self.intent_id2name = raw_training_data.get("intent_id2name", {})
Example #12
def corpus_based_sim(answer, floor=0.8):
    import ngram

    #print(read_corp())

    facit = "need know amount vinegar container type material time temperature experiment sample rinse distilled water size surface area drying method"

    answer = remove_shit(answer)
    answer = remove_stops(tokenizer(answer))
    answer = tokenizer(answer)
    G = ngram.NGram(answer)

    #print("Clean answer : ", answer)
    facit = tokenizer(facit)
    count = 0
    for word in facit:
        candidate = G.search(word, threshold=floor)
        if len(candidate) > 0:
            count = count + 1
        else:
            syn = read_corp(word)
            for s in syn:
                candidate = G.search(s, threshold=floor)
                if len(candidate) > 0:
                    count = count + 1
    #print("count = ", count)
    sim1 = count / max(len(facit), 24)
    sim2 = count / max(len(facit), len(answer))
    return sim1, sim2
Example #13
def to_ngrams(n, string):
    '''
    Convert string to sequence of ngrams.
    '''
    # TODO: need a LOWERCASE preprocessing step?  Maybe in the Sample class?
    index = ngram.NGram(pad_len=(n - 1), N=n)
    ngrams = tuple(index.split(string))
    return ngrams
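
A quick look at the padding this configures (pad_len=n-1 with the package's default pad_char '$'; the bigram listing assumes split() yields the n-grams of the padded string):

import ngram

index = ngram.NGram(pad_len=1, N=2)
print(index.pad('ham'))    # '$ham$'
# the padded bigrams would then be: '$h', 'ha', 'am', 'm$'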
Example #14
def ngramed_list(lst: list, n: int = 3) -> list:
    """
    Convert a list into n-grams.
    :param lst: list to convert
    :param n: N (default N = 3)
    :return: list of n-grams
    """
    index = ngram.NGram(N=n)
    return [term for term in index.ngrams(lst)]
Example #15
def number_of_bigrams(n, r=2):
    '''
    Returns the number of n-grams for a string of length n.
    :param n: length of the string
    :param r: n-gram size
    :return: the n-gram count
    '''
    index = ngram.NGram(N=r)
    return len(list(index.ngrams(index.pad(str('a' * n)))))
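
A quick sanity check of the count (assuming the package defaults of pad_len=N-1 and pad_char='$'): a padded string of length n has n + 2*(N-1) characters and therefore n + N - 1 n-grams, which for bigrams is n + 1.

print(number_of_bigrams(3))        # 4 bigrams of '$aaa$': '$a', 'aa', 'aa', 'a$'
print(number_of_bigrams(5, r=3))   # 7 trigrams of '$$aaaaa$$'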
Example #16
    def street_to_numbers_ngrams(self):
        """ Returns mapping from lower case street name to possible number's ngrams """
        str_to_addr_ngram = defaultdict(lambda: NOT_FOUND_)
        street_groups = self.addr_book.groupby(by=STR_LC, sort=False, group_keys=True)
        for street_name, group in street_groups:
            street_addresses = group[BUILD_LC] + '$' + group[APT_LC]
            st_ngr = ngram.NGram(street_addresses.tolist(), N=3)
            str_to_addr_ngram[street_name] = st_ngr
        return str_to_addr_ngram
Example #17
def break_ties(misspell, matches):
    G = ngram.NGram(matches)
    filtered = groupby(G.search(misspell), lambda x: x[1])
    filtered_matches = []
    for score, group in islice(filtered, 0, 1):
        for word in group:
            filtered_matches.append(word[0])

    return sorted(filtered_matches)
Example #18
File: calcMH.py  Project: thiagonobrega/BAP
def run(slicer, slice, first, mhperm, rowsize, encrypt_flag, bf_size,
        bigrams_flag):

    mhs = []

    for i in range(0, rowsize):
        mh = MinHash(num_perm=mhperm)
        mhs.append(mh)

    sdata = StringIO(slicer.read(slice))

    reader = csv.reader(sdata,
                        delimiter=',',
                        quotechar='"',
                        quoting=csv.QUOTE_ALL,
                        skipinitialspace=True)
    #reader = csv.reader(sdata,delimiter=',',quotechar='"',quoting=csv.QUOTE_NONNUMERIC, skipinitialspace=True)

    if (first):
        next(reader, None)

    dics = []
    for i in range(0, rowsize):
        dics.append({})

    for row in reader:
        row_size = len(row)
        if (rowsize == row_size):
            for column in range(0, row_size):
                mh = mhs[column]
                local_data = ''
                if encrypt_flag:
                    # TODO: fix the Bloom filters
                    if bigrams_flag:
                        bf = encrypt.encryptDataBigram(str(row[column]),
                                                       bf_size)
                    else:
                        bf = encrypt.encryptDataWords(str(row[column]),
                                                      bf_size)
                    local_data = str(bf)
                    pass
                else:
                    # review this, possible problem (not used)
                    import ngram
                    index = ngram.NGram(N=2)
                    bigrams_list = list(
                        index.ngrams(index.pad(str(row[column]))))
                    local_data = str(row[column])

                    if bigrams_flag:
                        for bigram in bigrams_list:
                            mh.update(bigram.encode('utf8'))

                mh.update(local_data.encode('utf8'))

    #return data_stats
    return mhs
Example #19
def ngramed(lis):
    n = 3
    index = ngram.NGram(N=n)

    ngram_return = []
    for term in index.ngrams(lis):
        ngram_return.append(term)

    return ngram_return
Example #20
    def __init__(self):
        self.PlurarShapes = open('Dictionaries\\PlurarShape.txt',
                                 'r',
                                 encoding="utf8").read().split('\n')
        self.Shapes = open('Dictionaries\\ShapesTerms.txt',
                           'r',
                           encoding="utf8").read().split('\n')
        self.geo = open('Dictionaries\\GeometryTerms.txt',
                        'r',
                        encoding="utf8").read().split('\n')
        self.claim = open('Dictionaries\\ClaimTerms.txt', 'r',
                          encoding="utf8").read().split('\n')
        self.conclusion = open('Dictionaries\\ConclusionTerms.txt',
                               'r',
                               encoding="utf8").read().split('\n')
        self.structure = open('Dictionaries\\TaskStructure.txt',
                              'r',
                              encoding="utf8").read().split('\n')
        self.software = open('Dictionaries\\SoftwareUsage.txt',
                             'r',
                             encoding="utf8").read().split('\n')
        self.nonMath = open('Dictionaries\\NMDTerms.txt', 'r',
                            encoding="utf8").read().split('\n')
        self.tech = open('Dictionaries\\TechTerms.txt', 'r',
                         encoding="utf8").read().split('\n')
        self.negTech = open('Dictionaries\\NegativeTechTerms.txt',
                            'r',
                            encoding="utf8").read().split('\n')
        self.shorts = open('Dictionaries\\ShortTerms.txt',
                           'r',
                           encoding="utf8").read().split('\n')
        self.context = open('Dictionaries\\ContextTerms.txt',
                            'r',
                            encoding="utf8").read().split('\n')
        self.stopWords = open('Dictionaries\\StopWords.txt',
                              'r',
                              encoding="utf8").read().split('\n')

        self.tecSentences = open('simSentences\\tec.txt', 'r',
                                 encoding="utf8").read().split('\n')
        self.dsSentences = open('simSentences\\ds.txt', 'r',
                                encoding="utf8").read().split('\n')
        self.gTec = ngram.NGram(self.tecSentences)
        self.gDs = ngram.NGram(self.dsSentences)
Example #21
def generate_ngram_model(listoffilenames, N):
    ngram_model = ngram.NGram(N=N)

    for filename in listoffilenames:
        with open(filename, "r") as text_file:
            for domain_name in text_file:
                second_level_domain = domain_name.split(".")[0]
                ngram_model.add(second_level_domain)

    return ngram_model
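
A sketch of how the returned model might be queried (the file name and query string below are hypothetical): find() gives the single best-matching indexed domain (or None), while search() returns every match above the threshold as (domain, similarity) pairs.

model = generate_ngram_model(['benign_domains.txt'], N=3)   # hypothetical input file
print(model.find('gooogle'))                                # closest known second-level domain, or None
print(model.search('gooogle', 0.3))                         # [(domain, similarity), ...] above 0.3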
Example #22
def motif(sequences, L_Left, L_UP):
    len_motifs_sup = {}
    for i in xrange(L_Left, L_UP + 1):
        motif = Counter()
        G = ngram.NGram(N=i)
        for x in sequences:
            motif.update(G.ngrams(x))
        len_motifs_sup[i] = motif
    #print len_motifs_sup
    return len_motifs_sup
Example #23
    def top_k_similarity(self, input_text, candidates, k=1):
        #print(list(zip(range(len(candidates)), candidates)))
        ngram_index = ngram.NGram(zip(range(len(candidates)), candidates),
                                  N=2,
                                  key=lambda x: x[1])
        ret = ngram_index.search(input_text)
        ret_list = [(v[0], score) for v, score in ret[:k]]
        if len(ret_list) == 0:
            ret_list = [(0, 0)]
        return ret_list
Example #24
def master_process(comm, misspellfilepath, correctfilepath, dictpath, size, start_time):
    #count how many words to be corrected
    count = 0
    print 'start'
    n = 2
    #send gram list of dictionary to slave_process
    with open(dictpath, 'r') as f:
        lines = f.readlines()
        for i in range(len(lines)):
            lines[i] = lines[i].strip()
        G = ngram.NGram(lines, N=n)

    for i in range(size-1):
        comm.send(G, dest=(i % (size - 1) + 1), tag=(i % (size - 1) + 1))

    #send misspelled words and correct words to slave_process
    with open(misspellfilepath) as mf:
        with open(correctfilepath) as cf:
            for mline in mf:
                mline= mline.strip()
                cline = cf.readline().strip()
                count = count + 1
                comm.send(mline, dest=(count % (size - 1) + 1), tag=(count % (size - 1) + 1))
                comm.send(cline, dest=(count % (size - 1) + 1), tag=(count % (size - 1) + 1))
    print 'all words sent'
    
    #ask slave_process to exit
    for i in range(size-1):
        comm.send('exit', dest=(i % (size - 1) + 1), tag=(i % (size - 1) + 1))
        comm.send('exit', dest=(i % (size - 1) + 1), tag=(i % (size - 1) + 1))

    #counters accumulated from the slave_process results
    #if one single prediction is correct, count
    accuracycount = 0.0
    #if correct word is in the list of predictions, count
    recallcount = 0.0
    #count the number of predictions
    predictcount = 0.0

    for i in range(size-1):
        #receive the (accuracy, recall, prediction) counts from slave_process
        counts = comm.recv(source=(i+1), tag=(i+1))
        accuracycount = accuracycount + counts[0]
        recallcount = recallcount + counts[1]
        predictcount = predictcount + counts[2]

    #print results
    print str(n)+'-gram on '+str(size)+' cores'
    print 'accuracy: '+str(accuracycount/count)
    print 'precision: '+str(recallcount/predictcount)
    print 'recall: '+str(recallcount/count)
    print 'predict: '+str(predictcount)
    end_time = time.time()
    print 'time spent: '+str(end_time-start_time)+'s'
Example #25
def main():
    dictFp = open(sys.argv[1], 'r')
    inputFp = open(sys.argv[2], 'r')

    customN = int(sys.argv[3])
    threshold = float(sys.argv[4])

    d = []
    #print("Dictionary Loading")
    for line in dictFp:
        d.append(line.strip())
    D = ngram.NGram(d, N=customN)
    #print("Dictionary Loaded")

    outputDict = defaultdict(list)

    counter = 0
    for line in inputFp:
        print("#{}\t{}".format(counter, line), file=sys.stderr, end="")
        try:
            uid, tid, tweet, _ = line.split('\t')
        except ValueError:
            # skip lines that do not have exactly four tab-separated fields
            continue
        words = tweet.split()

        for word in words:
            if (len(word) < 4):
                continue

            res = D.search(word, threshold=threshold)

            valid = True
            for found, prob in res:
                #if(found != word):
                #output.append(found)
                if (found == word):
                    valid = False
                    break

            if (len(res) and valid):  #and len(output)):
                res = ""
                #for w in output:
                #    res += (w + " ")
                #res += ("\t" + tid)
                print("> Identified {} as typo".format(word), file=sys.stderr)
                outputDict[word].append(tid)

        counter += 1

    for (k, v) in outputDict.items():
        print("{}\t".format(k), end="")
        for tid in v:
            print(tid, end=" ")
        print("")
Example #26
def ssdeep_to_int_ngram(ssdeep_hash):
    G = ngram.NGram(N=7, pad_len=0)
    ssdeep_7grams = []

    ssdeep_parts = ssdeep_hash.split(":")
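    # an ssdeep hash has the form "blocksize:chunk:double_chunk";
    # ssdeep_parts[1] and ssdeep_parts[2] are the two chunk signatures n-grammed below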

    for seven_gram in G.split(ssdeep_parts[1]):
        ssdeep_7grams.append(seven_gram_to_int(seven_gram))
    for seven_gram in G.split(ssdeep_parts[2]):
        ssdeep_7grams.append(seven_gram_to_int(seven_gram))

    return ssdeep_7grams
Example #27
def apply_ngram(misspell, dictionary):
    G = ngram.NGram(dictionary)
    count = 0
    result = []

    for mis_word in misspell:
        if mis_word not in dictionary:
            if '/' not in mis_word:
                """search a list of approximate words using ngram search"""

                pred_words = []
                if G.search(mis_word, threshold=0.4):
                    search_result = G.search(mis_word, threshold=0.4)

                    try:
                        search_result[0][1]
                    except IndexError:
                        search_result = (mis_word, 1)
                    else:
                        highest_score = search_result[0][1]

                    for (w, s) in search_result:
                        if math.isclose(s, highest_score):
                            pred_words.append(w)

                    if len(pred_words) == 1:
                        result.append(pred_words[0])
                    else:
                        result.append(pred_words)

                else:
                    result.append(G.find(mis_word))

            else:
                multi_words = mis_word.split('/')
                tmp = ''
                for w in multi_words:
                    if w:
                        approx_w = G.find(w)
                        tmp += (approx_w + '/')
                    elif len(w) == 3:  # for w is like /i/
                        tmp = w
                    else:
                        continue
                tmp = tmp[:-1]
                result.append(tmp)
        else:
            result.append(mis_word)

        count += 1
        print("Processing: {} / {}".format(count, len(misspell)), end='\r')

    return result
Example #28
def match_double_metaphone(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    class1 = []
    class2 = []
    class3 = []
    hasClass1 = False
    hasClass2 = False
    bestMatch = ""

    dmeta = fuzzy.DMetaphone()
    dm_token = dmeta(token)
    dm_token_pk = dm_token[0]
    dm_token_sk = dm_token[1]

    for match in dictSet:
        dm_match = dmeta(match)
        dm_match_pk = dm_match[0]
        dm_match_sk = dm_match[1]

        if (dm_token_pk is not None) and (dm_token_pk == dm_match_pk):
            hasClass1 = True
            class1.append(match)
            continue
        if (not hasClass1) and (
            (dm_token_pk is not None and dm_token_pk == dm_match_sk) or
            (dm_token_sk is not None and dm_token_sk == dm_match_pk)):
            hasClass2 = True
            class2.append(match)
            continue

        if (not hasClass2) and (dm_token_sk is not None
                                and dm_token_sk == dm_match_sk):
            class3.append(match)

    if hasClass1:
        candidates = class1
    elif hasClass2:
        candidates = class2
    else:
        candidates = class3

    if len(candidates) > 1:
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]

    return bestMatch, candidates, candidatesG
Example #29
    def try_all(self, answer):
        compound = list()
        for f in self.facit:
            f = nltk.word_tokenize(f)
            G = ngram.NGram(f)
            fas = list()
            for a in answer:
                fas.extend(G.search(a))
            s = 0
            for word, weight in fas:
                s = s + weight
            compound.append(s)
        return compound
Example #30
File: 005.py  Project: nagamine-404/nlp100
def myfunction(seq):
    if isinstance(seq, str):
        print("===character bi-gram===")
        index = ngram.NGram(N=2)
        for term in index.ngrams(index.pad(seq)):
            print(term)
    elif isinstance(seq, list):
        print("===word bi-gram===")
        for count, word in enumerate(seq):
            sec_word = ""
            if count <= len(seq) - 2:
                sec_word = "-" + seq[count + 1]
            print(word + sec_word)