def calculateSimhashes():
    """Calculate simhash values for number of textfiles."""
    word_counts = dict()
    hashes = dict()
    files = os.listdir('./text')
    counter = 0
    for file in files:
        counter += 1
        if counter % 1000 == 0:
            print counter
        with open('./text/' + file, 'r') as f:
            lines = f.readlines()
        words = []
        for line in lines:
            for word in re.split('\W+', line):
                if word != '':
                    newWord = word.lower()
                    words.append(newWord)
        hashes[file] = simhash(words)
        word_counts[file] = len(words)
    with open("word_counts.txt", 'w+') as f:
        for (file, count) in word_counts.items():
            f.write(file + '\t' + str(count) + '\n')
    with open("hashes.txt", 'w+') as f:
        for (file, hash) in hashes.items():
            f.write(file + '\t' + str(hash) + '\n')
def predeal(pos, next):
    for i in range(pos, next):
        print(i)
        # mon=0
        no = ws.cell(row=i, column=1).value
        cont = ws.cell(row=i, column=2).value
        money = ws.cell(row=i, column=5).value
        type1 = ws.cell(row=i, column=3).value
        type2 = ws.cell(row=i, column=4).value
        # print(money)
        if re.search('万', money):  # amount given in units of ten thousand
            mon = delete(money)
            mon = float(mon) * 10000
        elif re.search('千', money):  # amount given in units of one thousand
            mon = delete(money)
            mon = float(mon) * 1000
        else:
            mon = delete(money)
            mon = float(mon)
        # try:
        #     mon = float(mon)
        # except ValueError:
        #     print(mon)
        money = mon
        hashcode = simhash.simhash(cont, content_list, type=True)
        if hashcode.__str__() == '00':
            continue
        keywords = ''
        for tu in hashcode.tf_idf[:21]:
            keywords += str(tu[0]).replace('\'', '') + ' '
        ws.cell(row=i, column=7, value=keywords)
        print(keywords)
        file2.write(no + ' ' + type1 + ' ' + type2 + ' ' + str(money) + ' ' + keywords + '\n')
def similarity(name, content):
    '''
    :param name: name of the defendant
    :param content: trial proceedings
    :return:
    '''
    file = []
    predeal = source_file_stan()
    file.append(predeal)
    data = predeal.readlines()
    content_list = get_idf_content()
    result, money = split.split_text(name, content)
    ca = simhash.simhash(result, content_list)
    print(ca.tf_idf)
    simi = {}
    for has in data:
        has = has.strip('\n')
        da = has.split(' ')
        if len(da[4:]) < 10:
            continue
        simila = ca.matrix_dis(da[4:len(da) - 1])
        simi[da[0]] = simila
    simi = sorted(simi.items(), key=lambda x: x[1], reverse=True)
    num = 1
    print('Judgments similar to this case:')
    for tu in simi:
        if num > 10:
            break
        print(tu)
        num += 1
    source_close(file)
def test():
    file = []
    cases, predeal = source_file()
    file.append(cases)
    file.append(predeal)
    ws = cases.worksheets[1]
    data = predeal.readlines()
    content_list = get_idf_content()
    candidate = [22, 23, 30, 36, 37, 44, 45, 46, 48, 714, 2, 185, 315, 71, 80]
    name = ws.cell(row=15, column=10).value  # get the defendant's name
    value = ws.cell(row=15, column=22).value  # get the trial proceedings
    result = split.split_text(name, value)
    ca = simhash.simhash(result, content_list, type=True)
    sim = {}
    for has in data:
        has = has.strip('\n')
        da = has.split(' ')
        if int(da[0]) not in candidate:
            continue
        simila = ca.matrix_dis(da[4:len(da) - 1])
        sim[da[0]] = simila
    sim = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    print('Cases similar to judgment No. %d:' % 15)
    for tu in sim:
        print(tu)
    source_close(file)
def on_request(ch, method, props, body):
    params = simplejson.loads(body)
    filedesc, conf = params
    LOG.info('({}) Processing {}'.format(PID, filedesc[0]))
    sh = simhash.simhash(filedesc[1], k=conf['k'], lenhash=conf['lenhash'],
                         stopwords=conf['stopwords'])
    response = {'name': filedesc[0], 'sh': sh}
    send_back(ch, method, props, body, response)
def precess(money_dis=False, number=500):
    file = []
    cases, predeal = source_file()
    file.append(cases)
    file.append(predeal)
    ws = cases.worksheets[1]
    data = predeal.readlines()
    count = 1
    content_list = get_idf_content()
    for j in range(40, 100):
        if count > 4:
            break
        name = ws.cell(row=j, column=10).value  # get the defendant's name
        value = ws.cell(row=j, column=22).value  # get the trial proceedings
        if len(name.split('、')) > 1:  # only handle texts with a single defendant
            continue
        result = split.split_text(name, value)
        ca = simhash.simhash(result, content_list, type=True)
        if len(ca.__str__()) < 64:
            continue
        simi = {}
        if money_dis:
            money = project1.distinguish(value)
            money = money_convert(money)
            sim_case = compare_money(float(money), data, number)
            for has in sim_case:
                # start = time.time()
                simi[has[0]] = ca.matrix_dis(has[2])
                # print("total compare time ", time.time() - start)
        else:
            for has in data:
                has = has.strip('\n')
                da = has.split(' ')
                if len(da[-1]) < 64:
                    continue
                # if len(da[4:]) < 10:
                #     continue
                # start = time.time()
                # simila = ca.matrix_dis(da[4:len(da) - 1])
                # print(time.time() - start)
                simi[da[0]] = ca.count_cos(da[-1])
        simi = sorted(simi.items(), key=lambda x: x[1], reverse=True)
        num = 1
        print('Cases similar to judgment No. %d:' % j)
        for tu in simi:
            if num > 10:
                break
            if int(tu[0]) == j:
                continue
            print(tu)
            num += 1
        count += 1
    source_close(file)
def train(dic, Corpus):
    corpus = Corpus  # remember to increase this as needed.
    sentences = corpus
    hashedSentences = simhash.simhash(sentences, 32)
    B = hashedSentences
    # you can try simhash directly, maybe it performs better.
    # hashedSentences = simhash.simhash(sentences, 128)
    # dataMat = np.array(hashedSentences)
    # lambda_, eigenVec_ = laplacian_eigenmap.laplacian_eigenmap(dataMat, 15, 32)
    # B = eigenVec_
    input = embedding(dic, corpus)
    input = np.array(input)
    input = input.reshape(-1, 200 * 300)  # [batch_size, 60000]
    output = np.array(B)  # [batch_size, 32]
    data = np.concatenate((input, output), axis=1)
    net = Net()
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    print('Start Training CNN')
    for epoch in range(8000):  # loop over the dataset multiple times
        generator = batch_generator(data, 30)
        for inputs, labels in generator:
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs, h = net(torch.Tensor(inputs.reshape(-1, 1, 200, 300)))
            loss = criterion(outputs, torch.Tensor(labels.reshape(-1, 32)))
            loss.backward()
            optimizer.step()
        # print statistics
        if epoch % 100 == 0:
            print('loss:{}'.format(loss))
    print('Finished Training')
    torch.save(net, "cnn_model.pkl")
def train(dic, Corpus):
    corpus = Corpus  # remember to increase this as needed.
    sentences = corpus
    hashedSentences = simhash.simhash(sentences, 32)
    B = hashedSentences
    # you can try simhash directly, maybe it performs better.
    # hashedSentences = simhash.simhash(sentences, 128)
    # dataMat = np.array(hashedSentences)
    # lambda_, eigenVec_ = laplacian_eigenmap.laplacian_eigenmap(dataMat, 15, 32)
    # B = eigenVec_
    input = embedding(dic, corpus)
    input = np.array(input)
    input = input.reshape(-1, 20 * 300)  # [batch_size, 6000]
    output = np.array(B)  # [batch_size, 32]
    data = np.concatenate((input, output), axis=1)
    net = Net()
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    print('Start Training CNN')
    inputs = 0
    for epoch in range(800):  # loop over the dataset multiple times
        generator = batch_generator(data, 30)
        cnt = 0
        loss_ = 0
        for inputs, labels in generator:
            cnt += 1
            # zero the parameter gradients
            optimizer.zero_grad()
            inputs = torch.from_numpy(inputs).reshape(-1, 1, 20, 300).float()
            labels = torch.from_numpy(labels).reshape(-1, 32).float()
            # forward + backward + optimize
            outputs, h = net(inputs)
            loss = criterion(outputs, labels)
            loss_ += loss
            loss.backward()
            optimizer.step()
        # print statistics
        if epoch % 100 == 0:
            print('loss:{}'.format(loss))
            for tag, value in net.named_parameters():
                tag = tag.replace('.', '/')
                writer.add_histogram(tag, value.data.cpu().numpy(), epoch + 1)
                # logger.histo_summary(tag, value.data.cpu().numpy(), epoch + 1)
                # logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), epoch + 1)
        writer.add_scalar("STCC/loss", loss_ / cnt, epoch)
        cnt = 0
        loss_ = 0
    print('Finished Training')
    torch.save(net, "cnn_model.pkl")
    writer.add_graph(net, input_to_model=(inputs,))
    writer.close()
        else:
            partition_texts = set()
            partition_texts.add(current_id)
            partitions[val] = partition_texts.copy()
    return candidates


if __name__ == '__main__':
    # get texts
    number_of_texts = int(input())
    hashes = []
    for i in range(int(number_of_texts)):
        text = input().strip()
        text_hash = simhash.simhash(text)
        hashes.append(text_hash)
    candidates = lsh(hashes, number_of_texts)
    # get queries
    number_of_queries = int(input())
    for i in range(int(number_of_queries)):
        query = input().strip().split(" ")
        # process the query
        text_index = int(query[0])
        max_distance = int(query[1])
        target_text_hash = hashes[text_index]
        results = []
        if text_index in candidates:
def simhash(x):
    import simhash
    return simhash.simhash(x)
def process_IN_CREATE(self, event):
    print "Create file: %s" % os.path.join(event.path, event.name)
    print "simhash of newfile is %s" % simhash.simhash(open(os.path.join(event.path, event.name)))
#!/usr/bin/env python
# encoding: utf-8
import time
import getpath
import simhash
import os
import cPickle as pickle

time.clock()
rootdir = '/home/ted'
dirlist = getpath.fileso().getpath(rootdir, filter=['txt', 'doc'])
print dirlist
res = []
for i in dirlist:
    with open(i, 'r') as f:
        content = f.read()
        res.append([i, str(simhash.simhash(content))])
res.sort(key=lambda a: a[1])
with open(os.path.join(os.getcwd(), 'result'), 'w') as f:
    f.write(pickle.dumps(res))
print time.clock()
import os
import subprocess
import simhash as si

result = os.environ['PATH']
new_result = result + '//liu//text//'
os.putenv('PATH', new_result)
print(os.environ['PATH'])


def find():
    pass


if __name__ == '__main__':
    s = 'To be or not to be, this is a question'
    hash1 = si.simhash(s.split())
    s = 'whether to be is a question'
    hash2 = si.simhash(s.split())
    s = 'i have a question to say, not to be or to be'
    hash3 = si.simhash(s.split())
    print(hash1.hamming_distance(hash2), " ", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), " ", hash1.similarity(hash3))
def getFeature(dic, Corpus):
    corpus = Corpus[:5000]  # remember to increase this as needed.
    sentences = [j[0] for j in corpus]
    hashedSentences = simhash.simhash(sentences, 32)
    B = hashedSentences
    # you can try simhash directly, maybe it performs better.
    # hashedSentences = simhash.simhash(sentences, 128)
    # dataMat = np.array(hashedSentences)
    # lambda_, eigenVec_ = laplacian_eigenmap.laplacian_eigenmap(dataMat, 15, 32)
    input = getMat(dic, corpus)
    input = np.array(input)
    input = input.reshape(-1, 20 * 300)
    output = np.array(B)
    data = np.concatenate((input, output), axis=1)
    net = cnn_model.Net()
    criterion = cnn_model.nn.MSELoss()
    optimizer = cnn_model.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    print('Start Training CNN')
    i = 0
    running_loss = 0.0
    for epoch in range(50):  # loop over the dataset multiple times
        generator = cnn_model.batch_generator(data, 100)
        for inputs, labels in generator:
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs, h = net(torch.Tensor(inputs.reshape(-1, 1, 20, 300)))
            loss = criterion(outputs, torch.Tensor(labels.reshape(-1, 32)))
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            i += 1
            if i % 20 == 19:  # print every 20 mini-batches
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 20))
                running_loss = 0.0
    print('Finished Training')
    represented = []
    length = len(Corpus)
    for i in range(0, length, 500):
        input = getMat(dic, Corpus[i:min(length, i + 500)])
        input = np.array(input)
        input = input.reshape(-1, 1, 20, 300)
        outputs, h = net(torch.Tensor(input))
        represented += h.data.numpy().tolist()
        print('(%d / %d) sentences have been embedded.' % (i, length))
    print('all sentences finished embedding')
    return represented
filter_list = []
for line in open("filter.txt"):
    filter_list.append(line.split("\t")[0])

token = dict()
frq = dict()
(token, frq) = find_all_instances("data-concept-instance-relations.txt", filter_list)

hash_data = []
simhash_output = open('simhash.txt', 'w')
answer = open('answer.txt', 'w')
inac = open("interaction.txt", 'w')
for name in filter_list:
    sim = simhash(name, token[name], frq[name])
    hash_data.append(sim)
    simhash_output.write(sim.name)
    simhash_output.write("\t")
    simhash_output.write(str(sim.hash))
    simhash_output.write('\n')

print len(hash_data)
for i in range(0, len(hash_data) - 1):
    if i % 200 == 0:
        print i
    for j in range(i + 1, len(hash_data)):
        if hash_data[i].hamming_distance(hash_data[j]) >= LINE:
            sent = hash_data[i].name + '\t' + hash_data[j].name + '\n'
            answer.write(sent)
            find_interaction(inac, hash_data[i].name, hash_data[j].name,
# a filter of what changes to notify.
#
# NB Tim Juchcinski reports that he needed to up
# the buffer size to be sure of picking up all
# events when a large number of files were
# deleted at once.
#
results = win32file.ReadDirectoryChangesW(
    hDir,
    1024,
    True,
    win32con.FILE_NOTIFY_CHANGE_FILE_NAME |
    win32con.FILE_NOTIFY_CHANGE_DIR_NAME |
    win32con.FILE_NOTIFY_CHANGE_ATTRIBUTES |
    win32con.FILE_NOTIFY_CHANGE_SIZE |
    win32con.FILE_NOTIFY_CHANGE_LAST_WRITE |
    win32con.FILE_NOTIFY_CHANGE_SECURITY,
    None,
    None
)
print results
if results[0][0] == 1:
    pass
for action, file in results:
    full_filename = unicode(os.path.join(path_to_watch, file))
    print ACTIONS.get(action, "Unknown")
    if action == 1:
        with open(full_filename, 'r') as f:
            content = f.read()
        print u'simhash value of the newly created file is ' + str(simhash.simhash(content))
html1 = getpage('http://www.2345.com/')
html2 = getpage('http://www.hao123.com')
st = time()
t1 = gettags(html1)
t2 = gettags(html2)
print time() - st
print t1
print t2
t1 = ['head', 'base', 'meta', 'title', 'meta', 'meta', 'meta', 'meta', 'meta',
      'style', 'script', 'body', 'div', 'div', 'div', 'div', 'div', 'div',
      'div', 'div', 'div', 'div', 'div', 'script', 'script', 'div']
t2 = ['meta', 'title', 'meta', 'meta', 'meta', 'meta', 'meta', 'style',
      'script', 'script', 'body', 'div', 'div', 'div', 'div', 'div', 'div',
      'div', 'div', 'div', 'div', 'div', 'script', 'script', 'div']
hash1 = simhash.simhash(t1)
hash2 = simhash.simhash(t2)
print hash1, hash2
print issim(hash1, hash2)
domain = "11.aaa.com"
t = time()
mdomain = "aaa.com"
ip = "1.2.3.4"
hash = hash1
update(mdomain, hash)
# assuming that you have a dictionary with document id as the key and the document as the value:
# documents = { doc_id: doc } you can do:

from simhash import simhash

documents = {1: open('first.txt', 'r').read(),
             2: open('second.txt', 'r').read(),
             3: open('Tests/third.txt', 'r').read(),
             4: open('Tests/fourth.txt', 'r').read()}


def split_hash(str, num):
    return [str[start:start + num] for start in range(0, len(str), num)]


hashes = {}
for doc_id, doc in documents.items():
    print(doc_id)
    print(doc)
    hash = simhash(doc)
    # you can either use the whole hash for higher precision or split into chunks for higher recall
    hash_chunks = split_hash(hash, 4)
    for chunk in hash_chunks:
        if chunk not in hashes:
            hashes[chunk] = []
        hashes[chunk].append(doc_id)

# now you can print the duplicate documents:
for hash, doc_list in hashes.items():
    if len(doc_list) > 1:
        print("Duplicate documents: ", doc_list)
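# A possible follow-up (a minimal sketch, not part of the snippet above): reuse the chunk
# buckets in `hashes` to fetch near-duplicate candidates for a new document. The function
# name `candidates_for` and the variable `new_doc` are hypothetical, introduced only for
# illustration; they assume the same simhash() and split_hash() helpers used above.
def candidates_for(new_doc):
    candidate_ids = set()
    new_hash = simhash(new_doc)
    # any already-indexed document that shares at least one hash chunk is a candidate
    for chunk in split_hash(new_hash, 4):
        candidate_ids.update(hashes.get(chunk, []))
    return candidate_ids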
    for r in result:
        varst += str(r.name)
    return varst


skipexts = ['.gif', '.exe', '.pyc', '.o', '.a', '.dll', '.lib', '.pdb', '.mdb']  # ignore binary files
scanexts = ['.inc', '.php']
strlen = 0
lexer = phplex.lexer.clone()
lexer.filename = None
p = []
s1 = file('d:\\office\\2008.php').read()
log = Log()
import simhash
hash1 = simhash.simhash(getvarlist(s1))

if __name__ == '__main__':
    if os.name == 'nt':
        os.system('color 0a')
        os.system('mode con cols=155 lines=300')
    args = sys.argv
    if len(args) == 2:
        scan(args[1])
    elif len(args) == 3:
        scan(args[2])
    elif len(args) == 4:
        scan(args[2])
    else:
        pass
import random
import sys

from simhash import simhash


class Permute(object):
    def __init__(self, size, count):
        self.size = size
        self.count = count
        self.permutation = []
        ks = set()
        # keep drawing random permutations until `count` distinct ones are collected
        while len(self.permutation) < count:
            a = [i for i in range(size)]
            random.shuffle(a)
            k = '.'.join([str(x) for x in a])
            if k not in ks:
                self.permutation.append(a)
                ks.add(k)

    def permute(self, s):
        for i in range(self.count):
            yield ''.join([s[self.permutation[i][j]] for j in range(self.size)])


N = 128
sh = simhash(sys.argv[1], N)
print sh
p = Permute(size=7, count=4)
for x in p.permute('1234567'):
    print x
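# A possible use of Permute (a minimal sketch under stated assumptions, not from the
# original snippet): permute every fingerprint, sort by each permuted form, and compare
# only adjacent entries, since near-duplicates tend to become neighbours under at least
# one permutation. `near_duplicate_pairs`, `fingerprints`, and `hamming` are hypothetical
# names introduced here for illustration; fingerprints are assumed to be strings of
# length `permuter.size`.
def near_duplicate_pairs(fingerprints, permuter, max_distance=3):
    def hamming(a, b):
        # Hamming distance between two equal-length strings.
        return sum(1 for x, y in zip(a, b) if x != y)

    # Pre-compute every permuted variant of every fingerprint.
    permuted = [list(permuter.permute(fp)) for fp in fingerprints]
    pairs = set()
    for i in range(permuter.count):
        # Sort document indices by the i-th permuted string and scan neighbours.
        order = sorted(range(len(fingerprints)), key=lambda idx: permuted[idx][i])
        for a, b in zip(order, order[1:]):
            if hamming(fingerprints[a], fingerprints[b]) <= max_distance:
                pairs.add((min(a, b), max(a, b)))
    return pairs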