def get_chinese_similarity(s1, s2):
    """ Get the similarity of two Chinese words """
    hash1 = simhash([smart_unicode(x) for x in seg_txt(smart_str(s1))])
    hash2 = simhash([smart_unicode(x) for x in seg_txt(smart_str(s2))])
    return hash1.similarity(hash2)
def compute_similarities(text, models, count=None):
    """Finds items that are similar to the specified text

    :param text: The text to be used for comparison
    :param models: The list of models to be compared against text
                   Each of the entries should have a simhash property
    :param count: The no. of similar items to return
    """
    # Get the simhash of the submitted message
    _hash = simhash(util.unicodeToAscii(text))
    candidates, scores = {}, []
    # TODO: Investigate ways of speeding this up - complexity is O(n)
    for model in models:
        target = simhash(hash=long(model.simhash))
        if long(target) == long(_hash):
            continue
        similarity = _hash.similarity(target)
        if similarity >= similarity_threshold:
            scores.append((model.id, similarity))
            candidates[model.id] = model
    if len(scores) == 0:
        return []
    scores.sort(key=lambda x: x[1], reverse=True)
    result_size = max_similar_messages if count is None else count
    retval = []
    # Cap at the number of available scores to avoid an index error
    for x in range(min(result_size, len(scores))):
        message_dict = candidates[scores[x][0]].as_dict()
        del message_dict['simhash']
        message_dict['score'] = scores[x][1]
        retval.append(message_dict)
    return retval
def is_similar_page(self, page1, page2):
    hash1 = simhash(page1)
    hash2 = simhash(page2)
    similar = hash1.similarity(hash2)
    if similar > 0.85:  # the current threshold is defined as 0.85
        return True
    else:
        return False
def get_simhash(shingles1, shingles2, simhash_bytes=simhash_bytes, hashfunc=hashfunc):
    sim1 = simhash(shingles1, hashbits=simhash_bytes)
    sim2 = simhash(shingles2, hashbits=simhash_bytes)
    return sim1.similarity(sim2)
def simhash_hamming_corrcoef(A, B):
    str_A = []
    str_B = []
    for i in xrange(len(A)):
        str_A.append(str(A[i]))
        str_B.append(str(B[i]))
    hash_A = simhash(','.join(str_A))
    hash_B = simhash(','.join(str_B))
    return hash_A.similarity(hash_B)
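# Usage sketch (illustrative addition, not part of the original source): A and B are
# assumed to be equal-length numeric sequences; each is serialised into a comma-separated
# string and the two strings are compared by simhash similarity. Python 2 syntax is used
# to match the snippet's xrange.
example_score = simhash_hamming_corrcoef([1, 2, 3, 4], [1, 2, 3, 5])
print example_score  # other snippets in this collection treat values near 1.0 as near-duplicates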
def _get_diff_ratio(self, a_str, b_str):
    '''Return the simhash similarity ratio of two whitespace-tokenised strings.'''
    if a_str is None or b_str is None:
        return 0
    a_hash = simhash(a_str.split())
    b_hash = simhash(b_str.split())
    ratio = a_hash.similarity(b_hash)
    return ratio
def get_simhash(sentence1, sentence2):
    hash1 = simhash(get_cut_sentence(sentence1))
    hash2 = simhash(get_cut_sentence(sentence2))
    # print(hash1)
    # print(hash2)
    similarity = hash1.similarity(hash2)
    print(similarity)
    if similarity > 0.8:
        return similarity
    else:
        return False
def decide(forktionary, message):
    message_hash = simhash(message.split())
    similarity = 0
    val = None
    for key, value in forktionary.iteritems():
        key_hash = memcache.get(key)
        if not key_hash:
            key_hash = simhash(key.split())
            memcache.set(key, key_hash)
        sim = message_hash.similarity(key_hash)
        if sim > similarity:
            similarity = sim
            val = value
    return val
def run(self) -> dict:
    sim_hash_dict = dict()
    # tqdm is a Python library for displaying progress bars
    # print(self.message_list)
    # print(type(self.df))
    for idx, value in self.df.iterrows():
        # hashbits: the number of hash bits to compare
        # print(value)
        sim = simhash(value['Content'], hashbits=self.hashbits)
        sim_dict = dict(message=value['Content'], simhash=sim, LineId=value['LineId'])
        if sim.hash in sim_hash_dict.keys():
            sim_list = sim_hash_dict[sim.hash]
            # append while below the keep_same_count cap (0 means keep everything)
            if self.keep_same_count == 0 or len(sim_list) < self.keep_same_count:
                sim_list.append(sim_dict)
            else:
                print("The group has reached its maximum capacity; skipping this record")
        else:
            sim_list = list()
            sim_list.append(sim_dict)
            sim_hash_dict[sim.hash] = sim_list
    total_group = len(sim_hash_dict.keys())
    print('After Simhash Reduce, total:%s bin(s)' % len(sim_hash_dict.keys()))
    print("Data compression ratio: %s" % (1 - total_group / len(self.df)))
    return sim_hash_dict
def fit(self, X, y=None):
    texts = self._preprocess(X[self.field])
    print 'DEBUG: preprocess done'
    self.hashcodes = map(lambda s: simhash(s), texts)
    self.y = np.asarray(y)
    print 'DEBUG: fit done...'
    return self
def __call__(self, dict_):
    text = dict_['text']
    self_simhash = simhash(text)
    fuzzy_count = 0
    sum_other_votes = 0
    for other_simhash in self.simhashes:
        if self_simhash.similarity(other_simhash) > self.threshold:
            # increment the votes of the others
            other_votes = self.votes[other_simhash.hash] = self.votes.get(other_simhash.hash, 1) + 1
            fuzzy_count += 1
            sum_other_votes += other_votes
    # should self.votes be elevated based on fuzzy_count?
    self.votes[self_simhash.hash] = self.votes.get(self_simhash.hash, 0) + 1
    # maybe normalize based on the number of total votes?
    dict_['fuzzy_count'] = fuzzy_count
    dict_['fuzzy_votes'] = sum_other_votes
    # store simhash in global state now that we've finished processing
    self.simhashes.append(self_simhash)
    return dict_
def get_feature_from_content(content):
    """
    Generate features from HTTP content

    Returns:
        a dict containing features, in the format <name_of_feature>: value_of_feature
    """
    feature_dict = {}
    """
    tl=re.findall("<title.*?\/title>",c)
    kw=re.findall('(?<=<meta name="keywords" content=").*(?=\/>)',c)
    tmpl=re.findall('(?<=<meta name="generator" content=").*(?=\/>)',c)
    uid=re.findall("UA-\d{5,10}-\d{1,4}",c)
    dm=re.findall("(?<=_gaq.push\(\['_setDomainName'),.*(?=\]\);)",c)
    tl="" if tl==[] else tl[0]
    kw="" if kw==[] else kw[0]
    tmpl="" if tmpl==[] else tmpl[0]
    uid="" if uid==[] else uid[0]
    dm="" if dm==[] else dm[0]
    """
    feature_dict["ctitle"] = re.findall("<title.*?\/title>", content)
    feature_dict["ckws"] = re.findall('(?<=<meta name="keywords" content=").*(?=\/>)', content)
    feature_dict["ctmpl"] = re.findall('(?<=<meta name="generator" content=").*(?=\/>)', content)
    feature_dict["gid"] = re.findall("UA-\d{5,10}-\d{1,4}", content)
    feature_dict["dm"] = re.findall("(?<=_gaq.push\(\['_setDomainName'),.*(?=\]\);)", content)
    for k in feature_dict:
        feature_dict[k] = "" if not feature_dict[k] else feature_dict[k][0]
    feature_dict["chash"] = str(simhash(content))
    content = "".join(content.split())
    feature_dict["clen"] = len(content)
    return feature_dict
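# Usage sketch (illustrative addition, not from the original source): the function only
# needs the `re` module and `simhash`, so it can be exercised on a small HTML string.
# The sample markup below is a hypothetical placeholder; the title, Google Analytics id,
# content simhash and whitespace-stripped length come back in the feature dict.
sample_html = '<html><head><title>Example</title></head><body>UA-12345-1</body></html>'
features = get_feature_from_content(sample_html)
print(features['ctitle'], features['gid'], features['chash'], features['clen'])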
def percentages(self, vmx, threshold=10):
    elems_hash = set()
    signature_module = sign.Signature(vmx)
    for _cls in vmx.get_classes():
        if _cls.is_external():
            continue
        _class = _cls.get_vm_class()
        for method in _class.get_methods():
            code = method.get_code()
            if code is None:
                continue
            # FIXME: shouldn't the same rules apply here as on import?
            # e.g. skip constructors and too-short methods?
            for i in signature_module.get_method_signature(method, predef_sign=sign.PredefinedSignature.SEQUENCE_BB).get_list():
                elems_hash.add(int(simhash(i)))
    ret, info = self.db.elems_are_presents(elems_hash)
    sorted_ret = self._eval_res(ret, info, threshold)
    info = defaultdict(list)
    for k, values in sorted_ret.items():
        for j in sorted(values, key=itemgetter(1), reverse=True):
            info[k].append([j[0], j[1]])
    return info
def add_message(deployment_id):
    """Adds a new message for the deployment in :deployment_id

    The input parameters are:
        message: string

    :param deployment_id: the id of the deployment
    """
    if not request.json:
        abort(400)
    _post = request.json
    # Both fields are required below, so reject the request if either is missing
    if 'origin_message_id' not in _post or 'content' not in _post:
        abort(400)
    # Does the deployment exist
    deployment = Deployment.by_id(deployment_id)
    if deployment is None:
        abort(404)
    _hash = simhash(util.unicodeToAscii(_post['content']))
    message = Message(deployment_id=deployment_id,
                      origin_message_id=_post['origin_message_id'],
                      content=_post['content'],
                      simhash=str(_hash))
    message.create()
    return jsonify(message.as_dict())
def hash_column_rows(column):
    '''
    Input:
        column <Series> one column from a pandas data frame
    Output:
        [] hash <str> list of hash strings as the column representation
    '''
    # column.apply(hash)
    # collect/cluster similar hashes into baskets
    # baskets = []
    # for row in column:
    #     # hash only the first row
    #     row_value = str(row).strip()
    #     row_hash = hash_func.compute_hash(row_value)
    #     basket_item = row_value
    #     if basket_item not in baskets:
    #         baskets.append(basket_item)
    # all fields are identical -> single basket for the column
    # if len(baskets) == 1:
    #     return baskets[0]
    # else:
    baskets = simhash(column)
    return baskets
def transform(self, X):
    texts = self._preprocess(X[self.field])
    hashcodes = map(lambda s: simhash(s), texts)
    print 'DEBUG: finish transform hashing'
    #similarity_indices = map(lambda h: self._find_neighbor(h), hashcodes)
    similarity_indices = Parallel(n_jobs=-1)(delayed(Simhash_find_neighbor)(self.hashcodes, h) for h in hashcodes)
    return self.y[similarity_indices].reshape(-1, 1)
def add(self, dx, name, sname, regexp_pattern=None, regexp_exclude_pattern=None):
    """
    Add all classes which match certain rules to the database.

    Only methods with a length >= 50 are added.
    No constructor (static and normal) methods are added.
    Additional excludes or whitelists can be defined by classnames as regexes.

    :param androguard.core.analysis.analysis.Analysis dx:
    :param str name: name, the first key in the tree
    :param str sname: subname, the second key in the tree
    :param str regexp_pattern: whitelist regex pattern
    :param str regexp_exclude_pattern: blacklist regex pattern
    """
    sign_module = sign.Signature(dx)
    for _cls in dx.get_classes():
        if _cls.is_external():
            continue
        _class = _cls.get_vm_class()
        # whitelist
        if regexp_pattern and not re.match(regexp_pattern, _class.get_name()):
            continue
        # blacklist
        if regexp_exclude_pattern and re.match(regexp_exclude_pattern, _class.get_name()):
            continue
        print("\tadding", _class.get_name())
        for method in _class.get_methods():
            code = method.get_code()
            if not code or method.get_length() < 50 or method.get_name() in ("<clinit>", "<init>"):
                continue
            buff_list = sign_module.get_method_signature(method, predef_sign=sign.PredefinedSignature.SEQUENCE_BB).get_list()
            if len(set(buff_list)) == 1:
                continue
            for e in buff_list:
                self.db.add_element(name, sname, str(_class.get_name()), method.get_length(), int(simhash(e)))
def process_source_html(html_src_file, visit_info):
    if isfile(html_src_file):
        visit_info.html_src_file = html_src_file
        html_src = open(html_src_file).read()
        visit_info.html_src_size = len(html_src)
        visit_info.fx_conn_error = is_moz_error_txt(html_src)
        visit_info.html_src_hash = mmh3.hash(html_src)
        visit_info.html_src_simhash = simhash(html_src)
        if not visit_info.fx_conn_error:
            visit_info.page_title = get_page_title_from_html_src(html_src)
            populate_site_generator(html_src, visit_info)
def doit():
    n = 0
    for i in coll.find():
        h = simhash(i['firstName'] + i['lastName'], hashbits=bits)
        ha = bitstring.pack('uint:%s' % bits, long(h))
        for j in xrange(0, bits):
            if limit != None and n > limit:
                return
            #i['h%s' % j] = ha.hex
            #print ha.hex
            print '%s:%s' % (ha.uint, i['n'])
            ha.ror(bits / bits)
            n += 1
def computeSimilarities(self, msg):
    """
    Returns a list of message id's with similarity scores, sorted by similarity score.
    I recommend using >=0.875 to define 'near-dup'.

    :return [('1', 0.9), ('2', 0.8), ...], sorted by the real value
            (second element of each item) in decreasing order.
    """
    simhashCode = simhash(unicodeToAscii(msg['description']))
    retList = []
    for i in range(len(self._simhashList)):
        id = self._simhashList[i][0]
        val = self._simhashList[i][1].similarity(simhashCode)
        retList.append((id, val))
    retList.sort(key=lambda x: x[1], reverse=True)
    return retList
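# Usage sketch (illustrative addition, not from the original source): applying the
# ">= 0.875 means near-duplicate" rule of thumb from the docstring to the returned list.
# `detector` and `incoming_msg` are hypothetical stand-ins for an instance of the
# surrounding class and a message dict with a 'description' key.
scored = detector.computeSimilarities(incoming_msg)
near_dups = [(msg_id, score) for msg_id, score in scored if score >= 0.875]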
def is_similar(title):
    db = redis.StrictRedis(host=get_dbhost(), port=get_dbport(), db=0)
    result = jieba.analyse.extract_tags(title, topK=10)
    hash1 = simhash(result)
    title_md5 = cal_md5(title)
    title_set = set()
    for word in result:
        title_set = title_set | db.smembers(word)
    if not title_set:
        insert_title(result, title_md5, hash1)
        return False
    for every_md5_title in title_set:
        hash_list = db.smembers(every_md5_title)
        hash2 = hash1
        for i in hash_list:
            hash2 = int(i)
            break
        if hamming_distance(hash1, hash2) < 3:
            return True
    insert_title(result, title_md5, hash1)
    return False
def train(self, messageList):
    """Takes a list of messages. Each message is a dictionary.
    """
    assert False, "Not implemented yet"

    #--- save to _messages, and compute their hashcodes for simhash
    self._messageMap = dict(((v['id'], v) for v in messageList))
    self._simhashList = [(v['id'], simhash(unicodeToAscii(v['description'])))
                         for v in messageList]
    # self._messages = map(lambda x: x['description'], messageList);
    # self._simhashes = map(lambda x:
    #     simhash(unicodeToAscii(x['description'])), messageList);

    #--- collect category list
    categorySet = set()
    for msg in messageList:
        categorySet.update(msg['categories'])

    #--- update categories
    categories = sorted(list(categorySet))

    #--- train classifiers
    minFreq = 5  # 1
    unigramExtractor = DssgUnigramExtractor()

    def dssgVectorizerGenerator():
        return DssgVectorizerUnigramCount(unigramExtractor, minFreq)

    def dssgBinaryClassifierTrainer(train):
        return DssgBinaryClassifierSVC.train(train, dssgVectorizerGenerator(), balance=False)

    categoryClassifier = DssgCategoryClassifier.train(
        dssgBinaryClassifierTrainer, messageList, dssgVectorizerGenerator)
    self._categoryClassifier = categoryClassifier
    return self
def process_item(self, item, spider):
    #,item['time']
    pinyin_content = ' '.join([i for i in lazy_pinyin(item['content'].strip()) if i.strip()])
    pinyin_content = str(pinyin_content)
    #hashbits = simhash(item['content'], hashbits=32).hex()
    hashbits = simhash(pinyin_content, hashbits=64).hex()
    bitstr = str(bin(int(hashbits, 16)))[3:]
    for i in range(64 - len(bitstr)):
        bitstr = '0' + bitstr
    print bitstr, len(bitstr)
    # manage duplicate url
    if [i for i in JobModel.where(url=item['url']).select()]:
        return item
    # manage duplicate content, using google simhash
    if has_dup(bitstr, item['time'], item['url']):
        return item
    insert_dup(bitstr, item['url'], item['time'])
    jobitem = JobModel()
    jobitem.title = item['title']
    jobitem.url = item['url']
    jobitem.email = item['email']
    jobitem.content = item['content']
    jobitem.time = item['time']
    jobitem.type = item['type']
    # jobitem.tags = item['tags']
    tagobj = Tag()
    itemtag = tagobj.gettag(item['content'], item['title'])
    jobtag = 'nontec'
    for tag in itemtag.split("\t"):
        if u'技术' in tag:  # u'技术' means 'technical'
            jobtag = 'tec'
    jobitem.jobtag = jobtag
    jobitem.tags = str(itemtag)
    jobitem.save()
    # create tag if necessary
    tags = []
    for tag in itemtag.split("\t"):
        tagmodel = TagModel.where(tag=tag).select()
        if not [i for i in tagmodel]:
            tagmodel = TagModel()
            tagmodel.tag = tag
            tagmodel.save()
        tagmodel = TagModel.where(tag=tag).select()
        for i in tagmodel:
            tags.append(int(i.id))
    # save the tags
    for tag in tags:
        jobtagobj = JobCrossTagModel()
        jobtagobj.url = item['url']
        jobtagobj.tagid = tag
        jobtagobj.type = item['type']
        jobtagobj.time = item['time']
        jobtagobj.save()
    return item
def add_report(deployment_id):
    """Adds a new report to the deployment specified by the ``deployment_id`` parameter

    Input parameters:
        description: string - Description of the report
        categories: array of integers - category ids

    :param deployment_id: the id of the deployment
    """
    verify_deployment(deployment_id)
    errors = {}
    _post = request.json

    # Check for fields
    if 'origin_report_id' not in _post:
        errors['origin_report_id'] = 'The report id is missing'
    if 'title' not in _post:
        errors['title'] = 'The report title is missing'
    if 'description' not in _post:
        errors['description'] = 'The report description is missing'
    if 'categories' not in _post or len(_post['categories']) == 0:
        errors['categories'] = 'The report categories must be specified'

    # Did we encounter any errors?
    if len(errors) > 0:
        app.logger.error("There are some errors in the request %r" % errors)
        abort(400)

    # Does the specified report already exist?
    _report = db.session.query(Report).\
        filter(Report.origin_report_id == _post['origin_report_id'],
               Report.deployment_id == deployment_id).first()
    if _report is not None:
        app.logger.error("The report %s has already been registered" % _post['origin_report_id'])
        abort(400)

    # Get the categories
    categories = db.session.query(Category).\
        filter(Category.deployment_id == deployment_id,
               Category.origin_category_id.in_(_post['categories'])).all()

    # Have the specified category ids been registered?
    if len(categories) == 0:
        app.logger.error("The specified categories are invalid")
        abort(400)

    # Compute the simhash on the report description
    _hash = simhash(util.unicodeToAscii(_post['description']))
    report = Report(deployment_id=deployment_id,
                    origin_report_id=_post['origin_report_id'],
                    title=_post['title'],
                    description=_post['description'],
                    simhash=str(_hash))

    # Create the report
    report.create()

    # Save the report categories
    report_categories = []
    for category in categories:
        rc = ReportCategory(report_id=report.id, category_id=category.id)
        report_categories.append(rc)
    ReportCategory.create_all(report_categories)

    return jsonify(report.as_dict())
#!/usr/bin/env python
import pymongo
import bitstring
from hashes.simhash import simhash

connection = pymongo.Connection("localhost", 27017)
db = connection.pace
coll = db.people

for i in coll.find():
    h = simhash(i['firstName'] + i['lastName'], hashbits=64)
    ha = bitstring.pack('uint:64', long(h))
    for j in xrange(0, 8):
        i['h%s' % j] = ha.hex
        #print ha.hex
        ha.ror(64 / 8)
    coll.update({'n': i['n']}, i)
def compute_hash(self, text):
    return simhash(text, hashbits=self.hashbits)  # .hex()
# -*- coding:utf-8 -*-
# @author: Eric Luo
# @file: py-hash0.1.py
# @time: 2017/5/24 0024 16:31
from hashes.simhash import simhash

hash1 = simhash('This is a test string one.')
hash2 = simhash('This is a test string TWO.')
print(hash1, hash2)
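# Follow-up sketch (illustrative addition, not from the original source): the two hashes
# above can also be compared directly; other snippets in this collection treat similarity
# scores near 1.0 (e.g. > 0.8 or >= 0.875) as near-duplicates.
print(hash1.similarity(hash2))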
def get_similarity(s1, s2):
    """ Get the similarity of two English words """
    return simhash(s1).similarity(simhash(s2))
exact, subs, bags = count_matches(k1[1:], k2[1:])
lenOk = len(w1) > 2 and len(w2) > 2
cond1 = exact > 1 and lenOk and (w1 == w2 or w1.find(w2) != -1 or w2.find(w1) != -1)
cond2 = w1 == w2 and lenOk and len(k2) == 1 and len(k1) == 2
cond3 = exact >= 2 and len(k2) == 3 and len(k1) == 3
cond4 = w1 == w2 and lenOk and ((bags == 1 and len(k1) + len(k2) <= 5) or (bags == 2 and len(k1) + len(k2) <= 7))
if cond1 or cond2 or cond3 or cond4:
    ds.union(p2, q2)
    g.write("%s\n%s\n\n" % (p1, q1))

g.close()
remaining = strings_remaining(strings, ds)
print 'merged neighbours, remaining: %d' % len(remaining)
hashes, dglinks = [], []
for k, v in remaining:
    hashes.append(simhash(k, hashbits=HB))
    dglinks.append(set(digit_in[k]).union(set(digit_out[k])))
print 'hashing done'
merging = find_merges_by_hash(remaining, hashes, dglinks)
print "indices found: %d" % len(merging)
merge_sets(ds, remaining, merging)
print 'merged by hashes'
write_mapping(digits, ds, strings, N)
print 'done'
from hashes.simhash import simhash

if __name__ == '__main__':
    f = open('flat.txt', 'r')
    #f = open('thingiverse_all_names.csv')
    data = [line.strip() for line in f.readlines()]
    f.close()
    # print data
    all_hashes = dict([(d, simhash(d)) for d in data])
    for k, h in all_hashes.items():
        print "%s %s" % (k, h)
        print all_hashes['Flatpack Bunny'].similarity(h)
def run(self, obj, config):
    self.config = config
    self.obj = obj
    user = self.current_task.user
    tlp_value = self.config.get("tlp_value", "tlp_value")
    url = obj['value']
    if not (obj._meta['crits_type'] == 'Indicator' and obj['ind_type'] == 'URI'):
        self._error('This object type cannot use service Url analysis.')
        return False
    # verify url is http or https
    if url.startswith('https://') or url.startswith('http://'):
        # fetch the url with PhantomJS
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36"
        )
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any', '--web-security=false'])
        driver.set_window_size(1024, 768)
        driver.set_page_load_timeout(30)
        driver.get(url)
        time.sleep(3)
        #driver.save_screenshot('testing1.png')
        screen = driver.get_screenshot_as_png()
        ofile = io.BytesIO()
        im = Image.open(StringIO.StringIO(screen))
        im.save(ofile, 'PNG', optimize=True)
        ofile.seek(0)
        res = add_screenshot(description='Render of a website URL', tags=None, method=self.name,
                             source=obj.source, reference=None, analyst=self.current_task.user.username,
                             screenshot=ofile, screenshot_ids=None, oid=obj.id, tlp=tlp_value,
                             otype="Indicator")
        if res.get('message') and res.get('success') == True:
            self._warning("res-message: %s id:%s" % (res.get('message'), res.get('id')))
            self._add_result('ScreenShot URL', res.get('id'), {'Message': res.get('message')})
        # parse the HAR log
        har = driver.get_log('har')
        if type(har) is list and har:
            if type(har[0]) is dict and 'message' in har[0]:
                # convert the unicode message to json
                try:
                    har[0]['message'] = json.loads(har[0]['message'])
                except:
                    self._warning('Har log error to parse json')
                if type(har[0]['message']) is dict and 'log' in har[0]['message'] and \
                        type(har[0]['message']['log']) is dict and 'pages' in har[0]['message']['log']:
                    if type(har[0]['message']['log']['pages']) is list and har[0]['message']['log']['pages'] and \
                            type(har[0]['message']['log']['pages'][0]) is dict:
                        title = 'Result of '
                        if 'id' in har[0]['message']['log']['pages'][0]:
                            title += har[0]['message']['log']['pages'][0]['id']
                        if 'title' in har[0]['message']['log']['pages'][0]:
                            self._add_result(title, 'Title', {'value': har[0]['message']['log']['pages'][0]['title']})
                        # parse each request and response
                        if 'entries' in har[0]['message']['log'] and type(har[0]['message']['log']['entries']) is list \
                                and har[0]['message']['log']['entries']:
                            count = 1
                            type_r = ['cookies', 'queryString', 'headers']
                            type_rs = ['content', 'timings', 'cache']
                            for elem_rr in har[0]['message']['log']['entries']:
                                for k, v in elem_rr.iteritems():
                                    if type(v) is not dict:
                                        self._add_result(title + ' -- Informations Request & Response num:' + str(count),
                                                         k, {'value': v})
                                for k, v in elem_rr.iteritems():
                                    if type(v) is dict:
                                        for kx, vx in v.iteritems():
                                            self._add_result(title + ' -- Informations Request & Response num:' + str(count) + ' -- ' + str(k),
                                                             kx, {'value': vx})
                                count += 1
        # save the page source in rawdata
        if not user.has_access_to(RawDataACL.WRITE):
            self._info(driver.page_source.encode('utf8'))
        else:
            # the user can write
            result = handle_raw_data_file(driver.page_source.encode('utf8'), obj.source,
                                          user=self.current_task.user,
                                          description="Code page for URL: %s" % url,
                                          title=url, data_type="Text", tool_name=self.name,
                                          tool_version=self.version, tool_details=self.description)
            if result['success']:
                obj.add_relationship(result['object'], RelationshipTypes.CONTAINED_WITHIN,
                                     analyst=self.current_task.user.username,
                                     rel_reason="Extracted from URI")
                obj.save()
                self._add_result('Code Page', url, {
                    'RawData TLO ID': result['_id'],
                    'md5 file': md5(driver.page_source.encode('utf8')).hexdigest(),
                    'simhash': str(simhash(driver.page_source.encode('utf8')))
                })
        driver.close()
        driver.service.process.terminate()
        time.sleep(1)
        # get certificate information, because selenium does not provide this functionality
        # ref: https://stackoverflow.com/questions/30862099/how-can-i-get-certificate-issuer-information-in-python
        if url.startswith('https://'):
            try:
                host = urlparse(url).hostname
                port = urlparse(url).port
                if (port is None):
                    port = 443
                s = socks.socksocket()
                if settings.HTTP_PROXY:
                    type_proxy = socks.PROXY_TYPE_SOCKS5
                    if settings.HTTP_PROXY.startswith('http://'):
                        type_proxy = socks.PROXY_TYPE_HTTP
                    s.setproxy(type_proxy, urlparse(settings.HTTP_PROXY).hostname,
                               port=urlparse(settings.HTTP_PROXY).port)
                s.connect((host, port))
                ss = ssl.wrap_socket(s)
                pem_data = ssl.DER_cert_to_PEM_cert(ss.getpeercert(True))
                ss.close()
                s.close()
                cert = M2Crypto.X509.load_cert_string(pem_data)
                # add the ssl information to the results
                self._add_result('SSL informations', 'Subject', {'value': str(cert.get_subject().as_text())})
                self._add_result('SSL informations', 'Issuer', {'value': str(cert.get_issuer().as_text())})
                self._add_result('SSL informations', 'Version', {'value': str(cert.get_version())})
                self._add_result('SSL informations', 'Date before', {'value': str(cert.get_not_before())})
                self._add_result('SSL informations', 'Date after', {'value': str(cert.get_not_after())})
                self._add_result('SSL informations', 'Serial Number', {'value': str(cert.get_serial_number())})
                self._add_result('SSL informations', 'Verify', {'value': str(cert.verify())})
                self._add_result('SSL informations', 'Fingerprint MD5', {'value': str(cert.get_fingerprint())})
                for i in range(0, cert.get_ext_count()):
                    self._add_result('SSL informations Extension', str(cert.get_ext_at(i).get_name()),
                                     {'value': str(cert.get_ext_at(i).get_value())})
                # https://www.heikkitoivonen.net/m2crypto/api/M2Crypto.X509-module.html
            except:
                self._error('Error: get certificate informations.')
            self._info(str(cert))
        driver.service.process.kill()
        driver.quit()
        self._info('END')
def sh(f):
    return [simhash(open(f).read()), f]
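# Usage sketch (illustrative addition, not from the original source): sh() pairs a file's
# simhash with its path, so two files can be compared directly. The file names below are
# hypothetical placeholders.
a = sh('doc_a.txt')
b = sh('doc_b.txt')
print(a[0].similarity(b[0]), a[1], b[1])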