def distance():
    data = request.args.get('data', '')
    title1 = request.args.get('title1', '')
    title2 = request.args.get('title2', '')
    # ast.literal_eval only parses Python literals; eval would execute
    # arbitrary expressions taken straight from the query string.
    data = ast.literal_eval(data)
    print data
    res = {'title1': title1, 'title2': title2}
    nums = data['ids'].values()
    for num in nums:
        # Overwritten on every pass: only the last id's distance survives.
        res[u'原差距'] = mongo_spider.sim.distance(data['num'], num)  # "original gap"
    title_num1 = Simhash(title1).value - sys.maxint
    title_num2 = Simhash(title2).value - sys.maxint
    res[u'标题差距'] = mongo_spider.sim.distance(title_num1, title_num2)  # "title gap"
    article = article_mongo.find_one({'_id': data['_id']},
                                     {'content': 1, 'title': 1})
    content_num1 = Simhash(article['content']).value - sys.maxint
    text_num = Simhash(article['title'] + default +
                       html2text(article['content'])).value - sys.maxint
    res['content1'] = article['content']
    cursors = article_mongo.find({'_id': {'$in': data['ids'].keys()}},
                                 {'content': 1, 'title': 1})
    for cursor in cursors:
        content_num2 = Simhash(cursor['content']).value - sys.maxint
        text_num2 = Simhash(cursor['title'] + default +
                            html2text(cursor['content'])).value - sys.maxint
        res[u'正文差距'] = mongo_spider.sim.distance(content_num1, content_num2)  # "body gap"
        res[u'新差距'] = mongo_spider.sim.distance(text_num2, text_num)  # "new gap"
        res['content2'] = cursor['content']
    res['d'] = render_html_diff(res['content1'], res['content2'])
    return render_template('test/detail.html', data=res)
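# A minimal sketch of the `Simhash(...).value - sys.maxint` idiom used
# throughout these snippets (an assumption, not confirmed by the source:
# the offset shifts the unsigned 64-bit fingerprint toward the signed
# range so MongoDB, which has no unsigned 64-bit type, can store and
# index it). Python 2, like the surrounding code:
import sys
from simhash import Simhash

unsigned = Simhash(u'example text').value   # 0 .. 2**64 - 1
signed = unsigned - sys.maxint              # shifted toward the signed range
assert signed + sys.maxint == unsigned      # the shift is losslessly reversible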
def article(self):
    score = len(html2text(self.content)) + len(self.imgs) * 50
    if len(self.title) < 4 or score < 100:
        return None
    return {
        'title': self.title,
        'content': self.content,
        'src_name': self.xsource['text'],
        'src_link': self.xsource['link'],
        'pubtime': self.pubtime,
        'pages': self.pages,
        'imgs': self.imgs,
    }
def tests():
    data = request.args.get('data', '')
    assert data is not None
    # literal_eval instead of eval: the payload comes from the query string.
    res, data = {}, ast.literal_eval(data)
    article = article_mongo.find_one({'_id': data['_id']},
                                     {'content': 1, 'title': 1})
    cursors = article_mongo.find({'_id': {'$in': data['ids'].keys()}},
                                 {'content': 1, 'title': 1})
    for cursor in cursors:
        # Insert the `default` separator at different positions and repeat
        # counts to see how placement affects the simhash distance.
        for i in xrange(1, 10):
            text_num1 = Simhash(default * i + article['title'] +
                                html2text(article['content'])).value - sys.maxint
            text_num2 = Simhash(default * i + cursor['title'] +
                                html2text(cursor['content'])).value - sys.maxint
            text_num3 = Simhash(article['title'] + default * i +
                                html2text(article['content'])).value - sys.maxint
            text_num4 = Simhash(cursor['title'] + default * i +
                                html2text(cursor['content'])).value - sys.maxint
            text_num5 = Simhash(article['title'] + html2text(article['content']) +
                                default * i).value - sys.maxint
            text_num6 = Simhash(cursor['title'] + html2text(cursor['content']) +
                                default * i).value - sys.maxint
            text_num7 = Simhash(default * i + article['title'] + default * i +
                                html2text(article['content']) +
                                default * i).value - sys.maxint
            text_num8 = Simhash(default * i + cursor['title'] + default * i +
                                html2text(cursor['content']) +
                                default * i).value - sys.maxint
            distance = mongo_spider.sim.distance
            res[i] = [
                distance(text_num1, text_num2),
                distance(text_num3, text_num4),
                distance(text_num5, text_num6),
                distance(text_num7, text_num8),
            ]
    return render_template('test/tests.html', data=res)
def calc():
    res = {'code': 1}
    id1 = request.args.get('_id1', '')
    id2 = request.args.get('_id2', '')
    if not id1 or not id2:
        res['msg'] = 'article id is missing'
        return testjsonify(res)
    article1 = mongo_iweb.spider_article.find_one({'_id': id1})
    article2 = mongo_iweb.spider_article.find_one({'_id': id2})
    if not article1 or not article2:
        res['msg'] = 'article id is invalid'
        return testjsonify(res)
    try:
        text1 = article1['title'] + html2text(article1['content'])
        num1 = Simhash(text1).value - sys.maxint
        text2 = article2['title'] + html2text(article2['content'])
        num2 = Simhash(text2).value - sys.maxint
        # The flattened original computed the distance but never used it;
        # returning it in the payload is the presumed intent.
        res['distance'] = mongo_iweb.sim.distance(num1, num2)
    except Exception, e:
        res['msg'] = str(e)
    return testjsonify(res)
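# The `sim.distance` helper itself is not shown in these snippets; a minimal
# sketch of what a simhash distance conventionally is (an assumption, not the
# project's actual code): the Hamming distance between two 64-bit fingerprints.
def hamming_distance(a, b):
    # XOR leaves a set bit wherever the fingerprints disagree; the popcount
    # of that mask is the number of differing bits. Masking to 64 bits keeps
    # the offset (signed) representations comparable.
    return bin((a ^ b) & ((1 << 64) - 1)).count('1')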
def sim(handler, key, article, ext):
    mongo = handler.mongo
    mongo.sim.unset(article['_id'], article['sim'])
    if 'content' not in ext or not ext['content']:
        ext['content'] = handler.text_file.get(article['content'])
        if not ext['content']:
            raise ValueError('load content failed.')
        # The flattened original decoded an undefined name `content` here.
        ext['content'] = ext['content'].decode('utf-8')
    text = article['title'] + html2text(ext['content'])
    num = Simhash(text).value - sys.maxint
    near = mongo.sim.near(num)
    key = mongo.sim.save(article['_id'], num, near)
    return {'sim': key}
def segment(handler, key, article, ext):
    if 'content' not in ext or not ext['content']:
        ext['content'] = handler.text_file.get(article['content'])
        if not ext['content']:
            raise ValueError('load content failed.')
        # The flattened original decoded an undefined name `content` here.
        ext['content'] = ext['content'].decode('utf-8')
    words = segmentor.seg(article['title'], html2text(ext['content']))
    save_words(handler, article, words, ext)
    return {
        'pubdate': article['pubdate'],
        'id': article['id'],
        'words': article['words'],
        'tags': article['tags'],
    }
def article(self):
    try:
        score = len(html2text(self.content)) + len(self.imgs) * 50
    except lxml.etree.ParserError:
        # Empty or unparsable markup: treat it as a zero score.
        score = 0
    if len(self.title) < 4 or score < 50:
        return None
    return {
        'title': self.title,
        'content': self.content,
        'src_name': self.xsource['text'],
        'src_link': self.xsource['link'],
        'pubtime': self.pubtime,
        'pages': self.pages,
        'imgs': self.imgs,
    }
def get_page(self) -> Page:
    raw_page = self.get_raw_page()
    raw_posts = raw_page.find('article')

    if not raw_posts:
        logger.warning("No raw posts (<article> elements) were found in this page.")
        if logger.isEnabledFor(logging.DEBUG):
            content = textwrap.indent(
                utils.html2text(raw_page.html),
                prefix='| ',
                predicate=lambda _: True,
            )
            sep = '+' + '-' * 60
            logger.debug("The page url is: %s", self.response.url)
            logger.debug("The page content is:\n%s\n%s%s\n", sep, content, sep)

    return raw_posts
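# Usage note on the textwrap.indent call above: the predicate forces the
# '| ' prefix onto every line, including blank ones, which indent() would
# otherwise leave unprefixed (Python 3, like the snippet above):
import textwrap
print(textwrap.indent('a\n\nb', '| ', predicate=lambda _: True))
# | a
# |
# | b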
def sim(article, content):
    # Convert once, before the retry loop: re-running html2text on
    # already-converted text would mangle it after a failed attempt.
    content = html2text(content)
    if len(content) < 20:
        # Very short bodies hash poorly on their own, so prepend the title.
        text = article['title'] + content
    else:
        text = content
    while True:
        try:
            num = Simhash(text).value - sys.maxint
            near = iweb.sim.near(num)
            key = iweb.sim.save(article['long'], num, near)
            return key
        except (pymongo.errors.OperationFailure, IndexError), e:
            print str(e)
            gevent.sleep(1)  # brief backoff before retrying
def test_segment(url):
    url = url.split('#')[0].split('?')[0]
    if not url.startswith('http://'):
        return 'url does not start with http://'
    add_test_url(url)
    html = get_or_cache(url)
    extractor = ArticleExtractor(html, url)
    content = extractor.content
    if extractor.pages:
        content = ArticleMerger(
            url, extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            **extractor.selector
        ).content
    return json.dumps({
        'url': url,
        'title': extractor.title,
        'words': segmentor.seg(extractor.title, html2text(content)),
    })
def test_segment_all():
    urls = get_test_urls()
    res = []
    for url in urls:
        html = get_or_cache(url)
        extractor = ArticleExtractor(html, url)
        content = extractor.content
        if extractor.pages:
            content = ArticleMerger(
                url, extractor.title,
                fetch_urls(extractor.pages, handle=get_or_cache),
                **extractor.selector
            ).content
        res.append({
            'url': url,
            'title': extractor.title,
            'words': segmentor.seg(extractor.title,
                                   html2text(content, code=False)),
        })
    return json.dumps(res)
def sendIssue(self, event):
    """
    Sends the selected issues (one or more) to DefectDojo, whether they were
    selected from the DefectDojo tab or from the context menu in the Target
    tab. Due to current limitations in the DefectDojo API, request/response
    pairs cannot be attached *yet*.
    """
    checkMessage = self.checkSelection("test")
    if checkMessage:
        JOptionPane.showMessageDialog(self.getUiComponent().parent,
                                      checkMessage, "Error",
                                      JOptionPane.WARNING_MESSAGE)
        return
    if not hasattr(self.ddui, 'userID'):
        self.getUserId()
    if event.getActionCommand() == 'Send To DefectDojo (Existing Test)':
        issues = self.contextMenu._invoker.getSelectedIssues()
        lgt = len(issues)
    elif event.getActionCommand() == 'Send Issue':
        # Note: this path yields list indices, not issue objects; only the
        # context-menu branch below builds findings in this snippet.
        issues = self.ddui._listTargetIss.getSelectedIndices()
        lgt = len(issues)
    for i in range(lgt):
        ureqresp = []
        if event.getActionCommand() == 'Send To DefectDojo (Existing Test)':
            title = issues[i].getIssueName()
            description = (issues[i].getIssueDetail()
                           if issues[i].getIssueDetail()
                           else issues[i].getIssueBackground())
            severity = issues[i].getSeverity()
            if severity in ('Information', 'Informational'):
                severity = "Info"
            impact = issues[i].getIssueBackground()
            if issues[i].getRemediationBackground():
                mitigation = issues[i].getRemediationBackground() + '\n'
                if issues[i].getRemediationDetail():
                    mitigation += issues[i].getRemediationDetail()
            else:
                mitigation = str(issues[i].getIssueType())
            for mess in issues[i].getHttpMessages():
                ureqresp.append({
                    "req": self._helpers.bytesToString(mess.getRequest()),
                    "resp": self._helpers.bytesToString(mess.getResponse()),
                })
            url = str(issues[i].getUrl())
        description = html2text(description)
        impact = html2text(impact)
        mitigation = html2text(mitigation)
        # Strip single quotes from any field that is not valid JSON on its own.
        try:
            json.loads(description)
        except ValueError:
            description = description.replace("\'", "")
        try:
            json.loads(impact)
        except ValueError:
            impact = impact.replace("\'", "")
        try:
            json.loads(mitigation)
        except ValueError:
            mitigation = mitigation.replace("\'", "")
        data = {
            'title': title,
            'description': description,
            'severity': severity,
            'product': '/api/v1/products/' + self._helpers.urlEncode(self.ddui.productID.getText()) + '/',
            'engagement': '/api/v1/engagements/' + self._helpers.urlEncode(self.ddui.engagementID.getText()) + '/',
            'reporter': '/api/v1/users/' + self._helpers.urlEncode(str(self.ddui.userID)) + '/',
            'test': '/api/v1/tests/' + self._helpers.urlEncode(self.ddui.testID.getText()) + '/',
            'impact': impact,
            'active': True,
            'verified': True,
            'mitigation': mitigation,
            'static_finding': False,
            'dynamic_finding': False,
            'file_path': url,
            # 'steps_to_reproduce': ureqresp
        }
        data = json.dumps(data)
        self.checkUpdateSender()
        start_new_thread(self.sender.makeRequest,
                         ('POST', '/api/v1/findings/', data))
    message = ("Successfully imported (" + str(i + 1) +
               ") selected issue(s). Access Test : " +
               self.ddui.testID.getText())
    link = str(self.ddui.defectDojoURL.getText() + "test/" +
               self.ddui.testID.getText())
    linkDialog(message, link, JOptionPane, self.getUiComponent().parent)
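# A quick check on the quote-stripping above (a standalone sketch, not part
# of the extension): json.dumps escapes quotes itself, so apostrophes in
# finding text already survive a serialization round-trip without manual
# replacement.
import json
payload = json.dumps({'description': "it's <b>bold</b>"})
assert json.loads(payload)['description'] == "it's <b>bold</b>"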