def get_predict_keyword(keyword):
    ret = []
    for word in get_headword(keyword):
        print(word)  # debug trace of each headword considered
        if word in iotools.load_keywords_dict()['all']:
            ret += tokens.network.keyword_neighbors(word)
    return sorted(ret, key=lambda x: x[1], reverse=True)
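# Illustrative call (the query and scores below are invented; get_headword
# and keyword_neighbors are this repo's helpers, not library functions):
#
#   >>> get_predict_keyword('denial of service attack')
#   [('ddos', 0.91), ('botnet', 0.40), ...]
#
# Every headword of the query that is itself a known keyword contributes its
# weighted network neighbours, and the merged list is re-ranked by score.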
def get_all_keywords():
    keywords = iotools.load_keywords_dict()
    # Order keywords by how many (dataset, position) entries they carry,
    # fewest first.
    keywords['all'] = OrderedDict(
        sorted(keywords['all'].items(), key=lambda x: len(x[1])))
    # Collapse each keyword's entries to the unique dataset names.
    for key in keywords['all']:
        keywords['all'][key] = list({dataset for dataset, _ in keywords['all'][key]})
    dataset_ids = {dataset['name']: index
                   for index, dataset in enumerate(iotools.load_datasets())}
    return render_template('dataset-keywords.html',
                           keywords=keywords, dataset_ids=dataset_ids)
def keyword_neighbors(keyword):
    keywords = iotools.load_keywords_dict()
    keyword_weight = weighting_function(keyword)
    related_datasets = set(x[0] for x in keywords['all'][keyword])
    # Count, for every other keyword, in how many of these datasets it appears.
    ret = {}
    for dataset in related_datasets:
        dataset_keywords = iotools.load_dataset_keywords_dict(dataset)['all']
        for keyword2 in dataset_keywords:
            ret[keyword2] = ret.get(keyword2, 0) + 1
    # Turn raw co-occurrence counts into weights normalised by both keywords'
    # rarity and by the neighbourhood size.
    for keyword2, val in ret.items():
        weight = (1.0 * weighting_function(keyword2) * mylog(val)
                  / keyword_weight / mylog(len(related_datasets)))
        ret[keyword2] = weight
    return sorted(ret.items(), key=lambda x: x[1], reverse=True)
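# How the neighbour score above behaves (a reading, not new code; mylog is
# assumed to be a smoothed-logarithm helper defined elsewhere in this repo):
#
#     score(keyword2) = weighting_function(keyword2) * mylog(val)
#                       ---------------------------------------------
#                       keyword_weight * mylog(len(related_datasets))
#
# It grows with the number of shared datasets (val), is normalised by the
# size of the seed keyword's neighbourhood, and the rarity ratio favours
# specific candidates over ubiquitous ones.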
def discovery():
    keywords = list(iotools.load_keywords_dict()['all'].keys())
    if 'keyword' in request.form:
        keyword = request.form.get('keyword', None)
    elif 'keyword' in request.args:
        keyword = request.args.get('keyword', None)
    else:
        keyword = None
    result = {}
    if keyword:
        result["stopword"] = process_keyword.get_stopword(keyword)
        result["headword"] = process_keyword.get_headword(keyword)
        result["sans"] = process_keyword.get_sans(keyword)
        result["sans_head"] = process_keyword.get_sans_head(keyword)
        result["symantec"] = process_keyword.get_symantec(keyword)
        result["symantec_head"] = process_keyword.get_symantec_head(keyword)
        result["predict"] = process_keyword.get_predict_keyword(keyword)
        result["capec"] = process_keyword.get_capec(keyword)
        result["capec_head"] = process_keyword.get_capec_head(keyword)
    return render_template("keyword-search.html", suggestions=keywords,
                           keyword=keyword, result=result)
def tag_cloud_text_new_keywords_weighted():
    ret = []
    for keyword in iotools.load_keywords_dict()['all']:
        # Repeat each keyword proportionally to its weight so a
        # frequency-based tag-cloud renderer sizes it accordingly.
        val = int(weighting_function(keyword) * 1000)
        ret += [normalized(keyword)] * val
    return ret
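# Sketch of the repetition scheme above (the weights are hypothetical): a
# keyword with weight 0.5 yields int(0.5 * 1000) = 500 copies of its
# normalised form, one with weight 0.333 yields 333 copies, so the renderer
# draws the rarer keyword about 1.5x more prominently.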
def weight_all_keywords():
    return weight_keywords(iotools.load_keywords_dict()['all'])
#!/usr/bin/env python -u
# coding=utf-8
from math import log

from utils import iotools

__author__ = 'xl'

keywords = iotools.load_keywords_dict()


def weighting_function(keyword):
    # Rarer keywords (attached to fewer datasets) get a higher weight.
    keyword_freq = log(len(keywords['all'][keyword]) + 1, 2)
    weight = 1 / keyword_freq if keyword_freq > 0 else 0
    return weight


def weight_keywords(keyword_list):
    ret = []
    for keyword in keyword_list:
        ret.append((keyword, weighting_function(keyword)))
    return sorted(ret, key=lambda x: x[1])


def dataset_weighting_function(dataset):
    keywords = iotools.load_dataset_keywords_dict(dataset['name'])['all']
    keywords_weight = weight_keywords(keywords)
    return sum(x[1] for x in keywords_weight)


def weight_all_datasets():
    # Body reconstructed from the pattern above (an assumption, not the
    # original source): weight every dataset and sort lightest-first,
    # mirroring weight_keywords.
    ret = [(dataset['name'], dataset_weighting_function(dataset))
           for dataset in iotools.load_datasets()]
    return sorted(ret, key=lambda x: x[1])
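# Worked example of the weighting curve above (arithmetic only; the dataset
# counts are hypothetical): a keyword attached to n datasets gets weight
# 1 / log2(n + 1), so
#
#   n = 1  ->  1 / log2(2) = 1.0
#   n = 3  ->  1 / log2(4) = 0.5
#   n = 7  ->  1 / log2(8) ~ 0.333
#
# i.e. the weight decays slowly with frequency and never quite reaches zero,
# which is why the ascending sort in weight_keywords puts the most common
# keywords first.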
def all_keywords_neighbors():
    ret = {}
    for keyword in iotools.load_keywords_dict()['all']:
        ret[keyword] = keyword_neighbors(keyword)
    # Keywords with the largest neighbourhoods first.
    return sorted(ret.items(), key=lambda y: len(y[1]), reverse=True)
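# Illustrative result shape (keyword names and scores are invented):
#
#   >>> all_keywords_neighbors()[0]
#   ('malware', [('botnet', 1.2), ('trojan', 0.9), ...])
#
# The leading entries are the keywords with the largest neighbourhoods, i.e.
# the best-connected nodes in the keyword co-occurrence network.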