def runBow(options):
    """Classify every tagged capture with the LSI detector and log the results.

    Walks ``traces/<trace>/tagged/<tag>/*.pcap``. For each capture the words
    returned by ``extractWords`` are handed to ``LsiDetector.classify`` and a
    row ``[file, decision, truth, correct]`` is written to
    ``detectors/bow/output.csv``. *truth* is the name of the tag directory the
    capture was found in and *correct* is 1 when the decision equals it,
    otherwise 0. Each comparison is also printed.

    Returns immediately when there is no ``traces`` directory.

    Args:
        options: Task options; not used by this function.
    """
    import csv
    import sys
    from process import extractWords
    from detectors.bow.detectLSI import LsiDetector

    lsi = LsiDetector()

    if not os.path.exists('traces'):
        return

    with pushd('traces'):
        for tracedir in os.listdir('.'):
            if not os.path.isdir(tracedir):
                continue
            with pushd(tracedir):
                if not os.path.exists('tagged'):
                    continue
                with pushd('tagged'):
                    # The relative path climbs back out of
                    # traces/<trace>/tagged (assumes pushd changes the
                    # working directory).
                    # NOTE(review): the file is opened in write mode once per
                    # trace directory, so rows from earlier trace directories
                    # are truncated away. Confirm whether a single shared
                    # report was intended.
                    if sys.version_info[0] < 3:
                        # Python 2: csv.writer expects a binary-mode file.
                        output = open('../../../detectors/bow/output.csv', 'wb')
                    else:
                        # Python 3: csv.writer expects text mode, newline=''.
                        output = open('../../../detectors/bow/output.csv', 'w',
                                      newline='')
                    # The with-block closes the file even if classification
                    # or a write raises.
                    with output:
                        writer = csv.writer(output)
                        writer.writerow(['file', 'decision', 'truth', 'correct'])
                        for tag in os.listdir('.'):
                            # Skip stray files, mirroring the trace-dir check.
                            if not os.path.isdir(tag):
                                continue
                            with pushd(tag):
                                for streamfile in glob.glob('*.pcap'):
                                    words = extractWords(streamfile, [])
                                    newtag = lsi.classify(words)
                                    print(tag + ' ?= ' + newtag)
                                    writer.writerow([streamfile, newtag, tag,
                                                     int(newtag == tag)])
def trainBow(options):
    """Build the bag-of-words LSI similarity index from the tagged traces.

    Does nothing when ``detectors/bow/similarity.index`` already exists.
    Otherwise each intermediate artifact is reused if present and rebuilt
    from ``traces/<trace>/tagged/<tag>/*.pcap`` if not:

    1. ``detectors/bow/words.dict`` - written by ``saveWords`` from the words
       extracted from every capture.
    2. ``detectors/bow/corpus.mm`` - one bag-of-words document per tag name,
       written by ``saveCorpus``; the tag names are written alongside it to
       ``detectors/bow/tags.json`` in document order.
    3. A two-topic ``LsiModel`` over the corpus, whose ``MatrixSimilarity``
       index is saved to ``detectors/bow/similarity.index``.

    Returns early, without building the index, when captures are needed but
    there is no ``traces`` directory.

    Args:
        options: Task options; not used by this function.
    """
    from process import saveWords, saveCorpus
    from gensim import corpora, models, similarities
    from gensim.corpora.dictionary import Dictionary

    if os.path.exists('detectors/bow/similarity.index'):
        return

    dictionary = None
    if os.path.exists('detectors/bow/words.dict'):
        # NOTE(review): gensim's Dictionary constructor takes documents, not
        # a file name; a saved dictionary is normally restored with
        # Dictionary.load(). Confirm against what saveWords writes.
        dictionary = Dictionary('detectors/bow/words.dict')
    else:
        words = _collectBowWords()
        if words is None:
            return
        saveWords(words, 'detectors/bow/words.dict')

    if os.path.exists('detectors/bow/corpus.mm'):
        corpus = corpora.MmCorpus('detectors/bow/corpus.mm')
    else:
        collected = _collectBowDocuments()
        if collected is None:
            return
        tags, documents = collected
        if dictionary is None:
            # The dictionary file was just written above. Load it the same
            # way as the reuse branch; previously this path hit an unbound
            # local when doc2bow was called.
            dictionary = Dictionary('detectors/bow/words.dict')
        corpus = [dictionary.doc2bow(document) for document in documents]
        saveCorpus(corpus, 'detectors/bow/corpus.mm')
        with open('detectors/bow/tags.json', 'w') as f:
            f.write(json.dumps(tags))

    lsi = models.LsiModel(corpus, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save('detectors/bow/similarity.index')


def _collectBowWords():
    """Fold extractWords over every tagged capture; None if 'traces' is missing."""
    from process import extractWords

    words = []
    if not os.path.exists('traces'):
        return None
    with pushd('traces'):
        for tracedir in os.listdir('.'):
            if not os.path.isdir(tracedir):
                continue
            with pushd(tracedir):
                if not os.path.exists('tagged'):
                    continue
                with pushd('tagged'):
                    for tag in os.listdir('.'):
                        # Skip stray files, mirroring the trace-dir check.
                        if not os.path.isdir(tag):
                            continue
                        with pushd(tag):
                            for streamfile in glob.glob('*.pcap'):
                                print(streamfile)
                                words = extractWords(streamfile, words)
    return words


def _collectBowDocuments():
    """Return (tags, documents), one merged word list per tag; None if 'traces' is missing."""
    from process import extractWords

    tags = []
    documents = []
    if not os.path.exists('traces'):
        return None
    with pushd('traces'):
        for tracedir in os.listdir('.'):
            if not os.path.isdir(tracedir):
                continue
            with pushd(tracedir):
                if not os.path.exists('tagged'):
                    continue
                with pushd('tagged'):
                    for tag in os.listdir('.'):
                        # Skip stray files, mirroring the trace-dir check.
                        if not os.path.isdir(tag):
                            continue
                        with pushd(tag):
                            # The same tag name can appear under several
                            # trace directories; they share one document.
                            if tag not in tags:
                                tags.append(tag)
                                documents.append([])
                            slot = tags.index(tag)
                            doc = []
                            for streamfile in glob.glob('*.pcap'):
                                print(streamfile)
                                doc = extractWords(streamfile, doc)
                            documents[slot] = documents[slot] + doc
    return tags, documents