def preprocess(args):
    parser = argparse.ArgumentParser(description="args for preprocess")
    parser.add_argument("data_type", type=str)
    parser.add_argument("input", type=str)
    parser.add_argument("-o", "--out_dir", help="output dir", type=str, default="")
    parser.add_argument("-w", "--workers", help="number of workers", type=int, default=20)
    parser.add_argument("-d", "--debug", help="debug mode", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="verbose mode", action="store_true", default=False)
    options = parser.parse_args(args)
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(folder=options.out_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)
    processor = get_processor(options.data_type, options.input, out_dir,
                              logger, options.workers)
    processor.start()
def coverage(args):
    parser = argparse.ArgumentParser(description="args for coverage")
    parser.add_argument("-o", "--out_dir", help="output dir", type=str, default="")
    parser.add_argument("-d", "--debug", help="debug mode", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="verbose mode", action="store_true", default=False)
    options = parser.parse_args(args)
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(options.out_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)
    Coverage(out_dir=out_dir, logger=logger).analyze()
def compare_impl(probability, occurrence, model, output):
    res = dict()
    logging.info("Start comparing...")
    start_ts = time.time()
    for row_idx, row in enumerate(probability):
        word = model.wv.index2word[row_idx]
        top_prob_indices = gensim.matutils.argsort(row, topn=40, reverse=True)
        top_occur_indices = gensim.matutils.argsort(occurrence[row_idx], topn=40, reverse=True)
        top_prediction = [(model.wv.index2word[index1], float(row[index1]))
                          for index1 in top_prob_indices]
        top_occurrence = [(model.wv.index2word[index1], float(occurrence[row_idx][index1]))
                          for index1 in top_occur_indices]
        res[word] = dict()
        res[word]['most_probable'] = top_prediction
        res[word]['most_occurred'] = top_occurrence
        # res[word]['bhattacharyya'] = bhattacharyya(row, occurrence[row_idx])
        res[word]['cosine'] = cosine(row, occurrence[row_idx])
        if row_idx == len(probability) - 1 or row_idx % 100 == 0:
            current_ts = time.time()
            logging.info(
                "Processed_words: {:d} Progress: {:.02%} Words/sec: {:.02f}".format(
                    row_idx, row_idx / len(probability),
                    row_idx / (current_ts - start_ts)))
    outfile = get_res_filepath(output)
    with open(outfile, 'w') as fd:
        json.dump(res, fd, indent=2)
    logging.info("Job finished, results saved at '{}'".format(outfile))
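# A note on the 'cosine' value stored above: assuming it is
# scipy.spatial.distance.cosine (the import is outside this excerpt), the
# stored number is a distance, 1 - cos(theta), between the predicted and the
# observed distributions over context words: 0.0 when the two rows point in
# the same direction, 1.0 when they are orthogonal. For example:
#
#   from scipy.spatial.distance import cosine
#   cosine([1.0, 0.0], [2.0, 0.0])  # 0.0: same direction
#   cosine([1.0, 0.0], [0.0, 3.0])  # 1.0: orthogonal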
def get_text_stats(args):
    parser = argparse.ArgumentParser(description="args for text stats")
    parser.add_argument("data_type", choices=["reddit", "hackforums", "darkode", "nulled"], type=str)
    parser.add_argument("-o", "--out_dir", help="output dir", type=str, default="")
    parser.add_argument("-d", "--debug", help="debug mode", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="verbose mode", action="store_true", default=False)
    options = parser.parse_args(args)
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(options.out_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)
    TextStats(data_type=options.data_type, out_dir=out_dir, logger=logger).analyze()
def get_coverage(white, dark):
    white_set = white.keys()
    dark_set = dark.keys()
    common_set = white_set & dark_set
    coverage = len(common_set) / len(dark_set)
    dark_total = sum(dark.values())
    common_total = sum(dark[x] for x in common_set)
    print("common unique words coverage: {:.2%} ({}/{})".format(
        coverage, len(common_set), len(dark_set)))
    print("common words coverage: {:.2%} ({}/{})".format(
        common_total / dark_total, common_total, dark_total))
    missed_words = {x: dark[x] for x in (dark_set - white_set)}
    outfile = get_res_filepath("missed_words.json")
    with open(outfile, 'w') as fd:
        json.dump(missed_words, fd, indent=2)
    print(outfile)
    common_words = {x: dark[x] for x in common_set}
    outfile = get_res_filepath("common_words.json")
    with open(outfile, 'w') as fd:
        json.dump(common_words, fd, indent=2)
    print(outfile)
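# A minimal worked example of the two coverage metrics above, with
# hypothetical vocabularies (word -> count), not data from the corpora:
#
#   dark  = {"exploit": 5, "crypter": 3, "fud": 2}
#   white = {"exploit": 1, "crypter": 7}
#
#   unique words coverage: |{exploit, crypter}| / 3 = 2/3 ~ 66.67%
#   words coverage:        (5 + 3) / (5 + 3 + 2)    = 8/10 = 80.00%
#
# The first metric weights every dark word equally; the second weights each
# word by how often it occurs in the dark corpus.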
def stats(args):
    global logger
    parser = argparse.ArgumentParser(description="args for stats")
    parser.add_argument("-a", "--annotations", help="annotations file", type=str, default="annotations.json")
    parser.add_argument("-m", "--model", help="model name", type=str, default="forums.it100")
    parser.add_argument("-o", "--out_file", help="output file", type=str, default="stats")
    # parser.add_argument(
    #     "-w", "--workers", help="number of workers", type=int, default=10)
    parser.add_argument("-d", "--debug", help="debug mode", action="store_true", default=False)
    parser.add_argument("-s", "--sentence", help="output sentence", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="verbose mode", action="store_true", default=False)
    options = parser.parse_args(args)
    log_config = dict(name=__file__, debug=options.debug)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)
    with open(get_res_filepath(fn=options.annotations)) as fd:
        annotations = json.load(fd)
    stats_impl(annotations=annotations, model=options.model,
               out_file=options.out_file, sen=options.sentence)
def build_vocab_impl(input, output, min_count):
    if input and os.path.isfile(input):
        sentences = LineSentence(input, max_sentence_length=10000)
    else:
        print("Error: input file '{}' not found".format(input))
        return 1
    outfile = get_res_filepath(fn=output)
    # -cbow 0 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 0 -iter 15
    model = gensim.models.Word2Vec(min_count=min_count)
    model.build_vocab(sentences=sentences)
    model.save(outfile)
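# Hypothetical invocation (file names are illustrative only). LineSentence
# expects one whitespace-tokenized sentence per line, and build_vocab only
# scans the corpus, so the saved model carries a vocabulary but no trained
# vectors yet:
#
#   build_vocab_impl(input="corpus.txt", output="corpus.vocab.model", min_count=5)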
def prepare(args):
    parser = argparse.ArgumentParser(description="args for prepare")
    parser.add_argument("-i", "--in_dir", help="input dir", type=str, default="")
    parser.add_argument("-o", "--out_dir", help="output dir", type=str, default="")
    parser.add_argument("-t", "--workers", help="number of workers", type=int, default=10)
    parser.add_argument("-f", "--forums", nargs='+', required=True, choices=allforums,
                        help="specifies target forum(s)")
    parser.add_argument("-d", "--debug", help="debug mode", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="verbose mode", action="store_true", default=False)
    options = parser.parse_args(args)
    selections = [f for f in options.forums if f in allforums]
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(folder=os.path.join('text2data', options.out_dir))
    in_dir = os.path.join(PREPROCESSED_DIR, options.in_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)
    TrainingPrepare(in_dir=in_dir, out_dir=out_dir, logger=logger,
                    forums=selections, workers=options.workers).go()
def predict_impl(good_model, bad_model, output):
    logging.info("start calculating probability")
    if not good_model.negative or not bad_model.negative:
        raise RuntimeError(
            "We have currently only implemented predict_output_word for the "
            "negative sampling scheme, so you need to have run word2vec with "
            "negative > 0 for this to work.")
    if not hasattr(bad_model.wv, 'syn0') or not hasattr(good_model, 'syn1neg'):
        raise RuntimeError(
            "Parameters required for predicting the output words not found.")
    syn0 = bad_model.wv.syn0
    syn1 = good_model.syn1neg
    probability = exp(dot(syn0, syn1.T))
    rows, columns = probability.shape
    logging.info("probability matrix rows: {}, columns: {}".format(rows, columns))
    sums = np_sum(probability, axis=1)
    logging.info("probability sum matrix shape: {}".format(sums.shape))
    probability = probability / sums[:, None]
    logging.info("probability calculation finished")
    pred_outfile = get_res_filepath(fn="{}.prob.npy".format(output))
    t1 = Thread(target=save, args=(pred_outfile, probability))
    t1.start()
    # logging.info("start occurrence counting")
    # occurrence = zeros((rows, rows))
    # # TODO
    # logging.info("occurrence counting finished")
    # occur_outfile = get_res_filepath(fn="{}.occur.npy".format(output))
    # t2 = Thread(target=save, args=(occur_outfile, occurrence))
    # save(occur_outfile, occurrence)
    # compare_outfile = get_res_filepath(fn="{}.compare.json".format(output))
    # compare_impl(probability, occurrence, bad_model, compare_outfile)
    t1.join()
    logging.info("prediction results saved at '{}'".format(pred_outfile))
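# A toy sketch of the probability computation in predict_impl, with made-up
# 2-word, 3-dimensional embeddings (illustrative numbers only): each row of
# exp(syn0 @ syn1.T), divided by its row sum, is a softmax over output words,
# i.e. a probability distribution over contexts for one input word.
def _predict_prob_demo():
    import numpy as np
    syn0 = np.array([[0.1, 0.2, 0.3], [0.0, -0.1, 0.4]])  # input vectors
    syn1 = np.array([[0.2, 0.0, 0.1], [-0.3, 0.1, 0.2]])  # output vectors
    prob = np.exp(np.dot(syn0, syn1.T))
    prob = prob / np.sum(prob, axis=1)[:, None]
    assert np.allclose(prob.sum(axis=1), 1.0)  # each row sums to 1
    return prob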
def compare_pred_impl(p1, p2, p3, model, output, threads_n):
    progress = AtomicCounter()
    res = dict()
    logging.info("Start comparing...")
    start_ts = time.time()
    threads = []
    batch = math.ceil(len(p1) / threads_n)
    for i in range(threads_n):
        t = Thread(target=compare_pred_thread,
                   args=(res, p1, p2, p3, model, batch * i, batch + batch * i,
                         start_ts, progress))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    corrs = [x['correlation'] for x in res.values()]
    pvs = [x['pvalue'] for x in res.values()]
    jac40 = list(zip(*[x['jac40'] for x in res.values()]))
    jac100 = list(zip(*[x['jac100'] for x in res.values()]))
    jac1000 = list(zip(*[x['jac1000'] for x in res.values()]))
    prob_std1 = [x['prob_std1'] for x in res.values()]
    prob_std2 = [x['prob_std2'] for x in res.values()]
    prob_std3 = [x['prob_std3'] for x in res.values()]
    stats = dict()
    stats["correlation"] = stat_dict(corrs)
    stats["pvalue"] = stat_dict(pvs)
    stats["jac40"] = stat_dict(jac40[0]), stat_dict(jac40[1])
    stats["jac100"] = stat_dict(jac100[0]), stat_dict(jac100[1])
    stats["jac1000"] = stat_dict(jac1000[0]), stat_dict(jac1000[1])
    stats["prob_std1"] = stat_dict(prob_std1)
    stats["prob_std2"] = stat_dict(prob_std2)
    stats["prob_std3"] = stat_dict(prob_std3)
    outfile = get_res_filepath(output)
    with open(outfile, 'w') as fd:
        json.dump(dict(stats=stats, details=res), fd, indent=2)
    logging.info("Job finished, results saved at '{}'".format(outfile))
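# Note on the batching above: with batch = ceil(len(p1) / threads_n), the last
# thread's end index (batch * threads_n) can exceed len(p1), so
# compare_pred_thread (not shown in this excerpt) is presumably expected to
# clamp its range. E.g. len(p1) = 10, threads_n = 4 gives batch = 3 and ranges
# [0, 3), [3, 6), [6, 9), [9, 12).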
def parse_annotated(args):
    global logger
    parser = argparse.ArgumentParser(description="args for parse_annotated")
    parser.add_argument("-i", "--in_dir", help="input dir", type=str, default="")
    parser.add_argument("-o", "--out_file", help="output file", type=str, default="annotations.json")
    parser.add_argument("-w", "--workers", help="number of workers", type=int, default=10)
    parser.add_argument("-d", "--debug", help="debug mode", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="verbose mode", action="store_true", default=False)
    options = parser.parse_args(args)
    log_config = dict(name=__file__, debug=options.debug)
    out_file = get_res_filepath(fn=options.out_file)
    in_dir = os.path.join(PREPROCESSED_DIR, options.in_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)
    parse_annotated_impl(in_dir=in_dir, out_file=out_file)
def train(args):
    # log_config = dict(name=__file__, console_verbosity=logging.INFO)
    # logger = init_log(**log_config)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    text8 = gensim.models.word2vec.Text8Corpus(
        "/u/kanyuan/text8/text8_nonstop", max_sentence_length=10000)
    sentences = list(text8)
    outfile = get_res_filepath(fn="text8_nonstop.model_1")
    # -cbow 0 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 0 -iter 15
    model = gensim.models.Word2Vec(sentences, workers=20, window=10,
                                   negative=25, sg=1, size=200, sample=0.0001,
                                   iter=15, compute_loss=True)
    model.save(outfile)
import os
import sqlite3
import traceback
from multiprocessing.pool import ThreadPool
from collections import Counter

from nltk.tokenize import sent_tokenize
# from gensim.utils import tokenize
from ..utils.misc import tokenize
from monster.misc import get_res_filepath
from monster.log import init_log
from monster.atomic import AtomicCounter
from .preprocessor import pattern_url, pattern_email, pattern_hash, en_stopwords

PREPROCESSED_DIR = os.path.abspath(
    os.path.join(get_res_filepath(), os.pardir, "preprocessing"))
allforums = ["darkode", "hackforums", "nulled", "silkroad"]
frequent_bar = 10
code_indicators = "(){}[].;\""


class TrainingPrepare(object):
    def __init__(self, in_dir, out_dir, logger, forums, workers):
        self.logger = logger
        self.out_dir = out_dir
        self.db_dir = in_dir
        self.selections = forums
        self.logger.info("Init finished")
        self.pool = ThreadPool(processes=workers)
import json
import argparse
import os
import logging
import gensim
import time
import math
import numpy as np
from threading import Thread

from scipy.stats import spearmanr
from monster.misc import get_res_filepath
from monster.atomic import AtomicCounter

DATA_DIR = os.path.abspath(
    os.path.join(get_res_filepath(), os.pardir, "compare 2 predictions"))


def compare_pred(args):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    parser = argparse.ArgumentParser(description="args for prediction")
    parser.add_argument("--prob1", help="prob1 file", type=str)
    parser.add_argument("--prob2", help="prob2 file", type=str)
    parser.add_argument("--prob3", help="prob3 file", type=str)
    parser.add_argument("-m", "--model", help="either good or bad model", type=str)
    # number of worker threads passed through to compare_pred_impl
    parser.add_argument("-t", "--threads", help="number of threads", type=int, default=10)
def prepare(args):
    parser = argparse.ArgumentParser(description="args for prepare")
    parser.add_argument("-i", "--in_dir", help="input dir", type=str, default="")
    parser.add_argument("-o", "--out_dir", help="output dir", type=str, default="")
    parser.add_argument("-t", "--workers", help="number of workers", type=int, default=10)
    parser.add_argument("-f", "--forums", nargs='+', required=True,
                        choices=all_dark_forums + all_white_forums,
                        help="specifies target forum(s)")
    parser.add_argument("-d", "--debug", help="debug mode", action="store_true", default=False)
    parser.add_argument("-v", "--verbose", help="verbose mode", action="store_true", default=False)
    options = parser.parse_args(args)
    dark_selections = []
    white_selections = []
    wiki = False
    for choice in options.forums:
        if choice == "wiki":
            wiki = True
        elif choice in all_dark_forums:
            dark_selections.append(choice)
        elif choice in all_white_forums:
            white_selections.append(choice)
    log_config = dict(name=__file__, debug=options.debug)
    out_dir = get_res_filepath(folder=options.out_dir)
    in_dir = os.path.join(PREPROCESSED_DIR, options.in_dir)
    if options.verbose:
        log_config['console_verbosity'] = logging.INFO
    logger = init_log(**log_config)
    TrainingPrepare(in_dir=in_dir, out_dir=out_dir, logger=logger,
                    dark=dark_selections, white=white_selections, wiki=wiki,
                    workers=options.workers).go()
def __init__(self, out_dir, logger):
    self.logger = logger
    self.out_dir = out_dir
    self.vocab_dir = get_res_filepath()
    self.dark_vocabs = dict()
    self.white_vocabs = dict()
def train(args):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    parser = argparse.ArgumentParser(description="args for training")
    parser.add_argument("-g", "--good", help="good corpus file", type=str)
    parser.add_argument("-b", "--bad", help="bad corpus file", type=str)
    parser.add_argument("-o", "--output", help="output filename", type=str, default="word2vec.model")
    parser.add_argument("-s", "--size", help="word vector size", type=int, default=100)
    parser.add_argument("-a", "--alpha", help="initial learning rate", type=float, default=0.025)
    parser.add_argument("-w", "--window", help="window size", type=int, default=5)
    parser.add_argument("-S", "--sample", help="subsampling rate", type=float, default=1e-3)
    parser.add_argument("-T", "--threads", help="thread number", type=int, default=3)
    parser.add_argument("--min_alpha", help="minimal learning rate", type=float, default=0.0001)
    parser.add_argument("--sg", help="skip gram (1) or cbow (0)", type=int, default=0)
    parser.add_argument("--hs", help="using hierarchical softmax (1) or not (0)", type=int, default=0)
    parser.add_argument("-n", "--negative", help="negative sampling", type=int, default=5)
    parser.add_argument("--cbow_mean", help="cbow mean", type=int, default=1)
    parser.add_argument("-i", "--iter", help="iterations", type=int, default=5)
    parser.add_argument("--min_count", help="minimal occurrence of words to be considered", type=int, default=5)
    options = parser.parse_args(args)

    vocab = dict()
    logging.info("loading corpus...")
    if options.good and os.path.isfile(options.good):
        good_sentences = list(LineSentence(options.good, max_sentence_length=10000))
        vocab['good'] = get_vocab(options.good)
    else:
        logging.error("Error: good corpus file '{}' not found".format(options.good))
        return 1
    if options.bad and os.path.isfile(options.bad):
        bad_sentences = list(LineSentence(options.bad, max_sentence_length=10000))
        vocab['bad'] = get_vocab(options.bad)
    else:
        bad_sentences = list()
    min_count = options.min_count
    good_outfile = get_res_filepath(fn="{}.good.model".format(options.output))
    bad_outfile = get_res_filepath(fn="{}.bad.model".format(options.output))
    vocab_outfile = get_res_filepath(fn="{}.vocab".format(options.output))
    with open(vocab_outfile, "w") as fd:
        json.dump(vocab, fd)
    good_model = gensim.models.Word2Vec(
        workers=options.threads, window=options.window,
        negative=options.negative, sg=options.sg, size=options.size,
        sample=options.sample, min_count=min_count, iter=options.iter,
        alpha=options.alpha, min_alpha=options.min_alpha, hs=options.hs,
        cbow_mean=options.cbow_mean)
    good_model.build_vocab(good_sentences + bad_sentences)
    good_model.train(good_sentences, total_examples=len(good_sentences),
                     epochs=good_model.iter)
    good_model.save(good_outfile)
    if bad_sentences:
        bad_model = gensim.models.Word2Vec(
            workers=options.threads, window=options.window,
            negative=options.negative, sg=options.sg, size=options.size,
            sample=options.sample, min_count=min_count, iter=options.iter,
            alpha=options.alpha, min_alpha=options.min_alpha, hs=options.hs,
            cbow_mean=options.cbow_mean)
        bad_model.build_vocab(good_sentences + bad_sentences)
        bad_model.train(bad_sentences, total_examples=len(bad_sentences),
                        epochs=bad_model.iter)
        bad_model.save(bad_outfile)
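# Design note on the training scheme above: both models build their vocabulary
# from good_sentences + bad_sentences, so word indices are aligned across the
# two embedding spaces; predict_impl relies on this when it combines
# bad_model.wv.syn0 with good_model.syn1neg. A hypothetical invocation (file
# names are illustrative, not from the source):
#
#   train(["-g", "reddit.txt", "-b", "darkode.txt",
#          "-o", "forums", "--sg", "1", "-n", "25", "-i", "15"])
#
# which writes forums.good.model, forums.bad.model, and forums.vocab.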
import os
import logging

from gensim.matutils import argsort, unitvec
from gensim.utils import tokenize as tokenize1
from ..utils.misc import tokenize as tokenize2
from collections import defaultdict, Counter
from multiprocessing.pool import ThreadPool

from nltk.tokenize import sent_tokenize
# from gensim.utils import tokenize
from monster.misc import get_res_filepath
from monster.log import init_log

PREPROCESSED_DIR = os.path.abspath(
    os.path.join(get_res_filepath(), os.pardir, "preprocessing"))
MODEL_DIR = os.path.abspath(
    os.path.join(get_res_filepath(), os.pardir, os.pardir, "word2vec/jargon"))
tokenize = tokenize2


def load_cmodel(model_fn):
    # Load a word2vec model in the C text format: a "rows columns" header
    # line followed by one "<word> <v1> ... <vN>" line per word.
    model = dict()
    with open(model_fn) as fd:
        rows, columns = map(int, fd.readline().split())
        for idx, line in enumerate(fd):
            fields = line.strip().split()
            if len(fields) != columns + 1:
                logging.error("malformatted model file")
def stats_impl(annotations, model, out_file, sen):
    # Derive both output paths from the base name before it is reassigned.
    out_file2 = get_res_filepath(fn="{}_missed.csv".format(out_file))
    out_file = get_res_filepath(fn="{}.csv".format(out_file))
    logger.info("init finished")
    cands = list(get_candidates(model))
    words = [x[1] for x in cands]
    if sen:
        logger.info("start preparing data")
        db_dark = prepare_dark(words)
        logger.info("dark data preparing finished")
        db_white = prepare_white(words)
        logger.info("white data preparing finished")
    else:
        db_dark = dict()
        db_white = dict()
    csvfd = open(out_file, 'w')
    spamwriter = csv.writer(csvfd)
    for rank, word, sim, good_interpretation, bad_interpretation, normal_interpretation in cands:
        logger.info("processing word '{}'".format(word))
        good_sen = "...... ".join(db_white.get(word, []))
        bad_sen = "...... ".join(db_dark.get(word, []))
        good_interpretation = [x[0] for x in good_interpretation]
        bad_interpretation = [x[0] for x in bad_interpretation]
        normal_interpretation = [x[0] for x in normal_interpretation]
        if word in annotations:
            labeled = True
            label = label2str(annotations[word])
        else:
            labeled = False
            label = ""
        spamwriter.writerow([
            rank, word, good_interpretation, bad_interpretation, bad_sen,
            normal_interpretation, good_sen, labeled, label
        ])
    csvfd.close()
    csvfd = open(out_file2, 'w')
    spamwriter = csv.writer(csvfd)
    spamwriter.writerow([
        "word", "score", "gcn", "bcn", "good_interpretation",
        "bad_interpretation", "bad_sen", "normal_interpretation", "good_sen",
        "labeled", "label"
    ])
    for word in annotations:
        if word not in words:
            sim, gcn, bcn, good_interpretation, bad_interpretation, normal_interpretation = get_info(word)
            logger.info("processing missing word '{}'".format(word))
            # Sentence lookup is skipped for missed words; write empty fields
            # rather than carrying over values from the previous loop.
            # good_sen = "...... ".join(db_white.get(word, []))
            # bad_sen = "...... ".join(db_dark.get(word, []))
            good_sen = ""
            bad_sen = ""
            good_interpretation = [x[0] for x in good_interpretation]
            bad_interpretation = [x[0] for x in bad_interpretation]
            normal_interpretation = [x[0] for x in normal_interpretation]
            if word in annotations:
                labeled = True
                label = label2str(annotations[word])
            else:
                labeled = False
                label = ""
            spamwriter.writerow([
                word, sim, gcn, bcn, good_interpretation, bad_interpretation,
                bad_sen, normal_interpretation, good_sen, labeled, label
            ])
    csvfd.close()
    logger.info("data saved at '{}' and '{}'".format(out_file, out_file2))