def __init__(self, root):
    import config

    # NOTE: the root argument is immediately overwritten by the configured
    # paths below, so the caller's value is effectively ignored.
    root, data_path, model_path, vector_path = config.get_paths()
    self.root = root
    self.data_path = data_path
    self.model_path = model_path
    self.vector_path = vector_path
    # Seed hashtags used to build the subject datasets.
    self.all_hashtags = [
        "voetbal", "moslim", "werk", "economie", "jihad", "seks", "politiek"
    ]
    # self.all_vectors_store = pd.HDFStore(self.root + "w2v_vector.h5")
    self.balanced_store = pd.HDFStore(self.root + "datasets/seeds/balanced.h5")
    self.tweets = pd.read_csv(self.root + "datasets/data_sample.csv")
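# Hedged usage sketch (not in the original): how the class above is
# constructed elsewhere in this repo (see the token-count experiment below).
# The print lines are illustrative; note that __init__ re-derives root from
# config, so the argument passed in is effectively ignored.
import config
import dataset

root, _, _, _ = config.get_paths()
dset = dataset.Dataset(root)
print(dset.tweets.head())    # sample tweets loaded in __init__
print(dset.all_hashtags)     # seed hashtags for the subject sets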
def partition_and_generate_distributions(index_name: str):
    configuration = config.get_paths()
    ix = index.open_dir(configuration[index_name], readonly=True)
    LOGGER.info('Index path: ' + configuration[index_name])
    with ix.reader() as ix_reader:
        pa = pt.Partitioner(ix, ix_reader)
        print('Partitioner initiated!')
        # Split the index into a small cache part and a larger disk part.
        parts = list(pa.generate([0.98, 0.1]))
        print('Parts created!')
        print('naive1 ({}, {})'.format(parts[0].name, parts[1].name))
        sol.generate_distance_distributions(
            cache=parts[0],
            disk=parts[1],
            save_path='/data/khodadaa/index/data',
            distance_type=['kld', 'avg-kld'])
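# Hedged usage sketch (not in the original): a CLI entry point for the
# partitioning function above, mirroring the __main__/sys.argv pattern used by
# the other scripts here; the default index name is borrowed from the search
# script below.
if __name__ == '__main__':
    import sys
    index_name = sys.argv[1] if len(sys.argv) > 1 else 'wiki13_index'
    partition_and_generate_distributions(index_name)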
# Test whether the number of tokens influences the NN's performance.
# We saw that it does not work well when tweets with varying numbers of
# tokens are all used together.
import pandas as pd
import numpy as np
import os, sys
import features
import config

root, data_path, model_path, vector_path = config.get_paths()
print(root)

import dataset

dset = dataset.Dataset(root)
# dset.create_subject_sets()

## Train a neural network per token count individually
vectors = dset.all_vectors_store["data"]
voetbal = pd.read_csv(root + "datasets/seeds/voetbal.csv")
voetbal = pd.merge(voetbal, vectors, on="id")
voetbal["labels"] = 0
jihad = pd.read_csv(root + "datasets/seeds/jihad.csv")
jihad = pd.merge(jihad, vectors, on="id")
jihad["labels"] = 1

# Determine the token count per tweet, ignoring placeholder tokens.
rm_list = ["<stopword>", "<mention>", "<url>", "rt"]
voetbal["ntokens"] = voetbal.filtered_text.apply(
    lambda x: len([a for a in x.split() if a not in rm_list]))
jihad["ntokens"] = jihad.filtered_text.apply(
    lambda x: len([a for a in x.split() if a not in rm_list]))
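# Hedged sketch (not in the original): one way to act on the comment above and
# split the data per token count before training. The ntokens column is the
# one computed in the script; the per-bucket training loop is illustrative.
data = pd.concat([voetbal, jihad], ignore_index=True)
print(data.groupby("ntokens").size())  # how many tweets exist per token count
for n, group in data.groupby("ntokens"):
    # ... train and evaluate one network per token-count bucket here ...
    pass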
from time import sleep

from config import get_paths
from models import DDQNLearner, DDQNPlayer
from utils import make_atari, wrap_deepmind, parse_args
from utils import Logger, Plotter

args = parse_args()
# for arg in vars(args):
#     print(arg, getattr(args, arg))

ENV_NAME = args.env_name
ENV_VER = args.env_version
ENV_GYM = ENV_NAME + ENV_VER

save_dirs = get_paths(drive=args.drive_save, env_name=ENV_NAME)

PRINT_FREQ_EP = args.log_freq
SAVE_MODEL_FREQ = args.save_freq
LEARNING_START = args.learn_start

logger = Logger(save_dirs=save_dirs,
                log_types=[],
                log_freq=args.log_freq,
                mode=args.mode)
plotter = Plotter(save_dirs=save_dirs,
                  plot_types=[
                      'avg_scores_ep', 'avg_scores_ts',
                      'avg_scores_100_ep', 'avg_scores_100_ts',
                      'scores_ep', 'scores_ts',
                      'high_scores_ep', 'high_scores_ts',
                      'low_scores_ep', 'low_scores_ts',
                      'avg_loss_ep', 'avg_acc_ep',
                      # ... (remaining plot types truncated in the source)
                  ])
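# Hedged sketch (not in the original): the usual baselines-style environment
# construction with the helpers imported above; the wrapper keyword arguments
# are assumptions about this repo's wrap_deepmind.
env = make_atari(ENV_GYM)
env = wrap_deepmind(env, frame_stack=True, scale=True)
print(env.observation_space, env.action_space)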
import sys

from whoosh import index, sorting
from whoosh.qparser import QueryParser

import config


# NOTE: the original snippet starts inside this function; the header, imports,
# and index opening are reconstructed from the call in __main__ below, and the
# default field name is an assumption.
def search(user_query, limit, index_path, field_name='body'):
    ix = index.open_dir(index_path, readonly=True)
    with ix.searcher() as searcher:
        query = QueryParser(field_name, ix.schema).parse(user_query)
        # Sort hits by the stored 'count' field, highest first.
        facet = sorting.FieldFacet('count', reverse=True)
        results = searcher.search(query, sortedby=facet, limit=limit)
        print(results)
        for res in results:
            print('\n', res)
            if res.reader.has_vector(res.docnum, field_name):
                vgen = res.reader.vector_as('frequency', res.docnum, field_name)
                terms = [v for v in vgen]
                terms.sort(key=lambda tup: tup[1], reverse=True)
                print('Top terms: ', terms)
            else:
                print('0 term')


if __name__ == '__main__':
    index_name = 'wiki13_index'
    limit = None
    if len(sys.argv) > 1:
        index_name = sys.argv[1]
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])  # Whoosh expects an int (or None) limit
    configuration = config.get_paths()
    user_query = 'public policy NOT "public policy"'
    while user_query != ':q':
        search(user_query, limit, configuration[index_name])
        user_query = input('Query [:q to exit] : ')
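# Hedged, self-contained sketch (not in the original): a minimal Whoosh index
# with a term-vectored 'body' field and a sortable 'count' field, so the
# search loop above has something to run against. Directory and field names
# are assumptions matching the code above.
import os

from whoosh import index
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from whoosh.formats import Frequency

schema = Schema(id=ID(stored=True),
                body=TEXT(stored=True, vector=Frequency()),  # per-doc term frequencies
                count=NUMERIC(sortable=True))
os.makedirs('toy_index', exist_ok=True)
ix = index.create_in('toy_index', schema)
with ix.writer() as w:
    w.add_document(id='1', body='public policy and public debate', count=5)
    w.add_document(id='2', body='economic policy review', count=2)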
# NOTE: this snippet begins inside the clarity computation; the headers of the
# enclosing functions are not part of the source.
            prob_d_condit_q = prob_q_condit_d / norm
            prob += prob_t_condit_d * prob_d_condit_q
        return prob

    clt = 0.0
    for t in vocabulary:
        # Smooth zero collection counts so the log stays defined.
        if collection_tfs[t] == 0:
            collection_tfs[t] = 1
        prob_t_condit_D = collection_tfs[t] / collection_total_terms
        prob_t_condit_Dq = get_prob_t_condition_Dq(t)
        clt += prob_t_condit_Dq * log(prob_t_condit_Dq / prob_t_condit_D)
    return clt


if __name__ == '__main__':
    c = config.get_paths()
    index_path = c[sys.argv[1]]
    query_file_path = sys.argv[2]
    save_path = sys.argv[3]
    config.setup_logger('querydifficulty')
    ix = index.open_dir(index_path, readonly=True)
    LOGGER.info('Index path: ' + index_path)
    ix_reader = ix.reader()
    vocabulary = []
    db_tfs = defaultdict(int)
    db_total_terms = 0
    with open(c['db_tfs'], 'r') as fr:
        for line in fr:
            # Assumed file format (the source truncates here): one
            # "term frequency" pair per line.
            term, tf = line.split()
            db_tfs[term] = int(tf)
            db_total_terms += int(tf)
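# Hedged toy check (not in the original): the accumulation above computes
# sum over t of P(t|Dq) * log(P(t|Dq) / P(t|D)), i.e. the KL divergence
# between the query language model and the collection model, commonly used as
# a query clarity / difficulty score. A tiny sanity example with made-up
# distributions:
from math import log

p_dq = {'policy': 0.5, 'public': 0.4, 'the': 0.1}    # query model (assumed)
p_d = {'policy': 0.01, 'public': 0.02, 'the': 0.97}  # collection model (assumed)
clarity = sum(p * log(p / p_d[t]) for t, p in p_dq.items())
print(clarity)  # larger => query model diverges more from the collection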