def scrape_date(soup):
    """Scrapes the publication date of a website from the <time> HTML tag.

    This is a fallback for the case that Wayback has no record.
    """
    log("Scraping publication date.")
    pub_date = soup.find('time')
    if pub_date is None:
        return None
    pub_date = pub_date.get('datetime')
    if pub_date is None:
        # A <time> tag without a datetime attribute carries no usable date.
        return None
    pub_date = iso_to_date(pub_date)
    # Strip the timezone info so dates are comparable as naive datetimes.
    return pub_date.replace(tzinfo=None)

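# A hedged usage sketch for scrape_date. Assumptions: BeautifulSoup 4 is
# installed, iso_to_date parses ISO-8601 strings (e.g. like
# datetime.fromisoformat), and the HTML below is made up for illustration;
# _demo_scrape_date is not part of the original code.
from bs4 import BeautifulSoup

def _demo_scrape_date():
    html = '<article><time datetime="2020-05-17T09:30:00+02:00">May 17</time></article>'
    soup = BeautifulSoup(html, 'html.parser')
    # Expected under these assumptions: the naive datetime 2020-05-17 09:30:00.
    return scrape_date(soup)
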
def __init__(self, develop_mode: bool):
    """
    :param develop_mode: flag that forces debug mode
    :type develop_mode: bool
    """
    self.message_forward_data = dict()
    self.__develop_mode = develop_mode
    self.logger = log('forward', 'forward.log', 'INFO')

def validate(xgcn, dataloader, device):
    """Determines the performance of the XGCN model on the data provided by dataloader."""
    log('Validating...')
    xgcn.eval()
    xgcn.to(device)
    outputs = None
    targets = None
    # No gradients are needed for validation.
    with torch.no_grad():
        for embeddings, adjacencies, labels in tqdm(dataloader):
            embeddings = embeddings.to(device)
            adjacencies = adjacencies.to(device)
            labels = labels.to(device)
            targets = labels if targets is None else torch.cat((targets, labels))
            output = xgcn(embeddings, adjacencies)
            output = torch.argmax(output, dim=1)
            outputs = output if outputs is None else torch.cat((outputs, output))
    outputs = outputs.tolist()
    targets = targets.tolist()
    f_score_micro = f1_score(y_pred=outputs, y_true=targets, average='micro')
    f_score_macro = f1_score(y_pred=outputs, y_true=targets, average='macro')
    f_score_weighted = f1_score(y_pred=outputs, y_true=targets, average='weighted')
    log('...done validating.')
    return {
        'micro': f_score_micro,
        'macro': f_score_macro,
        'weighted': f_score_weighted
    }

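# A hedged invocation sketch for validate. The dataset is an assumption: any
# torch Dataset yielding (embedding, adjacency, label) triples works;
# run_validation itself is illustrative, not part of the original code.
from torch.utils.data import DataLoader

def run_validation(xgcn, dataset_dev, batch_size=32):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    loader_dev = DataLoader(dataset_dev, batch_size=batch_size, shuffle=False)
    # Returns a dict with 'micro', 'macro' and 'weighted' F1 scores.
    return validate(xgcn=xgcn, dataloader=loader_dev, device=device)
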
def __init__(self, is_get_data: bool = True):
    self.admins_list: list = []
    self.users_ban_list: list = []
    self.users_count: int = 0
    self.isClear: bool = True
    self.isUpdate: bool = False
    self.data: dict = {}
    self.logger = log('Cache', 'cache.log', 'INFO')
    if is_get_data:
        self.get_data()

def send_birthday_emails(request):
    tz = pytz.timezone(TIME_ZONE)
    now = datetime.datetime.now(tz)
    todays_birthday_persons = Person.objects \
        .filter(birth_date__month=now.month, birth_date__day=now.day) \
        .all()
    if todays_birthday_persons.count() == 0:
        return HttpResponse('No birthdays today.')
    admin = EmailMaster.objects.first()
    msg = ''
    for person in todays_birthday_persons:
        if person.last_birthday_email_sent_on_year < now.year:
            data = get_template_params(person)
            template = render_to_string('polls/happy_birthday_email.html', data)
            send_email(admin.email, person.email, 'Happy birthday!', template)
            # Notify the admin that a birthday email went out.
            send_email(
                admin.email, admin.email,
                'BIRTHDAY: {} on {}'.format(person, person.birth_date), '')
            person.last_birthday_email_sent_on_year = now.year
            person.save()
            msg += 'Happy Birthday, {}! '.format(person)
            log('Birthday email for {} was sent at {}. '.format(person, now))
        else:
            msg += 'Birthday email for {} was already sent. <br />'.format(person)
    return HttpResponse(msg)

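# The view above is presumably triggered by a scheduler hitting its URL once a
# day. A minimal, hypothetical urls.py wiring; the route path and name are
# assumptions, not taken from the original project:
from django.urls import path

urlpatterns = [
    path('send-birthday-emails/', send_birthday_emails,
         name='send_birthday_emails'),
]
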
def to_latex(path_in, path_out, weight, base, max_seq_len=-1, crop=-1):
    all_explanations = []
    with open(path_in, 'r') as fin:
        for jsonl in tqdm(fin):
            # Early stopping (avoids TeX out-of-resources errors).
            if 0 < crop < len(all_explanations):
                log('Reached max number of explanations.')
                break
            jsonl = ujson.loads(jsonl)
            nodes = jsonl['graph']['nodes']
            if max_seq_len > 0 and len(nodes) > max_seq_len:
                log('Skipping line because max sequence length is exceeded.')
                continue
            edges = jsonl['graph']['edges']
            label_true = jsonl['label']
            label_pred = jsonl['prediction']['label']
            graph = XGraph()
            for node in nodes:
                graph.add_node(XNode(id=node['id'], label=node['label'], type='TOKEN'))
            for edge in edges:
                graph.add_edge(graph.get_node(edge['source']),
                               graph.get_node(edge['target']),
                               t=edge['type'])
            # Collect the relevance flow through the layers.
            relevance_flow = jsonl['relevance_flow']
            _, explanations = normalize_explanations(graph=graph,
                                                     relevance_flow=relevance_flow,
                                                     true_label=label_true,
                                                     predicted_label=label_pred)
            all_explanations = all_explanations + explanations
    latex = explanations_to_latex(explanations=all_explanations,
                                  weight=weight,
                                  base=base)
    with open(path_out, 'w') as fout:
        fout.write(latex)

def cb(sender, instance, *args, **kwargs):
    log('Saving {}'.format(str(instance)))
    if instance.hero is not None and instance.hero.gender != instance.gender:
        log('Hero is of the wrong gender. Removing hero.')
        instance.hero = None
    if instance.hero is None:
        log('Assigning a random hero.')
        instance.hero = get_random_superhero(instance.gender)

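# cb has the shape of a Django pre_save receiver. A hedged wiring sketch; the
# concrete sender model is an assumption (whatever model owns the `hero` and
# `gender` fields), and connect_hero_signal is not part of the original code.
from django.db.models.signals import pre_save

def connect_hero_signal(model_cls):
    pre_save.connect(cb, sender=model_cls)
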
def preprocess_pubmed(path, to_lower, language_model):
    """Preprocesses a PubMed file into a list of XGraph objects."""
    pattern = re.compile("###[0-9]+$")
    # Derive the output path from the input path so there is no manual
    # mix-up of file names.
    path_out = path.replace('.txt', '.p')
    with open(path, 'r') as f_in:
        lines = f_in.readlines()
    graphs = []
    nlp = spacy.load(language_model,
                     disable=[
                         'tagger', 'ner', 'textcat', 'entity_ruler',
                         'sentencizer', 'merge_noun_chunks', 'merge_entities',
                         'merge_subtokens'
                     ])
    written = 0
    discarded = 0
    for line in lines:
        line = line.strip()
        if len(line) == 0 or pattern.match(line):
            discarded = discarded + 1
            continue
        label, graph = line_to_graph(line, nlp, to_lower=to_lower)
        graphs.append((label, graph))
        written = written + 1
        if written % 1000 == 999:
            log('Processed {} lines'.format(written + 1))
    log("Wrote {} graphs from {} to {}, discarded {} lines.".format(
        written, path, path_out, discarded))
    log("Pickling to {}...".format(path_out))
    with open(path_out, 'wb') as f_out:
        pickle.dump(graphs, f_out)
    log("...done pickling.")
    return path_out

def train(loader_train, loader_dev, path_model, epochs, batch_size, pad,
          nfeat, nhid, patience, metric, random_seed):
    """Trains a GCN."""
    nclasses = 5
    assert metric in ["weighted", "macro", "micro"]
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # Seed the CPU generator in all cases, and the CUDA generators when available.
    torch.manual_seed(random_seed)
    log("Set manual seed to {}".format(random_seed))
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(random_seed)
        log("Set random cuda seed to {}".format(random_seed))
    log("Training on device {}".format(device))
    xgcn = XGCN(nfeat=nfeat, nhid=nhid, nclass=nclasses, pad=pad, bias=None)
    xgcn.to(device)
    print(xgcn)
    optimizer = Adam(params=xgcn.parameters())  # todo pass as argument
    scores = validate(xgcn=xgcn, dataloader=loader_dev, device=device)
    report(epoch=0, split="Dev", scores=scores)
    torch.save(xgcn.state_dict(), path_model)
    log("Saved initial model to {}.".format(path_model))
    wait = 0
    score_last = float('-inf')
    running_loss = 0.0
    for epoch in range(epochs):
        xgcn.train()
        for batch_idx, (embeddings, adjacencies, labels) in enumerate(loader_train):
            embeddings = embeddings.to(device)
            adjacencies = adjacencies.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            preds = xgcn(embeddings, adjacencies)
            loss = F.nll_loss(preds, labels)
            loss.backward()
            optimizer.step()
            # Clamp the xfc weights to be non-negative.
            xgcn.xfc.weight.data.clamp_(0)
            # Print statistics.
            running_loss += loss.item()
            if batch_idx % 10 == 9:
                log('[%d, %5d, %5d] loss: %.3f' %
                    (epoch + 1, batch_idx + 1,
                     (batch_idx + 1) * batch_size, running_loss / 10))
                running_loss = 0.0
        scores = validate(xgcn=xgcn, dataloader=loader_dev, device=device)
        report(epoch=epoch + 1, split="Dev", scores=scores)
        score_current = scores[metric]
        if score_current > score_last:
            torch.save(xgcn.state_dict(), path_model)
            log("{} score improved from {:.3f} to {:.3f}. Saved model to {}.".format(
                metric, score_last, score_current, path_model))
            score_last = score_current
            wait = 0
        else:
            wait = wait + 1
            if wait >= patience:
                log("Terminating training after {} epochs w/o improvement.".format(wait))
                return xgcn
    # Also return the model when all epochs complete without early stopping.
    return xgcn

def report(epoch, split, scores):
    log("Epoch: {} Split: {} F-micro: {:.3f} F-macro: {:.3f} F-weighted: {:.3f}".format(
        epoch, split, scores['micro'], scores['macro'], scores['weighted']))

if __name__ == "__main__":
    log('Training...')
    cfg = config('./config.json')
    print(json.dumps(cfg, indent=2))
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--file_train_pickle',
        type=str,
        default=cfg['preprocessing']['pubmed']['file_train_pickle'])
    parser.add_argument(
        '--file_dev_pickle',
        type=str,
        default=cfg['preprocessing']['pubmed']['file_dev_pickle'])
    parser.add_argument(
        '--file_test_pickle',
def explain(nfeat, nhid, padding, path_model, path_text, path_out,
            path_label2vec, lower_bound, upper_bound, language_model,
            to_lower, crop, do_occlude, drop=None, step=None, verbose=True):
    if do_occlude:
        assert drop is not None, 'Define drop range.'
        assert step is not None, 'Define step size.'
        # assert 0 < drop <= 1 - step, 'Drop range or step size outside of valid scope.'
    if crop > 0:
        warnings.warn("Cropping dataset.")
    assert lower_bound <= upper_bound, 'Lower bound greater than upper bound.'
    CLASSES = PubMedDataset.classes()
    nclasses = len(CLASSES)
    # Declare the model.
    xgcn = XGCN(nfeat, nhid, nclasses, padding, None)
    # Load the weights; the model may have been trained on a GPU, so map to
    # the current device.
    device = 'cpu' if not torch.cuda.is_available() else 'cuda:0'
    xgcn.load_state_dict(torch.load(path_model, map_location=device))
    # PubMed-specific pattern that identifies lines containing no data point.
    pattern = re.compile("###[0-9]+$")
    # spaCy creates the dependency tree.
    nlp = spacy.load(language_model)
    # label2vec dictionary, used to map node labels (i.e. tokens) onto embeddings.
    with open(path_label2vec, 'rb') as f:
        label2vec = pickle.load(f)
    # All graph embeddings need to be of the same size.
    pad = Pad(padding=padding)
    with open(path_text, 'r') as fin:
        # Number of lines processed so far.
        line_counter = 0
        # Each JSON line is self-contained, at the price of redundancy.
        with open(path_out, 'w+') as fout:
            for line in tqdm(fin.readlines()):
                # Skip lines that contain no data point.
                if pattern.match(line) or len(line.strip()) == 0:
                    continue
                line_counter = line_counter + 1
                # Only crop when a positive crop value was given
                # (crop <= 0 disables cropping).
                if 0 < crop <= line_counter:
                    log(f"Terminating after {line_counter} lines, due to crop of {crop}.")
                    break
                # Disable dropout.
                xgcn.eval()
                # Cache inputs during the forward pass.
                xgcn.set_explainable(True)
                # Declare the JSON line.
                jsonl = dict()
                jsonl['line'] = line_counter
                # Save the configuration.
                jsonl['padding'] = padding
                # Save the vocabulary path.
                jsonl['label2vec'] = path_label2vec
                # Save the model state.
                jsonl['model'] = {}
                jsonl['model']['path'] = path_model
                jsonl['model']['device'] = device
                jsonl['model']['architecture'] = xgcn.__repr__().strip().replace(os.linesep, '')
                # Save the raw text.
                jsonl['text'] = line.strip()
                # Retrieve label and graph, then save them.
                label, graph = line_to_graph(line=line, nlp=nlp, to_lower=to_lower)
                jsonl['graph'] = graph.to_json()
                jsonl['label'] = label
                # Perform the forward pass and save the resulting tensor.
                jsonl['prediction'] = dict()
                e, a = graph.E(label2vec=label2vec), graph.A_tilde()
                x_sample = XSample(embedding=e, adjacency=a)
                x_sample = pad(x_sample)
                x_sample.to_tensor()
                e, a = x_sample.EMBEDDING, x_sample.ADJACENCY
                # Since the XGCN is in explainable mode, the softmax layer is deactivated.
                pred_tensor = xgcn(embedding=e, adjacency=a)
                jsonl['prediction']['tensor'] = tensor_to_list(pred_tensor)
                # Save the predicted label.
                max_idx = pred_tensor.argmax()
                if pred_tensor[0][max_idx] <= 0:
                    warnings.warn(
                        f'Maximum output of GCN is <= 0 (line {line_counter}), will ignore this data point.')
                    continue
                pred_label = CLASSES[max_idx.item()]
                jsonl['prediction']['label'] = pred_label
                # Perform layer-wise relevance propagation and save the
                # layer-wise relevance.
                R = torch.zeros_like(pred_tensor)
                R[0][max_idx] = pred_tensor[0][max_idx]
                _, relevance_flow = xgcn.relprop(R,
                                                 lower_bound=lower_bound,
                                                 higher_bound=upper_bound)
                jsonl['relevance_flow'] = relevance_flow
                if do_occlude:
                    jsonl = occlude(graph=graph,
                                    jsonl=jsonl,
                                    xgcn=xgcn,
                                    adjacency=a,
                                    embedding=e,
                                    drop=drop,
                                    step=step,
                                    padding=padding,
                                    verbose=verbose,
                                    line_counter=line_counter)
                fout.write(ujson.dumps(jsonl) + os.linesep)
                fout.flush()
    return True

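# Hedged consumer sketch: explain() writes one self-contained JSON object per
# line, so downstream steps (to_latex, the occlusion summary) can stream the
# file. iter_explanations is illustrative, not part of the original code.
import ujson

def iter_explanations(path_out):
    with open(path_out, 'r') as fin:
        for line in fin:
            line = line.strip()
            if line:
                yield ujson.loads(line)
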
if __name__ == '__main__':
    log('Preprocessing...')
    cfg = config('./config.json')
    print(json.dumps(cfg, indent=2))
    parser = argparse.ArgumentParser()
    parser.add_argument('--preprocess_wordvectors',
                        type=bool,
                        default=cfg['preprocessing']['word_vectors']['doit'])
    parser.add_argument(
        '--vocab_size',
        type=int,
        default=cfg['preprocessing']['word_vectors']['vocab_size'])
    parser.add_argument(
        '--file_word2vec',
        type=str,
from bot import create_bot_instance, hidden_forward, bot_cache
from helpers.log import log
import threading
import telebot
import flask
from time import sleep
from config import *

logger_main = log('main', 'main.log', 'WARNING')


def update_cache(timeout: int):
    try:
        while True:
            sleep(timeout)
            bot_cache.update_cache()
            hidden_forward.clear_data()
    except Exception as err:
        logger_main.warning(err.with_traceback(None))


def flask_init(bot_object):
    web_hook_app = flask.Flask(__name__)
    url_path = f"/{TOKEN}/"

    @web_hook_app.route('/', methods=['GET', 'HEAD'])
    def index():
        return ''

    @web_hook_app.route(url_path, methods=['POST'])
if __name__ == '__main__':
    log('Explaining...')
    cfg = config('./config.json')
    print(ujson.dumps(cfg, indent=2))
    parser = argparse.ArgumentParser()
    parser.add_argument('--nfeat', type=int, default=cfg['training']['nfeat'])
    parser.add_argument('--nhid', type=int, default=cfg['training']['nhid'])
    parser.add_argument('--path_model',
                        type=str,
                        default=cfg['training']['path_model'])
    parser.add_argument('--pad', type=int, default=cfg['training']['pad'])
    parser.add_argument(
        '--file_test_text',
        type=str,
def occlude(graph, jsonl, xgcn, adjacency, embedding, drop, step, padding,
            verbose, line_counter):
    """Masks the most and least relevant edges and tests model performance
    with the masked adjacency matrix."""
    xgcn.eval()
    xgcn.set_explainable(True)
    CLASSES = PubMedDataset.classes()
    relevance_flow = jsonl['relevance_flow']
    # Normalize relevances layer-wise and save them in Explanation objects.
    graph, explanations = normalize_explanations(graph, relevance_flow)

    # For the first and second layer, determine how much relevance mass was
    # carried by each edge during LRP.
    def explanations_to_relevance_matrices(explanations):
        for explanation in explanations:
            if explanation.relevances_prior is not None:
                graph = copy.deepcopy(explanation.graph)
                rel_matrix = relevance_matrix(
                    graph=graph,
                    relevances_prior=explanation.relevances_prior,
                    relevances_now=explanation.relevances)
                yield rel_matrix

    # Note: relevance matrices are in row->col edge direction.
    relevance_matrices = list(explanations_to_relevance_matrices(explanations))
    assert len(relevance_matrices) == 2, 'Sanity check failed.'
    # Normalize the relevance matrices along the layer dimension;
    # note: row->col direction.
    global_normalized_relevance_matrix = (
        relevance_matrices[0] + relevance_matrices[1]) / np.sum(
            np.sum(relevance_matrices[0] + relevance_matrices[1]))
    if not np.isclose(np.sum(np.sum(global_normalized_relevance_matrix)), 1.0):
        warnings.warn(
            f"After normalization, sum of weights not close to 1 in line {line_counter}.")
    # Retrieve the edge relevances and their positions in the adjacency
    # matrix; note: row->col edge direction.
    edge_relevances_and_positions = []
    for edge in graph.edges:
        position = (edge[0].id - 1, edge[1].id - 1)
        weight = global_normalized_relevance_matrix[position[0]][position[1]]
        edge_relevances_and_positions.append((weight, position))
    # The edges and their positions, ordered by the relevance they carried;
    # note: row->col direction.
    edge_weights_and_positions = sorted(edge_relevances_and_positions,
                                        key=lambda tup: tup[0],
                                        reverse=True)
    # Occlude and record the performance.
    last_drop = None
    for ratio in np.arange(0., (drop + step), step=step):
        assert 0 <= ratio <= 1.0, 'Ratio out of range.'
        # Mask for the top-k edges; note: row->col direction.
        mask_top, dropped_edges = get_mask(
            matrix=global_normalized_relevance_matrix,
            relevances_and_positions=edge_weights_and_positions,
            percentage=ratio)
        # Do not repeat the experiment with the same number of edges dropped.
        if last_drop is not None and last_drop == dropped_edges:
            continue
        last_drop = dropped_edges
        if verbose:
            log(f"Dropped {dropped_edges} weights of {len(edge_weights_and_positions)} at ratio {ratio}")
        mask_top = padmat(mask_top, padding, zeros=False)
        # The adjacency matrix is in col->row direction, so transpose the
        # mask, which is currently in row->col direction.
        mask_top = np.transpose(mask_top)
        mask_top = torch.from_numpy(mask_top)
        # Mask for the bottom-k edges; note: row->col direction.
        mask_bottom, dropped_edges_bottom = get_mask(
            matrix=global_normalized_relevance_matrix,
            relevances_and_positions=edge_weights_and_positions,
            percentage=ratio,
            top=False)
        mask_bottom = padmat(mask_bottom, padding, zeros=False)
        mask_bottom = np.transpose(mask_bottom)
        mask_bottom = torch.from_numpy(mask_bottom)
        assert adjacency.size() == mask_top.size()
        assert adjacency.size() == mask_bottom.size()
        # Get the new (masked) adjacency matrices.
        a_masked_top = torch.mul(adjacency.double(), mask_top)
        a_masked_bottom = torch.mul(adjacency.double(), mask_bottom)
        # Perform a forward pass with the new masked adjacency matrices.
        pred_tensor_top = xgcn(embedding=embedding, adjacency=a_masked_top.float())
        pred_tensor_bottom = xgcn(embedding=embedding, adjacency=a_masked_bottom.float())
        # Determine the predicted labels.
        max_idx_top = pred_tensor_top.argmax()
        max_idx_bottom = pred_tensor_bottom.argmax()
        pred_label_top = CLASSES[max_idx_top.item()]
        pred_label_bottom = CLASSES[max_idx_bottom.item()]
        # Sanity check: if nothing was occluded, this should be the same
        # prediction as in the original forward pass.
        if ratio == 0:
            assert pred_label_top == jsonl['prediction']['label'], \
                "Different labels but drop ratio 0.0."
            assert pred_label_bottom == jsonl['prediction']['label'], \
                "Different labels but drop ratio 0.0."
        # Save everything in the JSON line.
        if 'occlusion' not in jsonl:
            jsonl['occlusion'] = {}
        jsonl['occlusion'][str(ratio)] = {}
        jsonl['occlusion'][str(ratio)]['dropped_edges'] = dropped_edges
        jsonl['occlusion'][str(ratio)]['top'] = {
            'label': pred_label_top,
            'tensor': tensor_to_list(pred_tensor_top)
        }
        jsonl['occlusion'][str(ratio)]['bottom'] = {
            'label': pred_label_bottom,
            'tensor': tensor_to_list(pred_tensor_bottom)
        }
    return jsonl

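# Illustrative sketch (made-up values) of the direction convention used in
# occlude: masks are built in row->col edge direction while the adjacency
# matrix is stored col->row, hence the transpose before the element-wise product.
import numpy as np

mask_row_col = np.ones((2, 2))
mask_row_col[0][1] = 0.  # drop the edge from node 0 to node 1
adjacency_col_row = np.array([[1., 0.],
                              [1., 1.]])  # entry [i][j] is the edge j -> i
masked = adjacency_col_row * np.transpose(mask_row_col)
# masked[1][0] == 0: the 0 -> 1 edge has been occluded.
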
import pymysql
from helpers.log import log
from helpers.utils import remove_emoji
from config import HOSTD, USER, PASS, DB

sql_log = log('sql', 'sql.log', 'ERROR')


def get_connection() -> pymysql.connections.Connection:
    """
    Returns a new database connection.

    :return: <pymysql.connections.Connection>
    """
    return pymysql.connections.Connection(host=HOSTD,
                                          user=USER,
                                          password=PASS,
                                          db=DB,
                                          charset='utf8mb4')


def add_user(user_id, first_name, last_name, username):
    """
    Adds a user to the DB.

    :param user_id: <int> - the ID of the user
    :param first_name: <str> or <None> - the user's first name
    :param last_name: <str> or <None> - the user's last name
    :param username: <str> or <None> - the user's nickname
    :return: <bool>
    """
    connection = get_connection()
    first_name = remove_emoji(first_name)
import argparse
import json
import subprocess

from helpers.config import config
from helpers.log import log

if __name__ == '__main__':
    log('Pipeline invoked...')
    cfg = config('./config.json')
    parser = argparse.ArgumentParser()
    # Note: argparse's type=bool treats any non-empty string (including
    # "False") as True, so these flags are effectively driven by config.json.
    parser.add_argument('--preprocess',
                        type=bool,
                        default=cfg['pipeline']['preprocess'])
    parser.add_argument('--train', type=bool, default=cfg['pipeline']['train'])
    parser.add_argument('--explain',
                        type=bool,
                        default=cfg['pipeline']['explain'])
    parser.add_argument('--postprocess',
                        type=bool,
                        default=cfg['pipeline']['postprocess'])
    args = parser.parse_args()
    print(json.dumps(cfg, indent=2))
    if args.preprocess:
        subprocess.call(['python', 'preprocess.py'])
    if args.train:
        subprocess.call(['python', 'train.py'])
    if args.explain:
        subprocess.call(['python', 'explain.py'])
parser.add_argument('--path_in_explanations_jsonl',
                    type=str,
                    default=cfg['explain']['file_explanations_jsonl'])
parser.add_argument(
    '--path_out_top_masked_predictions',
    type=str,
    default=cfg['postprocess']['occlusion_experiment']['path_out_top_masked_predictions'])
parser.add_argument(
    '--path_out_bottom_masked_predictions',
    type=str,
    default=cfg['postprocess']['occlusion_experiment']['path_out_bottom_masked_predictions'])
parser.add_argument(
    '--draw_plot',
    type=bool,
    default=cfg['postprocess']['occlusion_experiment']['draw_plot'])
parser.add_argument('--do_convert_to_latex',
                    type=bool,
                    default=cfg['postprocess']['latex']['doit'])
parser.add_argument('--path_out_latex',
                    type=str,
                    default=cfg['postprocess']['latex']['path_out_latex'])
parser.add_argument('--max_seq_len',
                    type=int,
                    default=cfg['postprocess']['latex']['max_seq_len'])
parser.add_argument('--weight',
                    type=float,
                    default=cfg['postprocess']['latex']['weight'])
parser.add_argument('--base',
                    type=float,
                    default=cfg['postprocess']['latex']['base'])
parser.add_argument('--crop',
                    type=int,
                    default=cfg['postprocess']['latex']['crop'])
args = parser.parse_args()
if args.do_plot_occlusion_experiment:
    log('Summarizing occlusion experiments...')
    top, bottom = read_explanations(args.path_in_explanations_jsonl)
    res_top, percentages = occlusion_predictions(top)
    res_bottom, percentages = occlusion_predictions(bottom)
    # Weighted F1 per occlusion ratio, converted to CSV rows (percentage,f1).
    f1_top = [f1_score(t[0], t[1], average='weighted') for t in res_top]
    f1_top = list(zip(percentages, f1_top))
    f1_top = [f'{tup[0]},{tup[1]}' for tup in f1_top]
    f1_top = '\n'.join(f1_top)
    f1_bottom = [f1_score(b[0], b[1], average='weighted') for b in res_bottom]
    f1_bottom = list(zip(percentages, f1_bottom))
    f1_bottom = [f'{tup[0]},{tup[1]}' for tup in f1_bottom]
    f1_bottom = '\n'.join(f1_bottom)
    with open(args.path_out_top_masked_predictions, 'w+') as fout:
        fout.write(f1_top)