def load_mode(args):
    core = get_plugin(args.device, args.l, args.config)
    log.info('IR for {} : {}'.format(args.device, args.model))
    log.info('Loading blob from {}'.format(args.load))
    net = get_net(model=args.model, core=core)
    net_layers, net_inputs, net_outputs = get_model_info(net)
    out_layers = get_layers_list(net_layers, net_inputs, net_outputs, args.layers)
    print_input_layers(net_inputs)
    print_output_layers(out_layers)
    layers_map = manage_user_outputs_with_mapping(mapping=args.mapping,
                                                  reference_mapping=args.reference_mapping,
                                                  user_layers=out_layers)
    inputs = input_processing(args.model, net_inputs, args.input, layers_map)
    global_accuracy = []
    loaded = load_dump(args.load)
    # Re-infer each mapped layer separately and compare it with the blob stored
    # in the loaded dump.
    for out_layer in layers_map:
        ref_out_layer = layers_map[out_layer]
        if out_layer == ref_out_layer:
            log.info('Layer {} statistics'.format(out_layer))
        else:
            log.info('Statistics \'{}\' vs \'{}\''.format(out_layer, ref_out_layer))
        net_copy = get_net_copy_with_output(model=args.model, output=out_layer, core=core)
        results = infer(net=net_copy, core=core, device=args.device, inputs=inputs, output=[out_layer])
        if out_layer not in results:
            continue
        out_blob, pc = results[out_layer]
        if ref_out_layer not in loaded:
            continue
        ref_out_blob = loaded[ref_out_layer]['blob']
        a_m = accuracy_metrics(out_blob=out_blob, ref_out_blob=ref_out_blob)
        # Performance counters are optional in the dump.
        if 'pc' in loaded[ref_out_layer]:
            ref_pc = loaded[ref_out_layer]['pc']
            performance_metrics(pc=pc, ref_pc=ref_pc)
        blob_counters(out_blob=out_blob, ref_out_blob=ref_out_blob)
        global_accuracy = update_global_accuracy_matrics(global_accuracy=global_accuracy, current_accuracy=a_m)
    print_all_over_the_net_metrics(global_accuracy=global_accuracy)
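
# Hedged sketch (assumption, not the tool's actual implementation): load_mode()
# only requires that load_dump() return a mapping of reference layer names to
# dicts with a mandatory 'blob' entry and an optional 'pc' entry. A minimal
# pickle-based reader satisfying that contract could look like this; the name
# load_dump_sketch and the pickle format are illustrative assumptions.
import pickle

def load_dump_sketch(path):
    """Return {layer_name: {'blob': ndarray, 'pc': perf_counters?}} read from a pickle file."""
    with open(path, 'rb') as f:
        dump = pickle.load(f)
    # Validate the shape of the data load_mode() relies on.
    for layer, entry in dump.items():
        if 'blob' not in entry:
            raise ValueError("dump entry for layer '{}' has no 'blob'".format(layer))
    return dump
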
def eff_dump_csv_upload(request):
    """
    Allows importing the logs contained in an Eff formatted file. The file must be
    structured like one generated by L{EffCsvWriter<sitio.eff.utils.EffCsvWriter>}.

    If any external ids for the logs in the uploaded file are not found in the
    database, a form that allows associating these external ids with existing
    user profiles is presented to the user
    (see L{eff_admin_users_association<eff_admin_users_association>}).

    @param request: the request object
    @type request: django.core.handlers.wsgi.WSGIRequest

    @return: The django.http.HttpResponse object for the view
    """
    context = {'title': 'CSV Dump Upload'}

    if request.method == 'POST':
        form = DumpUploadForm(request.POST, request.FILES)
        if form.is_valid():
            n_rows, n_users, n_projects, n_project_assocs, temp_file = load_dump(
                request.FILES['csv_file'].file)
            if temp_file:
                # Some external ids could not be resolved: stash the parsed log
                # entries and ask the admin to associate them with user profiles.
                request.session['log_entries_file'] = temp_file
                request.session['n_users'] = n_users
                return HttpResponseRedirect('/efi/administration/users_association/')
            context['notices'] = ['File Uploaded Successfully!']
        else:
            context['errors'] = ['Invalid Form']
    else:
        form = DumpUploadForm()

    context['form'] = form
    return render_to_response('admin_dump_csv_upload.html', context,
                              context_instance=RequestContext(request))
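
# Hedged sketch (assumption): the view above only requires that DumpUploadForm
# expose a 'csv_file' file field and the standard is_valid() check. A minimal
# Django form matching that contract could look like the stand-in below; the
# real DumpUploadForm may add extra fields and validation.
from django import forms

class DumpUploadFormSketch(forms.Form):
    """Minimal stand-in for DumpUploadForm; illustrative only."""
    csv_file = forms.FileField(label='Eff CSV dump')
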
def main():
    parser = ArgumentParser()
    parser.add_argument('--config', default='./configs/lgb_template.yaml')
    parser.add_argument('--create-features', action='store_true')
    options = parser.parse_args()

    config = yaml.safe_load(open(options.config))
    kfold = get_kfold(config)

    if options.create_features:
        train_path = get_dataset_filename(config, 'train')
        test_path = get_dataset_filename(config, 'test')

        with timer('Load train/test dump files'):
            train_df = load_dump(train_path)
            test_df = load_dump(test_path)

        create_features(config, train_df, test_df, kfold)
        del train_df, test_df
        gc.collect()

    target_col = config['target']
    target_path = get_dataset_filename(config, 'target')
    use_features = extract_use_features(config)
    x_train = load_train_dataset(use_features)
    y_train = load_dump(target_path)[target_col]

    output_dir = config['dataset']['output_directory']
    basename = Path(options.config).stem
    logger = setup_logger(output_dir, basename)

    clfs, importances = train_model(x_train, y_train, kfold, config, logger)
    save_feature_importances(importances, basename, output_dir)

    del x_train, y_train, importances
    gc.collect()

    pred = predict_model(clfs, use_features, config)

    print('Creating a submission csv file...')
    submission_path = get_dataset_filename(config, 'sample_submit')
    submission = pd.read_csv(submission_path)
    submission[target_col] = pred
    submission.to_csv(f'{output_dir}/submit_{basename}.csv.gz', index=False)
    print('Done.')
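
# Hedged sketch (assumption): the pipeline above treats load_dump() as a reader
# for pre-serialized DataFrames and timer() as a logging context manager. One
# plausible pair of helpers, here based on joblib and time.time(); the names
# load_dump_sketch/timer_sketch and the joblib format are assumptions, not the
# project's actual helpers.
import time
from contextlib import contextmanager

import joblib

def load_dump_sketch(path):
    """Load a previously dumped object (e.g. a pandas DataFrame)."""
    return joblib.load(path)

@contextmanager
def timer_sketch(name):
    """Print how long the wrapped block took."""
    start = time.time()
    yield
    print('[{}] done in {:.1f} s'.format(name, time.time() - start))
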
def randomsample(args):
    """
    Produce a dataset by sampling.
    """
    if args.language:
        langs = args.language
    elif args.langs:
        langs = map(str.strip, open(args.langs))

    tempfile.tempdir = args.temp
    now = time.time()

    handle, path = tempfile.mkstemp(suffix='.tar', prefix='sample-{0}-'.format(args.number))
    os.close(handle)
    #path = os.path.join(args.temp, 'sample-{0}.tar'.format(args.number))

    build_index = not args.skip_index

    with tarfile.open(path, 'w') as tar:
        for lang in langs:
            try:
                dump = load_dump(lang, build_index=build_index, unpack=True)
            except KeyError:
                logger.error("do not have a dump for %s, skipping", lang)
                continue

            chosen = set()  # keeps track of ids that have been chosen
            used = set()    # keeps track of ids that have been examined

            # Adapted from wikicontent
            while len(chosen) < args.number:
                logger.debug("chose {0}/{1} so far".format(len(chosen), args.number))
                try:
                    id = random.choice(xrange(dump.metadata['size']))
                    if id in used:
                        if len(used) >= dump.metadata['size']:
                            # We have run out of documents to consider. Bail out.
                            logger.warning("ran out of documents for %s", lang)
                            break
                        raise ReTry("already considered {0}".format(id))

                    used.add(id)
                    logger.debug("processing {0}".format(id))
                    page = dump.get_page_by_index(id)

                    if args.clean:
                        # apply mediawiki removal
                        text = remove_mediawiki_syntax(page.text)
                        para = paragraphs(text)
                        content = []
                        for p in para:
                            p = re.sub(r'\s', ' ', p)
                            if regexps.langref.match(p):
                                continue
                            if regexps.assoc.match(p):
                                continue
                            if regexps.syntax.search(p):
                                continue
                            p = regexps.tripquote.sub(r'\g<name>', p)
                            p = regexps.doubquote.sub(r'\g<name>', p)
                            u = p.decode('utf8').strip()
                            if regexps.category_name.search(u):
                                continue
                            content.append(u)
                        if not content:
                            raise ReTry("No usable content")
                        document = '\n\n'.join(u.encode('utf8') for u in content) + '\n'
                    else:
                        # output raw mediawiki
                        document = page.text

                    if len(document) < args.minlen:
                        raise ReTry("Too short ({0}<{1})".format(len(document), args.minlen))

                    logger.debug("adding {0}".format(id))
                    info = tarfile.TarInfo('{0}/{1}'.format(lang, id))
                    info.size = len(document)
                    info.mtime = now
                    chosen.add(id)
                    tar.addfile(info, StringIO(document))
                    logger.debug("added {0}".format(id))
                except ReTry as e:
                    logger.debug("Reject: %s", e.args[0])
                    continue

            logger.info("chose %d documents for %s", len(chosen), lang)
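
# Hedged sketch (assumption): ReTry is used above as a control-flow exception
# meaning "reject this document and draw another id". It only needs to carry a
# reason string, so a bare Exception subclass is enough; the real project may
# define it elsewhere with more detail.
class ReTry(Exception):
    """Raised when a sampled page is rejected and another id should be drawn."""
    pass
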
def randomsample(args):
    """
    Produce a dataset by sampling.
    """
    if args.language:
        langs = args.language
    elif args.langs:
        langs = map(str.strip, open(args.langs))

    tempfile.tempdir = args.temp
    now = time.time()

    path = os.path.join(config.get("paths", "scratch"), "sample-{0}.tar".format(args.number))
    build_index = not args.skip_index

    with tarfile.open(path, "w") as tar:
        for lang in langs:
            try:
                dump = load_dump(lang, build_index=build_index, unpack=True)
            except KeyError:
                logger.error("do not have a dump for %s, skipping", lang)
                continue

            chosen = set()  # keeps track of ids that have been chosen
            used = set()    # keeps track of ids that have been examined

            # Adapted from wikicontent
            while len(chosen) < args.number:
                logger.debug("chose {0}/{1} so far".format(len(chosen), args.number))
                try:
                    id = random.choice(xrange(dump.metadata["size"]))
                    if id in used:
                        if len(used) >= dump.metadata["size"]:
                            # We have run out of documents to consider. Bail out.
                            logger.warning("ran out of documents for %s", lang)
                            break
                        raise ReTry("already considered {0}".format(id))

                    used.add(id)
                    logger.debug("processing {0}".format(id))
                    page = dump.get_page_by_index(id)

                    if args.clean:
                        # apply mediawiki removal
                        text = remove_mediawiki_syntax(page.text)
                        para = paragraphs(text)
                        content = []
                        for p in para:
                            p = re.sub(r"\s", " ", p)
                            if regexps.langref.match(p):
                                continue
                            if regexps.assoc.match(p):
                                continue
                            if regexps.syntax.search(p):
                                continue
                            p = regexps.tripquote.sub(r"\g<name>", p)
                            p = regexps.doubquote.sub(r"\g<name>", p)
                            u = p.decode("utf8").strip()
                            if regexps.category_name.search(u):
                                continue
                            content.append(u)
                        if not content:
                            raise ReTry("No usable content")
                        document = "\n\n".join(u.encode("utf8") for u in content) + "\n"
                    else:
                        # output raw mediawiki
                        document = page.text

                    if len(document) < args.minlen:
                        raise ReTry("Too short ({0}<{1})".format(len(document), args.minlen))

                    logger.debug("adding {0}".format(id))
                    info = tarfile.TarInfo("{0}/{1}".format(lang, id))
                    info.size = len(document)
                    info.mtime = now
                    chosen.add(id)
                    tar.addfile(info, StringIO(document))
                    logger.debug("added {0}".format(id))
                except ReTry as e:
                    logger.debug("Reject: %s", e.args[0])
                    continue

            logger.info("chose %d documents for %s", len(chosen), lang)
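
# Hedged sketch (assumption): the option names below are inferred from the
# attributes randomsample() reads (language, langs, number, minlen, clean,
# skip_index, temp); the real CLI wiring may use different flags and defaults.
from argparse import ArgumentParser

def build_parser_sketch():
    parser = ArgumentParser(description="Produce a dataset by randomly sampling dump pages")
    parser.add_argument("-l", "--language", action="append", help="language(s) to sample")
    parser.add_argument("--langs", help="file with one language code per line")
    parser.add_argument("--number", type=int, default=100, help="documents to choose per language")
    parser.add_argument("--minlen", type=int, default=0, help="minimum document length in bytes")
    parser.add_argument("--clean", action="store_true", help="strip mediawiki syntax from pages")
    parser.add_argument("--skip-index", dest="skip_index", action="store_true", help="do not build a dump index")
    parser.add_argument("--temp", help="temporary directory")
    return parser
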