def run_evaluate(self, sess, test, tags):
    """Score the model on a labelled test set.

    Args:
        sess: tensorflow session
        test: dataset that yields tuple of sentences, tags
        tags: {tag: index} dictionary

    Returns:
        (acc, f1): mean of the per-token matches and the chunk-level
        F1 score (0 when no predicted chunk matches a gold chunk)
    """
    token_matches = []
    n_hit, n_gold, n_pred = 0., 0., 0.

    for words, labels in minibatches(test, self.config.batch_size):
        labels_pred, sequence_lengths = self.predict_batch(sess, words)
        for gold_seq, pred_seq, length in zip(labels, labels_pred,
                                              sequence_lengths):
            # Only the first `length` positions of each sequence are scored.
            gold_seq = gold_seq[:length]
            pred_seq = pred_seq[:length]
            token_matches.extend(g == p for g, p in zip(gold_seq, pred_seq))

            gold_chunks = set(get_chunks(gold_seq, tags, self.config.DEFAULT))
            pred_chunks = set(get_chunks(pred_seq, tags, self.config.DEFAULT))
            n_hit += len(gold_chunks & pred_chunks)
            n_pred += len(pred_chunks)
            n_gold += len(gold_chunks)

    # A positive hit count implies both denominators are positive.
    if n_hit > 0:
        precision = n_hit / n_pred
        recall = n_hit / n_gold
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    return np.mean(token_matches), f1
def evaluate(self, test):
    """
    Compute accuracy, precision, recall, F1 and mean score on a test set.

    :param test: dataset object exposing ``size()`` and ``get_batch(n)``;
        ``get_batch`` returns (instances, sequence_lengths) where each
        instance is indexed as (words, labels)
    :return: metrics: (dict) with keys 'score', 'acc', 'f1', 'p', 'r'
    """
    token_matches = []
    n_hit, n_gold, n_pred = 0., 0., 0.

    test_data, sequence_lengths = test.get_batch(test.size())
    test_words = [item[0] for item in test_data]
    test_labels = [item[1] for item in test_data]
    pred_labels, pred_scores = self.predict(test_words, sequence_lengths)

    for gold_seq, pred_seq, length in zip(test_labels, pred_labels,
                                          sequence_lengths):
        gold_seq = gold_seq[:length]
        pred_seq = pred_seq[:length]
        token_matches.extend(g == p for g, p in zip(gold_seq, pred_seq))

        gold_chunks = set(utils.get_chunks(gold_seq, self.config.vocab_labels))
        pred_chunks = set(utils.get_chunks(pred_seq, self.config.vocab_labels))
        n_hit += len(gold_chunks & pred_chunks)
        n_pred += len(pred_chunks)
        n_gold += len(gold_chunks)

    if n_pred > 0:
        p = n_hit / n_pred
    else:
        p = 0
    if n_gold > 0:
        r = n_hit / n_gold
    else:
        r = 0
    if n_hit > 0:
        f1 = 2 * p * r / (p + r)
    else:
        f1 = 0

    acc = np.mean(token_matches)
    score = np.mean(pred_scores)
    return {'score': score, 'acc': acc, 'f1': f1, 'p': p, 'r': r}
def run_evaluate(self, sess, test, tags):
    """Evaluates performance on test set and dumps gold/predicted tags.

    For every sentence the IOB labels are predicted first, mentions are
    extracted from them, the mention types are predicted, and both are
    merged into a final label sequence that is scored against the gold
    labels. One "gold predicted" pair per token is written to the file
    named "output" in the working directory, with a blank line after each
    sentence.

    Args:
        sess: tensorflow session
        test: dataset consumed through `minibatches`
        tags: {tag: index} dictionary

    Returns:
        accuracy
        f1 score
    """
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    batch_size = self.config.batch_size

    # The context manager guarantees the dump is flushed and closed even
    # when a prediction step raises half-way through the data.
    with codecs.open("output", 'w', 'UTF-8') as output_file:
        for (words, labels, iob_gold, mention_type_gold, mentions_gold,
             word_features, char_features) in minibatches(test, batch_size):
            iob_labels_pred, sequence_lengths = self.predict_iob_batch(
                sess, words, word_features, char_features)

            # NOTE(review): this indexes the first `batch_size` rows, so it
            # presumably relies on every batch being full -- confirm against
            # `minibatches` / `predict_iob_batch`.
            mentions = []
            for i in range(batch_size):
                length = sequence_lengths[i]
                mentions.append(find_mentions(iob_labels_pred[i][:length]))
            mention_sizes = [len(mention) for mention in mentions]

            # Skip the typing model when no sentence contains a mention.
            if any(mention_sizes):
                mentions_pred, _ = self.predict_type_batch(
                    sess, words, word_features, char_features, mentions)
            else:
                mentions_pred = [[]] * batch_size

            for lab, iob_pred, length, mention, mention_pred, mention_size in zip(
                    labels, iob_labels_pred, sequence_lengths, mentions,
                    mentions_pred, mention_sizes):
                lab = lab[:length]
                iob_pred = iob_pred[:length]
                mention_pred = mention_pred[:mention_size]
                lab_pred = find_labels(iob_pred, mention_pred, tags,
                                       self.id2type)

                accs += [a == b for (a, b) in zip(lab, lab_pred)]
                lab_chunks = set(get_chunks(lab, tags))
                lab_pred_chunks = set(get_chunks(lab_pred, tags))
                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

                # One "gold pred" line per token, blank line per sentence.
                output_file.write(''.join(
                    ' '.join((idx_to_tag[b], idx_to_tag[c])) + '\n'
                    for b, c in zip(lab, lab_pred)) + '\n')

    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    acc = np.mean(accs)
    return acc, f1
def Eval(sess, tagger, data, num_steps, best_eval_metric, name):
    """Tags the `name` split with the network and logs its chunk-level F1.

    Args:
      sess: tensorflow session to use
      tagger: graph builder; its `evaluation['predictions']`,
        `evaluation['logits']` ops and `test_input` placeholder are run
      data: dataset object the sentences are read from
      num_steps: number of training steps taken (not used in this function)
      best_eval_metric: current best eval metric (not used in this function)
      name: name of the data split to evaluate

    Returns:
      wall-clock seconds spent in the tagging loop (NOT an eval metric;
      the F1 score is only written to the log)
    """
    logging.info('Evaluating training network.')
    t = time.time()
    num_epochs = None
    epochs = 0
    logging.info(data.get_sent_num(name))
    epochs, sent_batch = utils.loadBatch(FLAGS.batch_size, epochs, data, name)
    number_of_words = 0
    while True:
        sent_batch, epochs, feature_endpoints, gold_tags, words = utils.get_current_features(
            sent_batch, epochs, data, name)
        predictions, _ = sess.run(
            [tagger.evaluation['predictions'], tagger.evaluation['logits']],
            feed_dict={tagger.test_input: feature_endpoints})
        utils.set_current_tags(sent_batch, predictions)
        # Remember the epoch of the first step and stop once the first
        # sentence of the batch reports a later epoch.
        if num_epochs is None:
            num_epochs = epochs
        elif num_epochs < sent_batch[0].get_epoch():
            break
    t_end = time.time()

    data.reset_index(name)
    for sent in sent_batch:
        sent.reset_state()

    correct_preds, total_correct, total_preds = 0., 0., 0.
    while data.has_next_sent(name):
        sent = data.get_next_sent(name)
        number_of_words += len(sent.get_word_list())
        lab_chunks = set(utils.get_chunks(sent.ner_ids, data.id2tag))
        lab_pred_chunks = set(utils.get_chunks(sent.output_tags, data.id2tag))
        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)

    test_time = t_end - t
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    logging.info("f1 score:")
    logging.info(f1)
    logging.info(number_of_words)
    data.reset_index(name)
    return test_time
def run_qa():
    """Run `qa` over all Wikipedia documents in parallel and log progress.

    The `wikidata_id` of every document in the configured collection is
    loaded (sorted by `wikidata_id`), split with `get_chunks`, and each
    chunk is handed to `qa` in a pool of `config.NUM_WORKERS` processes.
    Each result is expected to be a dict with 'processed' and 'elapsed'
    keys; running totals are logged after every chunk.
    """
    client = MongoClient(config.MONGO_IP, config.MONGO_PORT)
    pool = None
    try:
        wikipedia = client[config.DB][config.WIKIPEDIA_COLLECTION]
        wikidocs = list(
            wikipedia.find({}, {
                'wikidata_id': 1,
                '_id': 0
            }).sort('wikidata_id'))
        chunks = get_chunks(wikidocs, config.CHUNK_SIZE, 'wikidata_id')
        # The full id list is no longer needed once it has been chunked.
        del wikidocs

        start_time = time.time()
        total = 0
        pool = multiprocessing.Pool(config.NUM_WORKERS)
        for res in pool.imap(qa, chunks):
            total += res['processed']
            res['total'] = total
            part = int(time.time() - start_time)
            res['elapsed'] = compress(res['elapsed'])
            res['total_elapsed'] = compress(part)
            logging.info(
                "Processed {processed} ({total} in total) documents in {elapsed} (running time {"
                "total_elapsed})".format(**res))
    finally:
        # Release the workers and the connection even when a chunk fails;
        # previously an exception left both behind.
        if pool is not None:
            pool.terminate()
        client.close()

    elapsed = int(time.time() - start_time)
    logging.info("Processed {} documents in {}".format(total,
                                                       compress(elapsed)))
async def do_detect(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
    """Run the detector over `data` chunk by chunk and merge the results.

    The chunk size and the chunk overlap are both multiples of the
    detector's window size. Overlapping chunks are used only when the
    detector reports that its detection is intersected.

    Raises:
        RuntimeError: when no chunk was produced, so nothing was detected.

    Returns the `to_json()` form of the concatenated detection results.
    """
    window_size = self._detector.get_window_size(cache)
    chunk_size = window_size * self.CHUNK_WINDOW_SIZE_FACTOR
    chunk_intersection = window_size * self.CHUNK_INTERSECTION_FACTOR

    # XXX: get_chunks(data, chunk_size) == get_intersected_chunks(data, 0, chunk_size)
    if self._detector.is_detection_intersected():
        chunks = get_intersected_chunks(data, chunk_intersection, chunk_size)
    else:
        chunks = get_chunks(data, chunk_size)

    detections: List[DetectionResult] = []
    for piece in chunks:
        # Yield to the event loop between chunks.
        await asyncio.sleep(0)
        frame = prepare_data(piece)
        result: DetectionResult = self._detector.detect(frame, cache)
        detections.append(result)

    if not detections:
        raise RuntimeError(
            f'do_detect for {self.analytic_unit_id} got empty detection results'
        )

    merged = self._detector.concat_detection_results(detections)
    return merged.to_json()
def __normalize_treelevels(self):
    """
    Normalize the treelevels so they can be used to generate the tree
    without problems.

    The normalized treelevels must fulfill the condition that at any given
    level the number of nodes of that level must be at least equal to (or
    higher than) the number of blocks of the next level. With the
    exception of the root.

    Whenever a level has fewer nodes than the next level has blocks, the
    smallest block of the next level is moved up into it. The pass is
    repeated until a full sweep makes no change. `self.treelevels` is
    modified in place.
    """
    # The root is exempt from the rule: take it out while normalizing.
    root = self.treelevels.pop(0)

    while True:
        modified = False
        for x, y in get_chunks(self.treelevels, 2, 1):
            if sum(map(len, x)) < len(y):
                modified = True
                # Find the smallest block of y and move it to the previous
                # level. (The former search compared against an `inf` that
                # was never updated, so it always moved block 0.)
                position = min(range(len(y)), key=lambda pos: len(y[pos]))
                x.append(y.pop(position))
        if not modified:
            break

    self.treelevels.insert(0, root)
def download_file(self, file_id, file_key, public=False):
    """Download a file, decrypt it with AES-CTR and verify its MAC.

    Args:
        file_id: handle sent to the API ('p' for public, 'n' otherwise)
        file_key: 8-element key; base64 text when `public` is true
        public: whether `file_key` must first be decoded with
            `base64_to_a32`

    The decrypted data is written to a file named after the 'n' attribute
    of the remote file, relative to the current working directory.

    Raises:
        ValueError: when the computed MAC differs from the one in the key.
    """
    if public:
        file_key = base64_to_a32(file_key)
        file_data = self.api_req({'a': 'g', 'g': 1, 'p': file_id})
    else:
        file_data = self.api_req({'a': 'g', 'g': 1, 'n': file_id})

    # The AES key is the XOR of the two key halves; the second half also
    # carries the CTR nonce (words 4-5) and the expected MAC (words 6-7).
    k = (file_key[0] ^ file_key[4], file_key[1] ^ file_key[5],
         file_key[2] ^ file_key[6], file_key[3] ^ file_key[7])
    iv = file_key[4:6] + (0, 0)
    meta_mac = file_key[6:8]

    file_url = file_data['g']
    file_size = file_data['s']
    attributes = base64urldecode(file_data['at'])
    attributes = dec_attr(attributes, k)
    # NOTE(review): the name comes from remote metadata and is used as a
    # path unchanged -- consider sanitising it before opening.
    file_name = attributes['n']

    infile = requests.get(file_url, stream=True).raw

    counter = Counter.new(
        128, initial_value=((iv[0] << 32) + iv[1]) << 64)
    decryptor = AES.new(a32_to_str(k), AES.MODE_CTR, counter=counter)

    file_mac = (0, 0, 0, 0)
    # `with` closes the output file even when the download or decryption
    # fails part-way.
    with open(file_name, 'wb') as outfile:
        for chunk_start, chunk_size in sorted(get_chunks(file_size).items()):
            chunk = infile.read(chunk_size)
            chunk = decryptor.decrypt(chunk)
            outfile.write(chunk)

            # CBC-MAC over the 16-byte blocks of the decrypted chunk.
            chunk_mac = [iv[0], iv[1], iv[0], iv[1]]
            for i in range(0, len(chunk), 16):
                block = chunk[i:i + 16]
                if len(block) % 16:
                    # Zero-pad the final short block; a bytes literal keeps
                    # this valid when `chunk` is a bytes object.
                    block += b'\0' * (16 - (len(block) % 16))
                block = str_to_a32(block)
                chunk_mac = [
                    chunk_mac[0] ^ block[0],
                    chunk_mac[1] ^ block[1],
                    chunk_mac[2] ^ block[2],
                    chunk_mac[3] ^ block[3]]
                chunk_mac = aes_cbc_encrypt_a32(chunk_mac, k)

            # Fold the chunk MAC into the running file MAC.
            file_mac = [
                file_mac[0] ^ chunk_mac[0],
                file_mac[1] ^ chunk_mac[1],
                file_mac[2] ^ chunk_mac[2],
                file_mac[3] ^ chunk_mac[3]]
            file_mac = aes_cbc_encrypt_a32(file_mac, k)

    # Integrity check
    if (file_mac[0] ^ file_mac[1], file_mac[2] ^ file_mac[3]) != meta_mac:
        raise ValueError('MAC mismatch')
def Evaluate(sess, model, dataset, transition_params_trained, parameters,
             epoch_num):
    """Tag every 'test' sentence one at a time and log the chunk-level F1.

    Args:
        sess: tensorflow session used to run the model
        model: object exposing the input placeholders, `unary_scores` and
            `predictions`
        dataset: sentence source; also provides `ner_map` for `get_chunks`
        transition_params_trained: transition parameters passed to the
            Viterbi decoder when `parameters['use_crf']` is truthy
        parameters: dict; only 'use_crf' is read here
        epoch_num: epoch number, used in the log message only

    Returns:
        wall-clock seconds spent evaluating (the F1 score is only logged)
    """
    start = time.time()
    correct_preds, total_correct, total_preds = 0., 0., 0.

    while dataset.has_next_sent('test'):
        sent = dataset.get_next_sent('test')
        feed_dict = {
            model.input_token_indices: sent.word_ids,
            model.input_token_character_indices: utils.pad_lists(sent.char_lists),
            model.input_token_lengths: sent.word_lengths,
            model.dropout_keep_prob: 1
        }
        unary_scores, predictions = sess.run(
            [model.unary_scores, model.predictions], feed_dict)
        if parameters['use_crf']:
            predictions, _ = tf.contrib.crf.viterbi_decode(
                unary_scores, transition_params_trained)
            # NOTE(review): presumably strips the start/end positions added
            # for the CRF -- confirm this belongs inside the CRF branch.
            predictions = predictions[1:-1]

        lab_chunks = set(utils.get_chunks(sent.ner_ids, dataset.ner_map))
        lab_pred_chunks = set(utils.get_chunks(predictions, dataset.ner_map))
        correct_preds += len(lab_chunks & lab_pred_chunks)
        total_preds += len(lab_pred_chunks)
        total_correct += len(lab_chunks)

    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    test_time = time.time() - start
    dataset.reset_index('test')
    logging.info("epoch: %d, f1 score: %.2f", epoch_num, f1 * 100.0)
    return test_time
def parse_date_string(self, time_frames):
    """Group words into date strings and return them chunked in twos.

    Words are collected (skipping any word equal to '-') until one matches
    CLOCK_PATTERN; the collected words are then joined with single spaces
    into one date string. Words left over after the last match are
    discarded.
    """
    date_strings = []
    pending = []
    for word in time_frames:
        if word != '-':
            pending.append(word)
            if re.search(CLOCK_PATTERN, word):
                date_strings.append(' '.join(pending))
                pending = []
    return utils.get_chunks(date_strings, 2)
def init():
    """Create one Player per backing file and preload its audio packets.

    Each file in BACKING_FNs is read completely, split into pieces of
    FRAMES_PER_PACKET * FRAME_WIDTH with `get_chunks`, and stored in the
    player's `audio_packets` keyed by a sequence number starting at 0.
    Players are numbered from 1 and appended to the module-level `players`.
    """
    for i, fn in enumerate(BACKING_FNs):
        p = Player(i + 1)
        players.append(p)
        p.volume = 1.0

        data_file = wave.open(fn, 'rb')
        try:
            data = data_file.readframes(data_file.getnframes())
        finally:
            # The handle used to be left open for the life of the process.
            data_file.close()

        for sequence_number, d in enumerate(
                get_chunks(data, FRAMES_PER_PACKET * FRAME_WIDTH)):
            p.audio_packets[sequence_number] = d
def run_infer(self, sess, test, tags):
    """Evaluates performance on test set and writes the predictions out.

    For every sentence the text produced by `self.idx_restore(words, gold,
    predicted)` is written to `self.config.infer_filename` (UTF-8 with
    BOM).

    Args:
        sess: tensorflow session
        test: dataset that yields tuple of sentences, tags
        tags: {tag: index} dictionary

    Returns:
        accuracy
        f1 score
    """
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.

    # `with` closes the result file even when prediction raises.
    with open(self.config.infer_filename, 'w',
              encoding="utf-8-sig") as infer_res:
        for words, labels in minibatches(test, self.config.batch_size):
            # Copy taken before predict_batch; presumably predict_batch may
            # modify `words` -- confirm.
            words_copy = copy.deepcopy(words)
            labels_pred, sequence_lengths = self.predict_batch(sess, words)

            if self.config.chars:
                # Each sample is a pair; only the second element is kept.
                _, words_res = zip(*words_copy)
            else:
                words_res = words_copy

            for word_res, lab, lab_pred, length in zip(
                    words_res, labels, labels_pred, sequence_lengths):
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                infer_res.write(self.idx_restore(word_res, lab, lab_pred))

                accs += [a == b for (a, b) in zip(lab, lab_pred)]
                lab_chunks = set(get_chunks(lab, tags, self.config.DEFAULT))
                lab_pred_chunks = set(
                    get_chunks(lab_pred, tags, self.config.DEFAULT))
                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)

    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    acc = np.mean(accs)
    return acc, f1
def evaluate(self, dev_x, dev_y):
    """
    Score the model on the dev set.

    Batches come from `next_batch` (shuffled); each batch is decoded with
    Viterbi and compared to the padded gold labels.

    Returns a dict with "acc" and "f1", both multiplied by 100.
    """
    token_matches = []
    n_hit, n_gold, n_pred = 0., 0., 0.

    batches = next_batch(dev_x, dev_y, self.config.batch_size, shuffle=True)
    for x_batch, y_batch in batches:
        fd, sentence_lengths, label_padded, _ = self.get_fd(x_batch, y_batch)
        scores, trans_params = self.sess.run(
            [self.scores, self.trans_params], feed_dict=fd)
        viterbi_sequences = self.viterbi_decode(scores, sentence_lengths,
                                                trans_params)

        for gold_seq, pred_seq, length in zip(label_padded,
                                              viterbi_sequences,
                                              sentence_lengths):
            gold_seq = gold_seq[:length]
            pred_seq = pred_seq[:length]
            token_matches.extend(g == p for g, p in zip(gold_seq, pred_seq))

            gold_chunks = set(get_chunks(gold_seq, self.config.idx2label))
            pred_chunks = set(get_chunks(pred_seq, self.config.idx2label))
            n_hit += len(gold_chunks & pred_chunks)
            n_pred += len(pred_chunks)
            n_gold += len(gold_chunks)

    if n_hit > 0:
        precision = n_hit / n_pred
        recall = n_hit / n_gold
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    acc = np.mean(token_matches)
    return {"acc": 100 * acc, "f1": 100 * f1}
def test_non_intersected_chunks(self):
    """get_chunks with size 4 yields consecutive chunks; the last may be short."""
    chunk_size = 4
    # (input sequence, expected chunks)
    cases = [
        (tuple(range(12)), [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]),
        (tuple(range(9)), [[0, 1, 2, 3], [4, 5, 6, 7], [8]]),
        (tuple(range(10)), [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]),
        (tuple(range(11)), [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10]]),
        ([], []),
        (tuple(range(1)), [[0]]),
        (tuple(range(4)), [[0, 1, 2, 3]]),
    ]

    for sequence, expected_chunks in cases:
        actual_chunks = list(get_chunks(sequence, chunk_size))
        self.assertSequenceEqual(actual_chunks, expected_chunks)
def __generate_treelinks(self):
    """
    Generate links for the current graph that create a tree.

    This function generates the tree_links that will populate the links of
    the graph. The class works in an incremental fashion, first the links
    to create a graph are generated and then the tree is turned into a
    DAG.

    The root is linked to every node of level 1. For each pair of levels
    produced by `get_chunks`, every block of the lower level gets one
    randomly chosen parent from the upper level. Every node that never
    became a parent is finally linked to the exit node.

    Returns the list of GraphLink objects. Exits the process if the levels
    are not normalized (more blocks than available parent nodes).
    """
    tree_links = []
    allSource = set()
    allDest = set()

    # Process the root
    root = self.Position(0, 0, 0)
    for block, b in enumerate(self.treelevels[1]):
        for position, _ in enumerate(b):
            dest = self.Position(1, block, position)
            allDest.add(dest)
            tree_links.append(self.GraphLink(root, dest, 0))

    # `level` is read after the loop. Seed it so that a tree with a single
    # intermediate level (no pairs to iterate) places the exit node at
    # level 2 instead of failing with an unbound name.
    level = 0
    for level, (x, y) in enumerate(get_chunks(self.treelevels[1:-1], 2),
                                   start=1):
        election_positions = [
            self.Position(level, block, position)
            for block, b in enumerate(x)
            for position, _ in enumerate(b)
        ]
        shuffle(election_positions)

        for dest_block, block in enumerate(y):
            if not election_positions:
                # Single-argument call form prints the same text on
                # Python 2 and Python 3.
                print("Error::The tree levels are not normalized")
                sys.exit(0)
            orig_position = election_positions.pop()
            allSource.add(orig_position)
            for node_index, _ in enumerate(block):
                dest_position = self.Position(level + 1, dest_block,
                                              node_index)
                tree_links.append(
                    self.GraphLink(orig_position, dest_position, 0))
                allDest.add(dest_position)

    # Process exit node
    exit_position = self.Position(level + 2, 0, 0)
    for lastNode in allDest.difference(allSource):
        tree_links.append(self.GraphLink(lastNode, exit_position, 0))

    return tree_links
def init():
    """Create one Player per backing file, preload its packets, and queue
    one packet of zeros on the network output queue.

    Each file in BACKING_FNs is read completely, split into pieces of
    FRAMES_PER_PACKET * FRAME_WIDTH with `get_chunks`, and stored in the
    player's `audio_packets` keyed by a sequence number starting at 0.
    """
    for i, fn in enumerate(BACKING_FNs):
        p = Player(i + 1)
        globals.Players.append(p)
        p.volume = 0.25

        data_file = wave.open(fn, 'rb')
        try:
            data = data_file.readframes(data_file.getnframes())
        finally:
            # The handle used to be left open for the life of the process.
            data_file.close()

        # TODO: this implies you can't switch order of instruments
        for sequence_number, d in enumerate(
                get_chunks(data, FRAMES_PER_PACKET * FRAME_WIDTH)):
            p.audio_packets[sequence_number] = d

    # NOTE(review): assumed to run once, after all players are loaded --
    # confirm it was not meant to run once per player.
    packet = create_audio_packet(-1, create_zeros(FRAMES_PER_PACKET))
    network.output_audio_queue.append(packet)
def run_evaluate(self, test):
    """Score the model on `test`.

    Returns (acc, p, r, f1): mean per-token match plus chunk-level
    precision, recall and F1. The last three are 0 when no predicted chunk
    matches a gold chunk.
    """
    token_matches = []
    n_hit, n_gold, n_pred = 0., 0., 0.

    for words, labels in batch_yield(test, self.batch_size):
        labels_pred, sequence_lengths = self.predict_batch(words)
        for gold_seq, pred_seq, length in zip(labels, labels_pred,
                                              sequence_lengths):
            gold_seq = gold_seq[:length]
            pred_seq = pred_seq[:length]
            token_matches.extend(g == q for g, q in zip(gold_seq, pred_seq))

            gold_chunks = set(get_chunks(gold_seq, self.label2id))
            pred_chunks = set(get_chunks(pred_seq, self.label2id))
            n_hit += len(gold_chunks & pred_chunks)
            n_pred += len(pred_chunks)
            n_gold += len(gold_chunks)

    if n_hit > 0:
        p = n_hit / n_pred
        r = n_hit / n_gold
        f1 = 2 * p * r / (p + r)
    else:
        p = 0
        r = 0
        f1 = 0

    acc = np.mean(token_matches)
    return acc, p, r, f1
def split(args):
    """Define the chunks for this stage.

    A single chunk marked ``skip`` is returned when the reference has more
    than one species, there are no peaks, or the reference has no TSS
    track. Otherwise one chunk is defined per range returned by
    `utils.get_chunks(n_peaks, chunks=20)`.
    """
    if args.peaks:
        n_peaks = utils.quick_line_count(args.peaks)
    else:
        n_peaks = 0
    ref_mgr = ReferenceManager(args.reference_path)

    nothing_to_do = (len(ref_mgr.list_species()) > 1 or n_peaks == 0
                     or ref_mgr.tss_track is None)
    if nothing_to_do:
        return {'chunks': [{'skip': True}]}

    # each chunk covers the peak rows in [chunk_start, chunk_end]
    mem_in_gb = 4.0
    chunk_def = []
    for bounds in utils.get_chunks(n_peaks, chunks=20):
        chunk_def.append({
            '__mem_gb': mem_in_gb,
            'skip': False,
            'chunk_start': bounds[0],
            'chunk_end': bounds[1],
        })
    return {'chunks': chunk_def}
def parse_time_frames(self):
    """Convert `self.args.time_frames` into a list of [start, end] pairs.

    When the first argument is all digits, the arguments are read in
    groups of three. A group is kept only when its concatenation contains
    a "<digits>-<digits>" range; elements 0 and 2 are then converted to
    int after commas are removed.

    Otherwise the arguments are treated as date strings: they are paired
    by `parse_date_string` and each side is converted to a timestamp with
    `utils.parse_date`.
    """
    modified_tf = []
    if self.args.time_frames[0].isdigit():
        for chunk in utils.get_chunks(self.args.time_frames, 3):
            if re.search(r'[0-9]+-[0-9]+', ''.join(chunk)):
                # Strip thousands separators before converting.
                modified_tf.append([
                    int(chunk[0].replace(',', '')),
                    int(chunk[2].replace(',', '')),
                ])
    else:
        for tf in self.parse_date_string(self.args.time_frames):
            modified_tf.append([
                utils.parse_date(tf[0]).timestamp(),
                utils.parse_date(tf[1]).timestamp(),
            ])
    return modified_tf
def split(args):
    """Write the distinct barcodes to per-chunk files and define the chunks.

    Returns no chunks when there is no fragments file. Otherwise the
    distinct barcodes (fourth field of every fragment record) are divided
    into the ranges returned by `utils.get_chunks(len(barcodes), 30)`, and
    each range is written to its own `barcode_<n>.txt`, one per line.
    """
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    # The fragments file is not sorted by barcode, so it is scanned in full
    # to collect each barcode once (the resulting order is that of the set).
    unique_barcodes = set()
    for _, _, _, bc, _ in open_fragment_file(args.fragments):
        unique_barcodes.add(bc)
    barcodes = list(unique_barcodes)

    # chunk on barcodes
    chunks = []
    for num, bounds in enumerate(utils.get_chunks(len(barcodes), 30)):
        bc_path = martian.make_path('barcode_{}.txt'.format(num))
        selected = barcodes[bounds[0]:bounds[1]]
        with open(bc_path, 'w') as handle:
            handle.write('\n'.join(selected))
        chunks.append({'barcodes': bc_path})

    return {'chunks': chunks, 'join': {'__mem_gb': 16}}
def callback(in_data, frame_count, time_info, status):
    """Audio stream callback: queue captured packets, return the next output.

    Captured data is split into packets. Once output has started
    (`output_start_time` is non-zero), each packet gets a sequence number
    derived from its capture time relative to `output_start_time`; packets
    with a non-negative number are appended to `input_queue`.

    The returned buffer is the head of `output_queue` when it has data,
    otherwise zeros. `output_start_time` is set from the first output
    buffer's DAC time.
    """
    global output_start_time

    packets = get_chunks(in_data, FRAMES_PER_PACKET * FRAME_WIDTH)
    for index, frames in enumerate(packets):
        if output_start_time != 0:
            sample_gap = 1.0 / FRAME_RATE
            offset = index * sample_gap * FRAMES_PER_PACKET
            packet_time = (time_info['input_buffer_adc_time'] + offset
                           - output_start_time)
            seq_num = int(packet_time / (FRAMES_PER_PACKET * sample_gap))
            if seq_num >= 0:
                input_queue.append((seq_num, frames))

    out_data = create_zeros(frame_count)
    if len(output_queue) > 0:
        if output_start_time == 0:
            output_start_time = time_info['output_buffer_dac_time']
        out_data = output_queue.pop(0)

    return (out_data, pa.paContinue)
async def consume_data(self, data: TimeSeries, cache: Optional[ModelCache]) -> Optional[dict]:
    """Feed `data` to the detector chunk by chunk and merge what it returns.

    Chunks for which the detector returns None are ignored. Returns None
    when no chunk produced a result, otherwise the `to_json()` form of the
    concatenated results.
    """
    window_size = self._detector.get_window_size(cache)
    chunk_size = window_size * self.CHUNK_WINDOW_SIZE_FACTOR

    detections: List[DetectionResult] = []
    for piece in get_chunks(data, chunk_size):
        # Yield to the event loop between chunks.
        await asyncio.sleep(0)
        outcome = self._detector.consume_data(prepare_data(piece), cache)
        if outcome is not None:
            detections.append(outcome)

    if not detections:
        return None
    merged = self._detector.concat_detection_results(detections)
    return merged.to_json()
def build(self):
    """Rebuild the reverse index and user-length collections.

    Steps:
      1. empty the `reverse_index` and `user_length` collections, printing
         how many documents were removed from each;
      2. scan the forward index, recording the token count of every user
         and updating the in-memory reverse index per token;
      3. restart the mongod service;
      4. insert the reverse index in chunks of ReverseIndex.CHUNK_SIZE
         tokens, then insert the user lengths.
    """
    result = self.data_base.reverse_index.delete_many({})
    print(result.deleted_count)
    result = self.data_base.user_length.delete_many({})
    print(result.deleted_count)

    for user in tqdm.tqdm(self.data_base.forward_index.find(),
                          total=self.data_base.forward_index.count()):
        splitted = user['text'].split()
        self.user_length[user['uid']] = len(splitted)
        # Plain loop: the calls are made for their side effect only.
        for token in splitted:
            self.update_reverse(token, user['uid'])

    subprocess.run(['sudo', 'service', 'mongod', 'stop'])
    time.sleep(2)
    subprocess.run(['sudo', 'service', 'mongod', 'start'])
    time.sleep(2)

    token_chunks = get_chunks(list(self.reverse_index.keys()),
                              ReverseIndex.CHUNK_SIZE)
    for chunk in tqdm.tqdm(token_chunks, total=len(token_chunks)):
        chunk_insertion = [{
            'token': token,
            'uids_freqs': list(self.reverse_index[token].items())
        } for token in chunk]
        # insert_many rejects an empty document list.
        if chunk_insertion:
            self.data_base.reverse_index.insert_many(chunk_insertion)

    length_insertions = [{
        'uid': uid,
        'length': length
    } for uid, length in self.user_length.items()]
    if length_insertions:
        self.data_base.user_length.insert_many(length_insertions)
async def process_data(self, data: TimeSeries, cache: ModelCache) -> dict:
    """Run the processing detector over `data` chunk by chunk.

    Requires the detector to be a `detectors.ProcessingDetector` and
    `cache` to be set (both checked with assertions). Chunks for which the
    detector returns None are ignored.

    Raises:
        RuntimeError: when no chunk produced a result.

    Returns the `to_json()` form of the concatenated processing results.
    """
    assert isinstance(self._detector, detectors.ProcessingDetector), \
        f'{self.analytic_unit_id} detector is not ProcessingDetector, can`t process data'
    assert cache is not None, f'{self.analytic_unit_id} got empty cache for processing data'

    results = []
    window_size = self._detector.get_window_size(cache)
    chunk_size = window_size * self.CHUNK_WINDOW_SIZE_FACTOR
    for piece in get_chunks(data, chunk_size):
        # Yield to the event loop between chunks.
        await asyncio.sleep(0)
        outcome = self._detector.process_data(prepare_data(piece), cache)
        if outcome is not None:
            results.append(outcome)

    if not results:
        raise RuntimeError(
            f'process_data for {self.analytic_unit_id} got empty processing results'
        )

    # TODO: maybe we should process all chunks inside of detector?
    merged = self._detector.concat_processing_results(results)
    return merged.to_json()
def __generate_treelevels(self, root, exitNode, nodelists, depth):
    """
    Generate the levels of the tree using the nodelists.

    root -> root of the tree.
    exitNode -> node placed alone in the last level.
    nodelists -> A list of lists containing nodes.
    depth -> The depth of the tree (values of 2 or less are treated as 3)

    The first level holds the root, the second the first nodelist; the
    remaining nodelists are grouped `lists_per_level` at a time. When the
    depth is too big for the number of nodelists a warning is printed,
    `self.valid_graph` is set to False and one list per level is used.

    Return a list of lists.
    """
    res = [[[root]], [nodelists[0]]]

    if depth <= 2:
        depth = 3

    # Floor division keeps the chunk size an integer on Python 3 as well
    # (identical to `/` on Python 2 integers).
    lists_per_level = (len(nodelists) - 1) // (depth - 2)
    if lists_per_level <= 0:
        # Single-argument call form prints the same text on Python 2 and 3.
        print("Warning::The specified depth is too big")
        self.valid_graph = False
        lists_per_level = 1

    middle_levels = list(
        get_chunks(nodelists[1:], lists_per_level, lists_per_level))
    return res + middle_levels + [[[exitNode]]]
def main():
    """Compute the average QoE and Manhattan tile error for PanoSalNet.

    Command-line arguments select the dataset (1 or 2), the topic, the
    video fps and the preferred bitrate quality. Video and player
    dimensions come from ./meta.json. For every user the actual viewport
    and the predicted tile probabilities are loaded from pickle files, the
    per-frame Manhattan error between the actual and the most probable
    predicted tile is averaged (columns wrap around), and the QoE is
    computed from the bitrate allocated to the predicted tiles. Averages
    over all users are printed.
    """
    parser = argparse.ArgumentParser(
        description='Calculate QoE and error for PanoSalNet algorithm')
    parser.add_argument('-D', '--dataset', type=int, required=True,
                        help='Dataset ID (1 or 2)')
    parser.add_argument('-T', '--topic', required=True,
                        help='Topic in the particular Dataset (video name)')
    parser.add_argument('--fps', type=int, required=True,
                        help='fps of the video')
    parser.add_argument(
        '-Q', '--quality', required=True,
        help='Preferred bitrate quality of the video (360p, 480p, 720p, 1080p, 1440p)')
    args = parser.parse_args()

    if args.dataset != 1 and args.dataset != 2:
        print("Incorrect value of the Dataset ID provided!!...")
        print("======= EXIT ===========")
        exit()

    # Get the necessary information regarding the dimensions of the video
    print("Reading JSON...")
    with open('./meta.json') as meta_file:
        jsonRead = json.load(meta_file)

    ds_meta = jsonRead["dataset"][args.dataset - 1]
    nusers = ds_meta["nusers"]
    width = ds_meta["width"]
    height = ds_meta["height"]
    view_width = ds_meta["view_width"]
    view_height = ds_meta["view_height"]
    milisec = ds_meta["milisec"]

    pref_bitrate = jsonRead["bitrates"][args.quality]
    ncol_tiles = jsonRead["ncol_tiles"]
    nrow_tiles = jsonRead["nrow_tiles"]
    player_width = jsonRead["player_width"]
    player_height = jsonRead["player_height"]

    player_tiles_x = math.ceil(player_width * ncol_tiles * 1.0 / width)
    player_tiles_y = math.ceil(player_height * nrow_tiles * 1.0 / height)

    PATH_ACT = '../../Viewport/ds{}/'.format(args.dataset)
    PATH_PRED = './head_prediction/ds{}/'.format(args.dataset)

    manhattan_error, x_mae, y_mae, final_qoe = [], [], [], []
    count_frames = 0

    for usernum in range(nusers):
        print('User_{}'.format(usernum))
        user_manhattan_error = 0.

        # `dataset` / `topic` were undefined bare names here; the values
        # live on `args`.
        act_path = PATH_ACT + "viewport_ds{}_topic{}_user{}".format(
            args.dataset, args.topic, usernum + 1)
        with open(act_path, "rb") as act_file:
            viewport = pickle.load(act_file, encoding='latin1')

        # NOTE(review): the prediction file is indexed by `usernum` while
        # the viewport file uses `usernum + 1` -- confirm this is intended.
        pred_path = PATH_PRED + "topic{}_user{}".format(args.topic, usernum)
        with open(pred_path, "rb") as pred_file:
            p_viewport = pickle.load(pred_file, encoding="latin1")

        # `milisec` is read from meta.json; the parser defines no such
        # option, so `args.milisec` raised AttributeError.
        frame_nos = []
        act_viewport, frame_nos, max_frame = get_act_tiles(
            viewport, frame_nos, args.fps, milisec, width, height,
            view_width, view_height)

        # Predicted Tile = max of the probabilities in output
        pred_max_viewport = []
        for fr in range(len(p_viewport)):
            prob = p_viewport[fr]
            argmax = np.where(prob == prob.max())
            pred_max_viewport.append((argmax[0][0], argmax[1][0]))

        # Assert len(actual frames) = len(predicted frames)
        pred_viewport = p_viewport
        act_viewport = act_viewport[:len(pred_viewport)]
        frame_nos = frame_nos[:len(pred_viewport)]
        pred_viewport = pred_viewport[:len(act_viewport)]
        frame_nos = frame_nos[:len(pred_viewport)]

        # Calculate Manhattan Error
        for fr in range(len(pred_max_viewport)):
            act_tile = act_viewport[fr]
            pred_tile = pred_max_viewport[fr]

            # Get corrected error: columns wrap around, so take the shorter
            # way round.
            tile_row_dif = act_tile[0] - pred_tile[0]
            col_gap = abs(act_tile[1] - pred_tile[1])
            tile_col_dif = min(col_gap, ncol_tiles - col_gap)

            current_tile_error = abs(tile_row_dif) + abs(tile_col_dif)
            user_manhattan_error += current_tile_error

        manhattan_error.append(user_manhattan_error / len(pred_max_viewport))
        count_frames += len(act_viewport)

        act_tiles, pred_tiles, chunk_frames = get_chunks(
            act_viewport, pred_viewport, frame_nos, max_frame, args.fps)

        # Allocate bitrate
        vid_bitrate = alloc_bitrate(pred_tiles, chunk_frames, nrow_tiles,
                                    ncol_tiles, pref_bitrate)

        # Calculate QoE
        q = calc_qoe(vid_bitrate, act_tiles, frame_nos, chunk_frames, width,
                     height, nrow_tiles, ncol_tiles, player_width,
                     player_height)
        final_qoe.append(q)

    avg_qoe = np.mean(final_qoe)
    avg_manhattan_error = np.mean(manhattan_error)

    # Print averaged results
    print("\n======= RESULTS ============")
    print('PanoSalNet')
    print('Dataset: {}'.format(args.dataset))
    print('Topic: ' + args.topic)
    print('Pred_nframe: {}'.format(args.fps))
    print('Avg. QoE: {}'.format(avg_qoe))
    print('Avg. Manhattan error: {}'.format(avg_manhattan_error))
    print('Count: {}'.format(count_frames))
    print('\n\n')
import utils


def _count_chunks(gold_labels, predictions):
    """Return (matching, predicted, gold) chunk counts for one sentence."""
    lab_chunks = set(utils.get_chunks(gold_labels))
    lab_pred_chunks = set(utils.get_chunks(predictions))
    return (len(lab_chunks & lab_pred_chunks), len(lab_pred_chunks),
            len(lab_chunks))


if __name__ == '__main__':
    # Computes the chunk-level F1 of the tagged file '005_test.txt'.
    # Lines with exactly `format_len` whitespace-separated fields are token
    # lines: the second-to-last field is the prediction and the last one
    # the gold label. Any other line ends the current sentence.
    correct_preds, total_correct, total_preds = 0., 0., 0.
    format_len = 8
    predictions = []
    gold_labels = []

    with codecs.open('005_test.txt', 'r', 'UTF-8') as file_input:
        for cur_line in file_input:
            entity = cur_line.strip().split()
            if len(entity) == format_len:
                predictions.append(entity[-2])
                gold_labels.append(entity[-1])
            else:
                hits, n_pred, n_gold = _count_chunks(gold_labels, predictions)
                correct_preds += hits
                total_preds += n_pred
                total_correct += n_gold
                gold_labels = []
                predictions = []

    # Score the last sentence too when the file does not end with a
    # separator line; it used to be dropped silently.
    if gold_labels:
        hits, n_pred, n_gold = _count_chunks(gold_labels, predictions)
        correct_preds += hits
        total_preds += n_pred
        total_correct += n_gold

    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    # Call form prints the same text on Python 2 and Python 3.
    print(f1)
required=True, choices=['train', 'valid', 'test'], help="train/valid/test data?") args = parser.parse_args() assert os.path.exists(args.src_fname), "file {} not found".format( args.src_fname) assert os.path.exists(args.ref_fname), "file {} not found".format( args.ref_fname) src_lines = read_file(args.src_fname) ref_lines = read_file(args.ref_fname) assert len(src_lines) == len(ref_lines), \ "src has {} lines but ref has {} lines".format(len(src_lines), len(ref_lines)) src_lines_chunked = get_chunks(src_lines, args.num_splits) ref_lines_chunked = get_chunks(ref_lines, args.num_splits) with parallel_backend('multiprocessing', n_jobs=args.num_splits): Parallel()(delayed(write_to_presumm_format) (chunk_idx, src_chunk, ref_chunk, args.split, args.presumm_out, args.output_name) for chunk_idx, (src_chunk, ref_chunk) in enumerate( zip(src_lines_chunked, ref_lines_chunked))) """ # To run export BASE_DATA_PATH=/projects/tir5/users/aashfaq/Capstone/data/genetic/combined export DATA_DIR=$BASE_DATA_PATH/bertsum_data_train/ mkdir $DATA_DIR python -m write_to_presumm_format -src_fname $BASE_DATA_PATH/train.ext -ref_fname $BASE_DATA_PATH/train.target -presumm_out $DATA_DIR -num_splits 20 -split train
def add_malware():
    '''
    Adds a sample to the repository.

    Performs hashing and filemagic analysis of the uploaded sample. The
    sample is stored only when no file with the same MD5 exists yet.

    @tags : comma separated tags list
    @file : binary sample stream

    returns : JSON status message, or a (JSON error, 504) pair when the
              configured API timeout is exceeded
    '''
    try:
        with timeout(Config().api.timeout * 60, exception=RuntimeError):
            tags = request.forms.get('tags').split(',')
            data = request.files.file
            data.file.seek(0)
            filename = data.filename
            sampleData = data.file.read()

            # The log line used an undefined name (`sampleEntry`), which
            # raised NameError on every request; the upload's file name is
            # the identifier available here.
            logging.debug('[%s] Generating hashes', filename)
            md5 = hashlib.md5(sampleData).hexdigest()
            sha1 = hashlib.sha1(sampleData).hexdigest()
            sha256 = hashlib.sha256(sampleData).hexdigest()
            sha512 = hashlib.sha512(sampleData).hexdigest()
            filetype = get_type(sampleData)
            key = {'md5': md5}

            logging.debug(
                'Quering database for already existing file (hash=%s)', md5)
            existing = db.fs.files.find_one({'md5': md5})

            if existing:
                logging.info('Sample already exists')
                logging.info('Verifying contents')
                if md5 != existing['md5']:
                    logging.warning('Checksum not matching')
                    upload_sample = True
                else:
                    logging.info('Checksum matching')
                    upload_sample = False
            else:
                upload_sample = True

            if upload_sample:
                logging.debug('Uploading sample')
                new = fs.new_file(filename=filename,
                                  sha1=sha1,
                                  sha256=sha256,
                                  sha512=sha512,
                                  filetype=filetype)
                try:
                    for chunk in get_chunks(sampleData):
                        logging.debug('writing chunk')
                        new.write(chunk)
                finally:
                    # Close the stored file even when a write fails.
                    new.close()
                logging.info('Uploaded sample')

            # NOTE(review): assumed to run for already-existing samples as
            # well -- confirm it was not meant only for new uploads.
            add_tags(key=key, tags=tags)

            logging.debug('Reclaiming memory')
            del sampleData

            response.content_type = 'application/json'
            return jsonize({'message': 'added'})
    except RuntimeError:
        response.content_type = 'application/json'
        return (jsonize({'error': 'timeout'}), 504)
def Eval(sess):
    """Builds and evaluates a network.

    Loads the pickled word/tag/prefix/suffix index maps, builds a
    GreedyTagger, restores it from FLAGS.model_path, tags every sentence
    of FLAGS.test_corpus, writes "word gold predicted" lines to
    FLAGS.output_path (one blank line between sentences) and logs the
    chunk-level F1 score.

    Args:
      sess: tensorflow session used to initialise, restore and run the
        tagger.
    """
    logging.set_verbosity(logging.INFO)
    #bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "ner_word_index"
    nerMapPath = "ner_index"
    pMapPath = "ner_prefix_index"
    sMapPath = "ner_suffix_index"

    prefix2id = utils.read_pickle_file(pMapPath)
    suffix2id = utils.read_pickle_file(sMapPath)
    word2id = utils.read_pickle_file(wordMapPath)
    tag2id = utils.read_pickle_file(nerMapPath)

    loading_time = time.time()
    logging.info("loading data and precomputing features...")
    dataset = Dataset(None, None, FLAGS.test_corpus,
                      format_list=['FORM', 'NER'])
    dataset.load_dataset(word2id=word2id,
                         tag2id=tag2id,
                         prefix2id=prefix2id,
                         suffix2id=suffix2id,
                         fgen=False)
    name = 'test'
    logging.info('training sentences: %d', dataset.get_sent_num(name))
    logging.info("loading time: %.2f", time.time() - loading_time)

    if FLAGS.word_only:
        feature_sizes = [8]
        domain_sizes = [dataset.vocabulary_size]
        embedding_dims = [100]
    else:
        # num of features for each feature group:
        # capitalization, words, prefix_2, suffix_2, tags_history
        feature_sizes = [8, 8, 2, 8, 8, 4]
        domain_sizes = [
            dataset.vocabulary_size, 3, 3, dataset.prefix_size,
            dataset.suffix_size, dataset.number_of_classes + 1
        ]
        embedding_dims = [100, 8, 8, 50, 50, 50]

    num_actions = dataset.number_of_classes
    # Materialise as a list: on Python 3 a bare map() is a one-shot
    # iterator, not a sequence of layer sizes.
    hidden_layer_sizes = [
        int(size) for size in FLAGS.hidden_layer_sizes.split(',')
    ]
    tagger = GreedyTagger(num_actions,
                          feature_sizes,
                          domain_sizes,
                          embedding_dims,
                          hidden_layer_sizes,
                          gate_gradients=True)
    tagger.AddEvaluation(FLAGS.batch_size)
    tagger.AddSaver()
    # list(...) keeps the fetches a plain list on Python 3 as well.
    sess.run(list(tagger.inits.values()))
    tagger.saver.restore(sess, FLAGS.model_path)

    logging.info('Evaluating training network.')
    num_epochs = None
    epochs = 0
    epochs, sent_batch = utils.loadBatch(FLAGS.batch_size, epochs, dataset,
                                         name)
    number_of_words = 0

    # Tag batches until the first sentence of the batch reports an epoch
    # beyond the one observed on the first iteration.
    while True:
        sent_batch, epochs, feature_endpoints, _, _ = (
            utils.get_current_features(sent_batch, epochs, dataset, name,
                                       FLAGS.word_only))
        predictions, _ = sess.run(
            [tagger.evaluation['predictions'], tagger.evaluation['logits']],
            feed_dict={tagger.test_input: feature_endpoints})
        utils.set_current_tags(sent_batch, predictions)
        if num_epochs is None:
            num_epochs = epochs
        elif num_epochs < sent_batch[0].get_epoch():
            break

    dataset.reset_index(name)
    for sent in sent_batch:
        sent.reset_state()

    correct_preds, total_correct, total_preds = 0., 0., 0.
    # The context manager closes the output file even if scoring fails
    # part-way through the corpus.
    with codecs.open(FLAGS.output_path, 'w', 'UTF-8') as output_file:
        while dataset.has_next_sent(name):
            sent = dataset.get_next_sent(name)
            words = sent.get_word_list()
            number_of_words += len(words)
            gold_labels = sent.ner_ids

            lab_chunks = set(utils.get_chunks(gold_labels, dataset.id2tag))
            lab_pred_chunks = set(
                utils.get_chunks(sent.output_tags, dataset.id2tag))
            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds += len(lab_pred_chunks)
            total_correct += len(lab_chunks)

            # One "word gold predicted" line per token, then a blank line
            # separating sentences.
            output_string = ''.join(
                ' '.join([word, dataset.id2tag[gold], dataset.id2tag[pred]])
                + '\n'
                for word, gold, pred in zip(words, gold_labels,
                                            sent.output_tags))
            output_file.write(output_string + '\n')

    # correct_preds > 0 implies both denominators are non-zero.
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    logging.info("f1 score: %.2f; number of words: %d", f1, number_of_words)
def evaluate(self, test, is_test_set=False):
    """Evaluate the model on a dataset and dump labels/predictions.

    Runs predict_batch over minibatches of ``test``, accumulates
    token-level accuracy and chunk-level precision/recall/F1, and pickles
    the truncated gold and predicted label sequences to
    ``<dir_model>lab_.pkl`` and ``<dir_model>pred_.pkl``.

    Args:
        test: indexable pair; test[0] holds the sentences and test[1]
            the gold label sequences.
        is_test_set: when True, precision/recall/F1 are printed and the
            mispredicted sentences are collected.

    Returns:
        dict with keys "acc", "f1", "Precision" and "Recall", each
        scaled to a percentage.
    """
    import pickle

    x_test = test[0]
    y_test = test[1]
    accs = []
    # wrong_predictions is a list of tuples of type
    # (sentence, fp_set, fn_set, lab, lab_pred). It is only collected;
    # the writer that used to consume it is disabled.
    wrong_predictions = []
    lab_c = []
    pred_c = []
    correct_preds, total_correct, total_preds = 0., 0., 0.

    for sentences_batch, labels_batch in get_minibatch(
            (x_test, y_test), self.config.batch_size):
        labels_pred_batch, sequence_lengths_batch = self.predict_batch(
            sentences_batch)
        batch_rows = zip(labels_batch, labels_pred_batch,
                         sequence_lengths_batch)
        for sentence_index, (lab, lab_pred, length) in enumerate(batch_rows):
            # Drop padding beyond the real sentence length.
            lab = lab[:length]
            lab_pred = lab_pred[:length]
            accs += [a == b for (a, b) in zip(lab, lab_pred)]

            lab_chunks = set(
                get_chunks(lab, self.config.vocab_tags, self.config))
            lab_pred_chunks = set(
                get_chunks(lab_pred, self.config.vocab_tags, self.config))
            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds += len(lab_pred_chunks)
            total_correct += len(lab_chunks)
            lab_c.append(lab)
            pred_c.append(lab_pred)

            fp_preds = lab_pred_chunks - lab_chunks
            fn_preds = lab_chunks - lab_pred_chunks
            if is_test_set and (fp_preds or fn_preds):
                wrong_predictions.append(
                    (sentences_batch[sentence_index], fp_preds, fn_preds,
                     lab, lab_pred))

    # correct_preds > 0 implies both denominators are non-zero.
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    acc = np.mean(accs)

    if is_test_set:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Precision: " + str(p))
        print("Recall: " + str(r))
        print("F1: " + str(f1))

    # HIGHEST_PROTOCOL is a binary pickle format, so the files must be
    # opened in binary mode; the context managers also make sure the
    # data is flushed and the handles are closed.
    pdir = self.config.dir_model
    with open(pdir + 'lab_.pkl', 'wb') as lab_file:
        pickle.dump(lab_c, lab_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(pdir + 'pred_.pkl', 'wb') as pred_file:
        pickle.dump(pred_c, pred_file, protocol=pickle.HIGHEST_PROTOCOL)

    return {
        "acc": 100 * acc,
        "f1": 100 * f1,
        "Precision": 100 * p,
        "Recall": 100 * r
    }
for i in range(num_epochs): """ Set variables to zero """ batch_losses, counter, batch_accuracy = 0, 0, 0 recall, precision, val_recall, val_precision = 0, 0, 0, 0 epoch_losses, epoch_counter, epoch_accuracy = 0, 0, 0 for data in dataloader: """ Training """ net.train() targets = data['training']['labels'] inputs = data['training']['sequence'] for k in num_batches: """ Get truncated steps """ x, t = get_chunks(inputs, targets, k, mini_batch) if torch.sum(t) > 0 or np.random.uniform(0, 1) < 0.1: outputs = net(x) pred = torch.max(outputs, dim=1)[1].data.numpy()[0, :] ground = t.data.numpy()[0, :] optimizer.zero_grad() loss = loss_fn(outputs, t) loss.backward() optimizer.step() batch_losses += loss.data.numpy() batch_accuracy += f1_score(ground, pred, average='macro') recall += recall_score(ground, pred, average='macro')