def test(model_input, labels, model, loss_fn=None, batch_size=32): """ Args: model_input: list of tuples containing input to model labels: list of tuples containing labels corresponding to model input for training model: (torch.nn.Module) the neural network loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch batch_size: maximum batch_size Returns: metrics: dict """ metrics = {} for batch_input, batch_labels in zip(grouper(model_input, batch_size), grouper(labels, batch_size)): batch_input = list( filter(lambda x: x is not None, batch_input)) # remove None objects introduced by grouper batch_labels = list( filter(lambda x: x is not None, batch_labels)) # remove None objects introduced by grouper batch_metrics = test_batch(batch_input, batch_labels, model, loss_fn=loss_fn) add_dict(metrics, batch_metrics) return metrics
def test_grouper(self):
    self.assertEqual(
        [list(g) for g in utils.grouper(python_utils.RANGE(7), 3)],
        [[0, 1, 2], [3, 4, 5], [6, None, None]])

    # Returns an iterable of iterables, so we need to combine them into
    # strings for easier comparison.
    self.assertEqual(
        [''.join(g) for g in utils.grouper('ABCDEFG', 3, fillvalue='x')],
        ['ABC', 'DEF', 'Gxx'])
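# Most snippets in this collection rely on a ``grouper`` helper in the spirit of the
# classic itertools recipe exercised by the test above. A minimal sketch, assuming the
# zip_longest-based variant; note that several snippets below call it with the chunk
# size first, i.e. grouper(n, iterable), so the argument order is project-specific.
from itertools import zip_longest


def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks, padding the last chunk with fillvalue."""
    # grouper('ABCDEFG', 3, fillvalue='x') -> ('A', 'B', 'C'), ('D', 'E', 'F'), ('G', 'x', 'x')
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)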
def test_epoch_in_batches(self, batch_size): test_list = list(range(len(self.og.test.images))) np.random.shuffle(test_list) for batch_i in grouper(test_list, batch_size): batch = [(self.og.test.images[i], self.og.test.labels[i]) for i in batch_i if i is not None] yield zip(*batch)
def main(args): global DEBUG if len(args) == 1: # no args - repl while True: print 'que?>', try: print google_it(raw_input()) except EOFError: break except: import traceback traceback.print_exc() else: # test mode DEBUG = False print 'Loading testfile...' tests = filter(bool, open(args[1]).read().split('\n')) print len(tests), 'tests' for clue, answer in utils.grouper(2, tests): clue = clue.split('~!clue')[1] answer = answer.split("~!answer")[1] try: print '----------------------------------------------------------------' print 'clue:', clue print 'correct:', answer print 'eubank:', google_it(clue) except KeyboardInterrupt: sys.exit(0) except: import traceback traceback.print_exc()
def main(): description = 'Split a FASTA file into multiple subfiles.' parser = ArgumentParser(description=description, parents=[get_default_argument_parser()]) parser.add_argument('-f', '--in-format', default=_DEFAULT_FMT, help="A biopython file format string.") parser.add_argument('-n', '--num-files', type=int, default=_DEFAULT_N, help=("The number of splits. " "DEFAULT=%d") % _DEFAULT_N) parser.add_argument('in_path', nargs='?', default=None, help=("The path of the file to be read in. " "If no argument given, reads from STDIN.")) parser.add_argument('out_pattern', default=None, help=("Output file names format string. " "Must contain one '%%d' for the file number.")) args = parser.parse_args() if args.in_path is None: record_parser = SeqIO.parse(sys.stdin, args.in_format) else: record_parser = SeqIO.parse(args.in_path, args.in_format) write_multithread(grouper(record_parser, 100), lambda recs, handle: SeqIO.write(recs, handle, args.in_format), args.out_pattern, n=args.num_files)
def generate_predictions(model, lemmas, tags, batch_size=32): """Returns predicted inflected forms for given lemmas and tags.""" lemmas_indices = model.vocab.words_to_indices(lemmas, start_char=True, stop_char=True) tags_indices = model.vocab.tag_to_indices(tags) model_input = list(zip(lemmas_indices, tags_indices)) predictions = [] for batch_input in grouper(model_input, batch_size): batch_input = list( filter(lambda x: x is not None, batch_input)) # remove None objects introduced by grouper # set model to evaluating mode model.eval() # compute model output and loss p_ws, a_ls, p_gens = model(*zip(*batch_input)) batch_predictions = [ word.split(model.vocab.STOP_CHAR)[0] for word in model.vocab.indices_to_word(p_ws.argmax(2)) ] predictions += batch_predictions return predictions
def insert(rows):
    """Insert or bulk-insert values into the table.

    Parameters
    ----------
    rows : str
        A string of comma-separated values whose count is a multiple of the
        number of columns in the table. Individual rows may additionally be
        delimited with parentheses, which are stripped before insertion.
    """
    # TODO: try to handle special characters that are difficult
    global no_cols
    if no_cols is None:
        no_cols = len(get_one()[0])
    rd = csv.DictReader(io.StringIO(rows))
    try:
        # TODO: figure out what errors could occur here
        dta = [item.rstrip(")").lstrip(" (") for item in rd.fieldnames]
        data = list(utils.grouper(no_cols, dta))
        fields = ("?, " * no_cols).rstrip(", ")
        command = "INSERT INTO t1 VALUES (%s)" % fields
        db.executemany(command, data)
    except:
        raise
    db.commit()
    return "Successfully inserted %s" % rows
def train(self, sentences, total_words=None, word_count=0, chunksize=100): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of utf8 strings. """ logger.info("training model on %i vocabulary and %i features" % (len(self.vocab), self.layer1_size)) if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") start, next_report = time.time(), 1.0 if not total_words: total_words = sum(v.count for v in self.vocab.itervalues()) # convert input string lists to Vocab objects (or None for OOV words) no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences) # run in chunks of e.g. 100 sentences (= 1 job) for job in utils.grouper(no_oov, chunksize): # update the learning rate before every job alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count / total_words)) # how many words did we train on? out-of-vocabulary (unknown) words do not count job_words = sum(train_sentences(self, sentence, alpha) for sentence in job) word_count += job_words # report progress elapsed = time.time() - start if elapsed >= next_report: logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (100.0 * word_count / total_words, alpha, word_count / elapsed if elapsed else 0.0)) next_report = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports elapsed = time.time() - start logger.info("training on %i words took %.1fs, %.0f words/s" % (word_count, elapsed, word_count / elapsed if elapsed else 0.0)) return word_count
def main(): logger = configure_logging('parse_serverstatus') client = InfluxDBClient(host=args.influxdb_host, ssl=args.ssl, verify_ssl=False, port=8086, database=args.database) with open(args.input_file, 'r') as f: for line_number, chunk in enumerate(grouper(f, args.batch_size)): # print(line_number) json_points = [] for line in chunk: # zip_longest will backfill any missing values with None, so we need to handle this, otherwise we'll miss the last batch if line: try: server_status_json = json.loads(line) # print((line_number + 0) * _BATCH_SIZE) # print((line_number + 1) * _BATCH_SIZE) common_metric_data = get_metrics("serverstatus", server_status_json, common_metrics, line_number) json_points.append(create_point(*common_metric_data)) wiredtiger_metric_data = get_metrics("serverstatus_wiredtiger", server_status_json, wiredtiger_metrics, line_number) json_points.append(create_point(*wiredtiger_metric_data)) # for metric_data in get_metrics(server_status_json, common_metrics, line_number): # import ipdb; ipdb.set_trace() # print(json_points) # json_points.append(create_point(*metric_data)) # # for metric in get_metrics(server_status_json, wiredtiger_metrics, line_number): # json_points.append(create_point(*metric)) # for metric in get_metrics(server_status_json, mmapv1_metrics, line_number): # json_points.append(create_point(*metric)) except ValueError: logger.error("Line {} does not appear to be valid JSON - \"{}\"".format(line_number, line.strip())) write_points(logger, client, json_points, line_number)
def get_context(title, word_to_synset, nasari): context = [] # Split the title to avoid generating too many sense combinations # The second argument is the number of title words taken into account when determining the context # This number can be changed for chunk in utils.grouper(title, 6): print("chunk", chunk) babel_ids = get_babel_ids(chunk, word_to_synset) # Possible sense combinations via the Cartesian product lista_ids = list(itertools.product(*babel_ids)) print("number of combinations", len(lista_ids)) max_sim_tup = 0 for word in chunk: # Initialize the best tuple with the first senses that exist in Nasari best_tup_ids = get_vector(word, word_to_synset, nasari) for tuple_ids in lista_ids: # TWO WAYS TO MEASURE SIMILARITY # sim_tup = similarity_tuple_intersection(tuple_ids, nasari) sim_tup = similarity_tuple(tuple_ids, nasari) if sim_tup > max_sim_tup: max_sim_tup = sim_tup best_tup_ids = tuple_ids # Build the context of the current chunk for best_id in best_tup_ids: vect = nasari.get( best_id ) # Extract the vector of the best senses from Nasari if vect is not None: context.append(vect) return clean_context(context)
def _unblock(self, bot, update):
    message = update.message
    if message.from_user.id != self._admin_id and not is_user_group_admin(bot, message.from_user.id, message.chat_id, self._admin_id):
        message.reply_text(text=self._ADMIN_RESTRICTION_MESSAGE, quote=False)
        return

    blocked_stickerpacks = self._get_blocked_stickerpacks()
    packs_list = []
    buttons = []
    if blocked_stickerpacks:
        for index, stickerpack in enumerate(blocked_stickerpacks, start=1):
            packs_list.append(f'{index}. [{stickerpack.name}]({self._get_stickers_link(stickerpack.name)})')
            buttons.append(
                InlineKeyboardButton(
                    text=str(index),
                    callback_data=set_callback_data(stickerpack.id))
            )
        response_text = '*Заблокированные стикерпаки:*\n{}\n\nКакой *разблокировать*?'.format("\n".join(packs_list))
        keyboard = grouper(buttons, 5)  # 5 buttons per row, since the label on each one is short
        reply_markup = InlineKeyboardMarkup(keyboard, one_time_keyboard=True)
    else:
        response_text = self._NO_STICKERPACKS_BLOCKED_MESSAGE
        reply_markup = None

    message.reply_text(text=response_text, parse_mode=ParseMode.MARKDOWN, reply_markup=reply_markup, quote=False)
def compute_descriptors(infile, descriptor_types): """Reads low-level descriptors from DenseTracks.""" LEN_LINE = 436 POS_IDXS = [1, 2, 0] # Position coordinates (X, Y, T). NORM_POS_IDXS = [7, 8, 9] # Normalized position coordinates (X, Y, T). dense_tracks = subprocess.Popen( [DENSE_TRACK, infile], stdout=subprocess.PIPE) for lines in grouper(dense_tracks.stdout, NR_DESCRIPTORS): all_descs = np.vstack([ map(float, line.split()) for line in lines if line is not None] ).astype(np.float32) assert all_descs.shape[0] <= NR_DESCRIPTORS assert all_descs.shape[1] == LEN_LINE positions = all_descs[:, POS_IDXS] normalized_positions = all_descs[:, NORM_POS_IDXS] descriptors = { desc_type: all_descs[:, DESC_IDXS[desc_type]] for desc_type in descriptor_types} yield positions, normalized_positions, descriptors
def __iter__(self): if self.chunksize: for chunk in utils.grouper(self.corpus, self.chunksize): for transformed in self.obj.__getitem__(chunk, chunksize=None): yield transformed else: for doc in self.corpus: yield self.obj[doc]
def test_epoch_in_batches(self, batch_size): test_list = list(range(len(self.og['test']['images']))) np.random.shuffle(test_list) for batch_i in grouper(test_list, batch_size): batch = [(self.read_preprocess(self.og['test']['images'][i]), self.og['test']['labels'][i]) for i in batch_i if i is not None] yield zip(*batch)
def train(self, sentences, total_words=None, word_count=0, chunksize=100): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of utf8 strings. """ logger.info("training model with %i workers on %i vocabulary and %i features" % (self.workers, len(self.vocab), self.layer1_size)) if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") start, next_report = time.time(), [1.0] word_count, total_words = [word_count], total_words or sum(v.count for v in self.vocab.itervalues()) jobs = Queue(maxsize=2 * self.workers) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( lock = threading.Lock() # for shared state (=number of words trained so far, log reports...) def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # each thread must have its own work memory while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) # how many words did we train on? out-of-vocabulary (unknown) words do not count job_words = sum(train_sentence(self, sentence, alpha, work) for sentence in job) with lock: word_count[0] += job_words elapsed = time.time() - start if elapsed >= next_report[0]: logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)) next_report[0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences) for job_no, job in enumerate(utils.grouper(no_oov, chunksize)): logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(self.workers): jobs.put(None) # give the workers heads up that they can finish -- no more work! for thread in workers: thread.join() elapsed = time.time() - start logger.info("training on %i words took %.1fs, %.0f words/s" % (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0)) return word_count[0]
def test_epoch_in_batches(self, batch_size): test_list = list(range(len(self.og.test.images))) np.random.shuffle(test_list) for batch_i in grouper(test_list, batch_size): batch = [(np.reshape( resize(np.reshape(self.og.test.images[i], (28, 28)), (32, 32), mode='constant'), (1024, )), self.og.test.labels[i]) for i in batch_i if i is not None] yield zip(*batch)
def import_json(): for g in grouper(1000,sys.stdin): try: Model.database.bulk_save([json.loads(l) for l in g if l]) except BulkSaveError as err: if any(d['error']!='conflict' for d in err.errors): raise else: logging.warn("conflicts for %r",[d['id'] for d in err.errors])
def reduce(key, values): # The reduce() function must be static, so we manually create a "cls" # variable instead of changing the function into a classmethod. cls = PopulateFirebaseAccountsOneOffJob if key == cls.POPULATED_KEY: yield (cls.AUDIT_KEY, len(values)) return elif key in (cls.SUPER_ADMIN_ACK, cls.SYSTEM_COMMITTER_ACK): yield (key, values) return # NOTE: This is only sorted to make unit testing easier. user_fields = sorted(ast.literal_eval(v) for v in values) user_records = [ firebase_auth.ImportUserRecord( uid=auth_id, email=email, email_verified=True, custom_claims=('{"role":"%s"}' % feconf.FIREBASE_ROLE_SUPER_ADMIN if user_is_super_admin else None)) for auth_id, _, email, user_is_super_admin in user_fields ] # The Firebase Admin SDK places a hard-limit on the number of users that # can be "imported" in a single call. To compensate, we break up the # users into chunks. offsets = python_utils.RANGE( 0, len(user_records), cls.MAX_USERS_FIREBASE_CAN_IMPORT_PER_CALL) results = (cls.populate_firebase( [r for r in record_group if r is not None]) for record_group in utils.grouper( user_records, cls.MAX_USERS_FIREBASE_CAN_IMPORT_PER_CALL)) assocs_to_create = [] for offset, (result, exception) in python_utils.ZIP(offsets, results): if exception is not None: yield (cls.ERROR_KEY, repr(exception)) else: successful_indices = set( python_utils.RANGE(result.success_count + result.failure_count)) for error in result.errors: successful_indices.remove(error.index) debug_info = 'Import user_id=%r failed: %s' % ( user_fields[offset + error.index][1], error.reason) yield (cls.ERROR_KEY, debug_info) assocs_to_create.extend( auth_domain.AuthIdUserIdPair(*user_fields[offset + i][:2]) for i in successful_indices) if assocs_to_create: firebase_auth_services.associate_multi_auth_ids_with_user_ids( assocs_to_create) yield (cls.SUCCESS_KEY, len(assocs_to_create))
def row_batch_iter(rows, min_size, n): if cfg.group_length: rows.sort(key=lambda row: len(row[0])) csv_batches = list(utils.grouper(cfg.batch_size, rows, None)) random.shuffle(csv_batches) for i in range(n): for batch in csv_batches: if is_batch_valid(batch): yield pack(batch, min_size)
def read_slr(fh): stats = fh.readline() seqs = [] for l in utils.grouper(fh, 2): name = l[0].rstrip() seq = l[1].rstrip() seqs.append(SeqRecord(id=name, seq=Seq(seq), description="")) return seqs
def retrieve_nodes_given_sentences(out_fname, batch_size, all_input_sentences, glosses_bnids, glosses_feats, topk): """ out_fname(str): Output file to write retrieved node ids to. batch_size(int): Batch size for Sentence BERT. all_input_sentences(list[str]): All input sentences loaded from `input_file`. glosses_bnids(list[str]): All gloss BNids loaded from `args.glosses_bnids`. Aligned with `glosses_feats`. glosses_feats(numpy.array): Numpy array with VisualSem gloss features computed with Sentence BERT. topk(int): Number of nodes to retrieve for each input sentence. """ if os.path.isfile(out_fname): raise Exception( "File already exists: '%s'. Please remove it manually to avoid tampering." % out_fname) n_examples = len(all_input_sentences) print("Number of input examples to extract BNIDs for: ", n_examples) model = SentenceTransformer('distiluse-base-multilingual-cased') with open(out_fname, 'w', encoding='utf8') as fh_out: ranks_predicted = [] for idxs_ in grouper(batch_size, range(n_examples)): idxs = [] queries = [] for i in idxs_: if not i is None: idxs.append(i) queries.append(all_input_sentences[i]) queries_embs = model.encode(queries, convert_to_tensor=True) queries_embs = queries_embs.cuda() scores = util.pytorch_cos_sim(queries_embs, glosses_feats) scores = scores.cpu().numpy() ranks = numpy.argsort( scores) # sort scores by cosine similarity (low to high) ranks = ranks[:, ::-1] # sort by cosine similarity (high to low) for rank_idx in range(len(idxs[:ranks.shape[0]])): bnids_predicted = [] for rank_predicted in range(topk * 10): bnid_pred = glosses_bnids[ranks[rank_idx, rank_predicted]] bnid_pred_score = scores[rank_idx, ranks[rank_idx, rank_predicted]] if not bnid_pred in bnids_predicted: bnids_predicted.append((bnid_pred, bnid_pred_score)) if len(bnids_predicted) >= topk: break # write top-k predicted BNids for iii, (bnid, score) in enumerate(bnids_predicted[:topk]): fh_out.write(bnid + "\t" + "%.4f" % score) if iii < topk - 1: fh_out.write("\t") else: # iii == topk-1 fh_out.write("\n")
def __init__(self, horn_pointing=False, siamfile=None): self.horn_pointing = horn_pointing if siamfile is None: siamfile = private.siam f = open(siamfile) lines = f.readlines() self.siam = {} for line in grouper(4,lines[1:]): chtag = line[0].split()[0] m = np.array(np.matrix(';'.join(line[1:]))) self.siam[chtag] = m
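# A hedged sketch of the SIAM file layout assumed by the loop above: after one header
# line, each channel occupies four lines (a channel tag line followed by three rows of a
# 3x3 matrix, which np.matrix parses from the ';'-joined text), e.g.:
#
#   <channel tag> ...
#   m11 m12 m13
#   m21 m22 m23
#   m31 m32 m33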
def buffered_read(self, fnames): '''Read packed batches from data with each batch having lines of similar lengths''' for line_collection in self.buffered_read_sorted_lines(fnames): batches = [ b for b in utils.grouper(cfg.batch_size, line_collection) ] random.shuffle(batches) for batch in batches: ret = self.pack(batch) if ret is not None: yield ret
def import_old_json(): for g in grouper(1000,sys.stdin): docs = [json.loads(l) for l in g if l] for d in docs: del d['doc_type'] for k,v in d.iteritems(): if k[-2:]=='id' or k in ('rtt','rtu'): d[k]=v[1:] for field in ['ats','fols','frs']: if field in d and isinstance(d[field],list): d[field] = [u[1:] for u in d[field]] Model.database.bulk_save(docs)
def join(paths, output_path, batch_size=100): ''' Stitch a bunch of chunks into a single file ''' incomplete_output_path = f'{output_path}.incomplete' with open(incomplete_output_path, 'wt') as output_file: try: # Concatenate a batch of files at a time, in case the file list is too long for batch in grouper(paths, batch_size): subprocess.check_call(['cat'] + batch, stdout=output_file, stderr=subprocess.PIPE) except subprocess.CalledProcessError: raise RuntimeError(f'Unable to join files into {output_path}') os.rename(incomplete_output_path, output_path)
def xfory(price_info, units):
    """Apply an "X for Y" group discount: every full group of X units is charged as
    if it were only Y units. Returns the resulting average price per unit."""
    total = 0
    x = price_info.get('x')
    y = price_info.get('y')
    price = price_info.get('unitPrice')
    for group in grouper(x, range(0, units)):
        has_discount = len(group) == x
        per_unit = price if not has_discount else y / x * price
        total = total + (per_unit * len(group))
    return total / units
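# A hedged worked example of the pricing above, assuming this project's grouper yields a
# shorter final group for the remainder rather than padding it (price_info values are
# hypothetical):
#   xfory({'x': 3, 'y': 2, 'unitPrice': 100}, 4)
#     full group of 3 units charged as 2 units: 3 * (2 / 3 * 100) = 200
#     leftover group of 1 unit at full price:                       100
#     average per unit: (200 + 100) / 4 = 75.0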
def test(net, img, hyperparams):
    """
    Test a model on a specific image
    """
    net.eval()
    patch_size = hyperparams['patch_size']
    center_pixel = hyperparams['center_pixel']
    batch_size, device = hyperparams['batch_size'], hyperparams['device']
    n_classes = hyperparams['n_classes']

    kwargs = {
        'step': hyperparams['test_stride'],
        'window_size': (patch_size, patch_size)
    }
    probs = np.zeros(img.shape[:2] + (n_classes, ))

    iterations = count_sliding_window(img, **kwargs) // batch_size
    for batch in tqdm(grouper(batch_size, sliding_window(img, **kwargs)),
                      total=(iterations),
                      desc="Inference on the image"):
        with torch.no_grad():
            if patch_size == 1:
                data = [b[0][0, 0] for b in batch]
                data = np.copy(data)
                data = torch.from_numpy(data)
            else:
                data = [b[0] for b in batch]
                data = np.copy(data)
                data = data.transpose(0, 3, 1, 2)
                data = torch.from_numpy(data)
            # data = data.unsqueeze(1)  # enable when using 3D convolutions
            indices = [b[1:] for b in batch]
            data = data.to(device)
            output = net(data)
            if isinstance(output, tuple):
                output = output[0]
            output = output.to('cpu')  # change 'cpu' to 'cuda' if needed
            if patch_size == 1 or center_pixel:
                output = output.numpy()
            else:
                output = np.transpose(output.numpy(), (0, 2, 3, 1))
            for (x, y, w, h), out in zip(indices, output):
                if center_pixel:
                    # probs[x, y] += out
                    probs[x + w // 2, y + h // 2] += out
                    # probs[x:x + w, y:y + h] += out
                else:
                    probs[x:x + w, y:y + h] += out
    return probs
def test(net, img, hyperparams): """ Test a model on a specific image """ net.eval() patch_size = hyperparams["patch_size"] center_pixel = hyperparams["center_pixel"] batch_size, device = hyperparams["batch_size"], hyperparams["device"] n_classes = hyperparams["n_classes"] kwargs = { "step": hyperparams["test_stride"], "window_size": (patch_size, patch_size), } probs = np.zeros(img.shape[:2] + (n_classes,)) iterations = count_sliding_window(img, **kwargs) // batch_size for batch in tqdm( grouper(batch_size, sliding_window(img, **kwargs)), total=(iterations), desc="Inference on the image", ): with torch.no_grad(): if patch_size == 1: data = [b[0][0, 0] for b in batch] data = np.copy(data) data = torch.from_numpy(data) else: data = [b[0] for b in batch] data = np.copy(data) data = data.transpose(0, 3, 1, 2) data = torch.from_numpy(data) data = data.unsqueeze(1) indices = [b[1:] for b in batch] data = data.to(device) output = net(data) if isinstance(output, tuple): output = output[0] output = output.to("cpu") if patch_size == 1 or center_pixel: output = output.numpy() else: output = np.transpose(output.numpy(), (0, 2, 3, 1)) for (x, y, w, h), out in zip(indices, output): if center_pixel: probs[x + w // 2, y + h // 2] += out else: probs[x : x + w, y : y + h] += out return probs
def insert_blat_hits_into_db(self, blat_output, hits_per_chunk=50000): """ Insert peptide hits from BLAT into db. """ for values in grouper(hits_per_chunk, blat_output): self.db.executemany( "INSERT INTO mappings VALUES (?, ?, ?, ?, ?, ?)", values) self.db.execute( "INSERT INTO peptides (peptide) SELECT DISTINCT peptide FROM mappings" ) self.db.execute( "CREATE INDEX i_peptides_disc ON peptides(discriminative_taxid)") self.db.execute("CREATE INDEX i_mappings_targets ON mappings(target)") self.db.commit()
def import_fasta(fasta_file, tfhost, tfpath): tfserver = 'http://{}{}'.format(tfhost, tfpath) seqiter = read_sequences(fasta_file) for batch in grouper(seqiter): ids, seqs = zip(*batch) preds = infer_batch(seqs, tfserver) for i, s, p in zip(ids, seqs, preds): p['id'] = i p['seq'] = s print(p) return
def test(net, img, args): """ Test a model on a specific image """ net.eval() patch_size = args.patch_size center_pixel = args.center_pixel batch_size, device = args.batch_size, torch.device(args.device) n_classes = args.n_classes kwargs = { 'step': args.test_stride, 'window_size': (patch_size, patch_size) } probs = np.zeros(img.shape[:2] + (n_classes, )) iterations = utils.count_sliding_window(img, **kwargs) // batch_size for batch in tqdm(utils.grouper(batch_size, utils.sliding_window(img, **kwargs)), total=(iterations), desc="Inference on the image"): with torch.no_grad(): if patch_size == 1: data = [b[0][0, 0] for b in batch] data = np.copy(data) data = torch.from_numpy(data) else: data = [b[0] for b in batch] data = np.copy(data) data = data.transpose(0, 3, 1, 2) data = torch.from_numpy(data) data = data.unsqueeze(1) indices = [b[1:] for b in batch] data = data.to(device) output = net(data) if isinstance(output, tuple): output = output[0] output = output.to('cpu') if patch_size == 1 or center_pixel: output = output.numpy() else: output = np.transpose(output.numpy(), (0, 2, 3, 1)) for (x, y, w, h), out in zip(indices, output): if center_pixel: probs[x + w // 2, y + h // 2] += out else: probs[x:x + w, y:y + h] += out return probs
def set_item_candidates(self, n_user, n_item, train_data, eval_data, path_list_dict): """Construct the sampling distrbiutions for negative/pseudo-labelled instances for each user """ all_users = tuple(set(train_data[:, 0])) self.all_users = all_users self.n_item = n_item self.all_items = set(range(n_item)) self.neg_c_dict_user = self._build_freq_dict( np.concatenate([train_data[:, 0], eval_data[:, 0]]), self.all_users) self.neg_c_dict_item = self._build_freq_dict( np.concatenate([train_data[:, 1], eval_data[:, 1]]), self.all_items) item_cands = tuple(self.neg_c_dict_item.keys()) F = np.array(tuple( self.neg_c_dict_item.values()))**self.cfg.plabel.neg_pn sort_inds = np.argsort(F) item_cands = [item_cands[i] for i in sort_inds] F = F[sort_inds] F = (F / F.sum()).cumsum() self.item_freq = (item_cands, F) for u, i in tqdm(train_data[:, 0:2]): self.user_seed_dict[u].add(i) path = hydra.utils.to_absolute_path(self.cfg.reachable_items_path) logger.info("calculating reachable items for users") self._setup_dst_dict(path_list_dict) item_dist_dict = {} src_itr = map( lambda iu: ( all_users[iu], tuple(self.user_seed_dict[all_users[iu]]), self.dst_dict, self.neg_c_dict_item, self.cfg.plabel.pl_pn, ), range(len(all_users)), ) grouped = grouper(self.cfg.plabel.chunk_size, src_itr, squash=set([2, 3])) with mp.Pool(self.cfg.plabel.par) as pool: for idd in pool.imap_unordered(compute_reachable_items_, grouped): item_dist_dict.update(idd) self.item_dist_dict = item_dist_dict
def command_service(self, rawCommand): """ Parse raw input and execute specified function with args :param rawCommand: csv string from Matlab/Simulink of the form: 'command, namedArg1, arg1, namedArg2, arg2, ..., namedArgN, argN' :return: the command and arguments as a dictionary """ pack = [x.strip() for x in split('[,()]*', rawCommand.strip())] raw_cmd = pack[0] argDict = {key: literal_eval(value) for key, value in utils.grouper(pack[1:], 2)} cmd = self.mapInterface.commands[raw_cmd] ret = cmd(**argDict) logger.info("Command '{}' run with args {}".format(raw_cmd, argDict)) return raw_cmd, ret
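# A small runnable sketch of the key/value pairing used above; the token list and
# argument names are hypothetical, and grouper is assumed to pair items two at a time
# with the iterable as its first argument (matching the call grouper(pack[1:], 2)).
from ast import literal_eval
from itertools import zip_longest

pack = ['moveTo', 'x', '1.5', 'y', '-2', 'speed', '10']  # hypothetical pre-split command
pairs = zip_longest(*[iter(pack[1:])] * 2)               # equivalent of grouper(pack[1:], 2)
argDict = {key: literal_eval(value) for key, value in pairs}
assert argDict == {'x': 1.5, 'y': -2, 'speed': 10}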
def train(self, triples, total_triples=None, triples_count=0, chunksize=1000):
    if not self.vocab or not self.vocab_rel:
        raise RuntimeError("you must first build entity and relation vocabulary before training the model")
    start, next_report = time.time(), [1.0]
    triples_count = [triples_count]
    total_triples = total_triples or int(sum(1 for v in triples))
    jobs = Queue(maxsize=2 * self.workers)
    lock = threading.Lock()

    def worker_train():
        work = zeros(self.layer1_size, dtype=REAL)
        detR = zeros((self.layer1_size, self.layer1_size), dtype=REAL)
        # neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
        while True:
            job = jobs.get()
            if job is None:
                break
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * triples_count[0] / total_triples))
            job_triples = self._get_job_triples(alpha, job, work, detR)
            with lock:
                triples_count[0] += job_triples
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.info("PROGRESS: at %.2f%% triples, alpha %.05f, %.0f triples/s" %
                                (100.0 * triples_count[0] / total_triples, alpha,
                                 triples_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input triples into job-sized chunks and start filling the jobs queue
    for job_no, job in enumerate(utils.grouper(self._prepare_triples(triples), chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!
    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i triples took %.1fs, %.0f triples/s" %
                (triples_count[0], elapsed, triples_count[0] / elapsed if elapsed else 0.0))
    self.syn0norm = None
    return triples_count[0]
def split_batch_by_box_num(batches, box_batch_size): batchIdxs, batch_datas = batches newdata = [] num_gpu = len( batch_datas ) # each is a Dataset instance, d.data['img'] is a one item list num_boxes = [ batch_datas[i].data['gt'][0]['boxes'].shape[0] for i in xrange(num_gpu) ] max_num_box = max(num_boxes) min_num_box = min(num_boxes) split_into_num_batch = int(math.ceil(max_num_box / float(box_batch_size))) # the indexes for each inner batch # the batch with not enough will fill with 0, the first box each_batch_selected_indexes = [ grouper(range(num_boxes[i]), box_batch_size, fillvalue=0) for i in xrange(num_gpu) ] # still need to handle some batch has not enough batch t2 = [] for b in each_batch_selected_indexes: if len(b) < split_into_num_batch: need = split_into_num_batch - len(b) b = b + [[0 for _ in xrange(box_batch_size)] for _ in xrange(need)] t2.append(b) for i in xrange(split_into_num_batch): this_datas = [] for j in xrange(num_gpu): selected = each_batch_selected_indexes[j][i] temp = { "imgs": [batch_datas[j].data['imgs'][0]], "imgdata": [batch_datas[j].data['imgdata'][0]], "resized_image": [batch_datas[j].data['resized_image'][0]], 'gt': [{ "boxes": batch_datas[j].data['gt'][0]['boxes'][selected, :], #"labels": batch_datas[j].data['gt'][0]['labels'][selected], }], } this_datas.append(temp) newdata.append( (batchIdxs, [Dataset(this_data) for this_data in this_datas])) return newdata
def build_map(left_edge: Tile, top_edge: Tile) -> list[list[Tile]]: rows = [top_edge] for row_index, row in enumerate(left_edge[1:], 1): rows.append([row]) prev_row = rows[row_index - 1] current: Tile = row for above, next_above in grouper(prev_row, 2): current = next(i for i in current.connections if i != above and any(j == next_above for j in i.connections)) rows[row_index].append(current) return rows
def main(reset=True): """ Get a database and table Generate rows Insert chunks """ db = dataset.connect(DATABASE_URL) if reset: db[TABLE_NAME].drop() table = db[TABLE_NAME] rows = generate_rows(*FILES) for group in grouper(rows, 1000, None): group = ifilter(bool, group) table.insert_many(group, types=TYPES)
def decodeNetwork(encoding): layers = [] for layer_tuple in utils.grouper(4, encoding): filter_widths = [1, 3, 5, 7] filter_heights = [1, 3, 5, 7] num_filters = [24, 36, 48, 64] strides = [1, 2, 3, 1] filter_widths_i, filter_heights_i, num_filters_i, strides_i = layer_tuple filter_width = filter_widths[filter_widths_i] filter_height = filter_heights[filter_heights_i] num_filter = num_filters[num_filters_i] stride = strides[strides_i] layers.append(ConvolutionalLayer(kernel_size=[filter_width, filter_height], stride=[stride, stride], num_filters=num_filter)) return layers
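# A hedged usage sketch of decodeNetwork above: each group of four indices selects
# (filter width, filter height, number of filters, stride) for one convolutional layer,
# assuming utils.grouper(4, encoding) yields consecutive groups of four.
encoding = [0, 1, 2, 3,   # -> 1x3 kernel, 48 filters, stride 1
            3, 3, 3, 0]   # -> 7x7 kernel, 64 filters, stride 1
layers = decodeNetwork(encoding)
assert len(layers) == 2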
def read_and_translate(translator: inference.Translator,
                       output_handler: output_handler.OutputHandler,
                       chunk_size: Optional[int],
                       source: Optional[str] = None,
                       reference: Optional[str] = None,
                       dictionary: Optional[dict] = None) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param source: Path to file which will be translated line-by-line if included, if none use stdin.
    :param reference: Path to reference file.
    :param dictionary: dictionary to constrain translation.
    """
    source_data = sys.stdin if source is None else data_io.smart_open(source)
    reference_data = None if reference is None else data_io.smart_open(reference)

    batch_size = translator.batch_size
    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning("You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                           "a degradation of translation speed. Consider choosing a larger chunk size."
                           % (chunk_size, batch_size))

    logger.info("Translating...")

    total_time, total_lines = 0.0, 0
    for chunk, reference_chunk in itertools.zip_longest(
            grouper(source_data, chunk_size),
            grouper(reference_data, chunk_size) if reference_data is not None else [None]):
        chunk_time = translate(output_handler, chunk, translator, total_lines, reference_chunk)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
def get(self): if not self.is_running(): self.start() try: while self.is_running(): if self.cur_batch_count == self.dataset.num_batches: self._stop() return samples = [] for i in range(self.dataset.batch_size): # first get got the ApplyResult object, # then second get to get the actual thing (block till get) sample = self.queue.get(block=True).get() self.queue.task_done() samples.append(sample) # break the mini-batch into mini-batches for multi-gpu if self.is_multi_gpu: # a list of [frames, boxes, labels_arr, ori_boxes, box_keys] batches = [] this_batch_idxs = range(len(samples)) # pack these batches for each gpu this_batch_idxs_gpus = utils.grouper( this_batch_idxs, self.dataset.batch_size_per_gpu) batches = [] for this_batch_idxs_per_gpu in this_batch_idxs_gpus: batches.append(self.dataset.collect_batch( samples, this_batch_idxs_per_gpu)) batch = batches else: batch = self.dataset.collect_batch(samples) self.cur_batch_count += 1 yield batch except Exception as e: # pylint: disable=broad-except self._stop() _type, _value, _traceback = sys.exc_info() print("Exception in enqueuer.get: %s" % e) traceback.print_tb(_traceback) raise Exception
def fetch_edges(): Edges.database = connect("houtx_edges") User.database = connect("away_user") old_edges = set(int(row['id']) for row in Edges.database.paged_view("_all_docs",endkey="_")) uids = set(_users_from_scores())-old_edges settings.pdb() for g in grouper(100,uids): for user in twitter.user_lookup(g): if user is None or user.protected: continue try: edges = twitter.get_edges(user._id) except restkit.errors.Unauthorized: logging.warn("unauthorized!") continue except restkit.errors.ResourceNotFound: logging.warn("resource not found!?") continue edges.save() user.save() sleep_if_needed()
def compute_descriptors(infile, descriptor_type): """Reads low-level descriptors from DenseTracks.""" LEN_LINE = 436 POS_IDXS = [1, 2, 0] # Positional coordinates (X, Y, T). dense_tracks = subprocess.Popen( ['./DenseTrack', infile], stdout=subprocess.PIPE) descriptor_idxs = DESC_IDXS[descriptor_type] for lines in grouper(dense_tracks.stdout, NR_DESCRIPTORS): all_descs = np.vstack([ map(float, line.split()) for line in lines if line is not None] ).astype(np.float32) assert all_descs.shape[0] <= NR_DESCRIPTORS assert all_descs.shape[1] == LEN_LINE yield all_descs[:, POS_IDXS], all_descs[:, descriptor_idxs]
def main(): parser = argparse.ArgumentParser() parser.add_argument('-num_hidden_units', type=int, default=1024) parser.add_argument('-num_hidden_layers', type=int, default=3) parser.add_argument('-dropout', type=float, default=0.5) parser.add_argument('-activation', type=str, default='tanh') parser.add_argument('-language_only', type=bool, default= False) parser.add_argument('-num_epochs', type=int, default=100) parser.add_argument('-model_save_interval', type=int, default=10) parser.add_argument('-batch_size', type=int, default=128) parser.add_argument('-word_vector', type=str, default='') args = parser.parse_args() questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines() answers_train = open('../data/preprocessed/answers_train2014_modal.txt', 'r').read().decode('utf8').splitlines() images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines() vgg_model_path = '../features/coco/vgg_feats.mat' maxAnswers = 1000 questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, maxAnswers) #encode the remaining answers labelencoder = preprocessing.LabelEncoder() labelencoder.fit(answers_train) nb_classes = len(list(labelencoder.classes_)) joblib.dump(labelencoder,'../models/labelencoder.pkl') features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print 'loaded vgg features' image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines() id_map = {} for ids in image_ids: id_split = ids.split() id_map[id_split[0]] = int(id_split[1]) # Code to choose the word vectors, default is Goldberg but GLOVE is preferred if args.word_vector == 'glove': nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') else: nlp = English() print 'loaded ' + args.word_vector + ' word2vec features...' img_dim = 4096 word_vec_dim = 300 model = Sequential() if args.language_only: model.add(Dense(args.num_hidden_units, input_dim=word_vec_dim, init='uniform')) else: model.add(Dense(args.num_hidden_units, input_dim=img_dim+word_vec_dim, init='uniform')) model.add(Activation(args.activation)) if args.dropout>0: model.add(Dropout(args.dropout)) for i in xrange(args.num_hidden_layers-1): model.add(Dense(args.num_hidden_units, init='uniform')) model.add(Activation(args.activation)) if args.dropout>0: model.add(Dropout(args.dropout)) model.add(Dense(nb_classes, init='uniform')) model.add(Activation('softmax')) json_string = model.to_json() if args.language_only: model_file_name = '../models/mlp_language_only_num_hidden_units_' + str(args.num_hidden_units) + '_num_hidden_layers_' + str(args.num_hidden_layers) else: model_file_name = '../models/mlp_num_hidden_units_' + str(args.num_hidden_units) + '_num_hidden_layers_' + str(args.num_hidden_layers) open(model_file_name + '.json', 'w').write(json_string) print 'Compiling model...' model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Compilation done...' print 'Training started...' 
for k in xrange(args.num_epochs): #shuffle the data points before going through them index_shuf = range(len(questions_train)) shuffle(index_shuf) questions_train = [questions_train[i] for i in index_shuf] answers_train = [answers_train[i] for i in index_shuf] images_train = [images_train[i] for i in index_shuf] progbar = generic_utils.Progbar(len(questions_train)) for qu_batch,an_batch,im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[-1]), grouper(answers_train, args.batch_size, fillvalue=answers_train[-1]), grouper(images_train, args.batch_size, fillvalue=images_train[-1])): X_q_batch = get_questions_matrix_sum(qu_batch, nlp) if args.language_only: X_batch = X_q_batch else: X_i_batch = get_images_matrix(im_batch, id_map, VGGfeatures) X_batch = np.hstack((X_q_batch, X_i_batch)) Y_batch = get_answers_matrix(an_batch, labelencoder) loss = model.train_on_batch(X_batch, Y_batch) # fix for the Keras v0.3 issue #9 progbar.add(args.batch_size, values=[("train loss", loss[0])]) #print type(loss) if k%args.model_save_interval == 0: model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k)) model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))
def main(): parser = argparse.ArgumentParser() parser.add_argument('-num_hidden_units_mlp', type=int, default=1024) parser.add_argument('-num_hidden_units_lstm', type=int, default=512) parser.add_argument('-num_hidden_layers_mlp', type=int, default=3) parser.add_argument('-num_hidden_layers_lstm', type=int, default=1) parser.add_argument('-dropout', type=float, default=0.5) parser.add_argument('-activation_mlp', type=str, default='tanh') parser.add_argument('-num_epochs', type=int, default=100) parser.add_argument('-model_save_interval', type=int, default=5) parser.add_argument('-batch_size', type=int, default=128) parser.add_argument('-word_vector', type=str, default='') #TODO Feature parser.add_argument('-resume_training', type=str) #TODO Feature parser.add_argument('-language_only', type=bool, default= False) args = parser.parse_args() word_vec_dim= 300 img_dim = 4096 max_len = 30 nb_classes = 1000 #get the data questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_train = open('../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines() answers_train = open('../data/preprocessed/answers_train2014_modal.txt', 'r').read().decode('utf8').splitlines() images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines() vgg_model_path = '../features/coco/vgg_feats.mat' max_answers = nb_classes questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, max_answers) questions_lengths_train, questions_train, answers_train, images_train = (list(t) for t in zip(*sorted(zip(questions_lengths_train, questions_train, answers_train, images_train)))) #encode the remaining answers labelencoder = preprocessing.LabelEncoder() labelencoder.fit(answers_train) nb_classes = len(list(labelencoder.classes_)) joblib.dump(labelencoder,'../models/labelencoder.pkl') image_model = Sequential() image_model.add(Reshape(input_shape = (img_dim,), dims=(img_dim,))) language_model = Sequential() if args.num_hidden_layers_lstm == 1: language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False, input_shape=(max_len, word_vec_dim))) else: language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True, input_shape=(max_len, word_vec_dim))) for i in xrange(args.num_hidden_layers_lstm-2): language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=True)) language_model.add(LSTM(output_dim = args.num_hidden_units_lstm, return_sequences=False)) model = Sequential() model.add(Merge([language_model, image_model], mode='concat', concat_axis=1)) for i in xrange(args.num_hidden_layers_mlp): model.add(Dense(args.num_hidden_units_mlp, init='uniform')) model.add(Activation(args.activation_mlp)) model.add(Dropout(args.dropout)) model.add(Dense(nb_classes)) model.add(Activation('softmax')) json_string = model.to_json() model_file_name = '../models/lstm_1_num_hidden_units_lstm_' + str(args.num_hidden_units_lstm) + \ '_num_hidden_units_mlp_' + str(args.num_hidden_units_mlp) + '_num_hidden_layers_mlp_' + \ str(args.num_hidden_layers_mlp) + '_num_hidden_layers_lstm_' + str(args.num_hidden_layers_lstm) open(model_file_name + '.json', 'w').write(json_string) model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Compilation done' features_struct = scipy.io.loadmat(vgg_model_path) VGGfeatures = features_struct['feats'] print 'loaded vgg features' image_ids 
= open('../features/coco_vgg_IDMap.txt').read().splitlines() img_map = {} for ids in image_ids: id_split = ids.split() img_map[id_split[0]] = int(id_split[1]) # Code to choose the word vectors, default is Goldberg but GLOVE is preferred if args.word_vector == 'glove': nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') else: nlp = English() print 'loaded ' + args.word_vector + ' word2vec features...' ## training print 'Training started...' for k in xrange(args.num_epochs): progbar = generic_utils.Progbar(len(questions_train)) for qu_batch,an_batch,im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[-1]), grouper(answers_train, args.batch_size, fillvalue=answers_train[-1]), grouper(images_train, args.batch_size, fillvalue=images_train[-1])): timesteps = len(nlp(qu_batch[-1])) #questions sorted in descending order of length X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps) X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures) Y_batch = get_answers_matrix(an_batch, labelencoder) loss = model.train_on_batch([X_q_batch, X_i_batch], Y_batch) # fix for the Keras v0.3 issue #9 progbar.add(args.batch_size, values=[("train loss", loss[0])]) if k%args.model_save_interval == 0: model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k)) model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))
diff += 1.0 if cat_id_gold in nearest else 0.0 print nearest, cat_id_gold confusion_mtx.setdefault(cat_id_gold, {}) confusion_mtx[cat_id_gold].setdefault(nearest[0], 0) confusion_mtx[cat_id_gold][nearest[0]] += 1 qout.put(diff) jobs = Queue(maxsize=50) qout = Queue(maxsize=20000) threads = [Thread(target=worker_infer) for _ in xrange(args.thread)] sent_num = 0 for t in threads: t.daemon = True t.start() for job_no, job in enumerate(utils.grouper(prepare_sentences(), 100)): logger.info("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) sent_num += len(job) logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(args.thread): jobs.put(None) for t in threads: t.join() avg = 0.0 while not qout.empty(): val = qout.get() avg += val avg /= sent_num
def main(): client = InfluxDBClient(host=args.influxdb_host, ssl=args.ssl, verify_ssl=False, port=8086, database=args.database) logger = configure_logging('parse_operations') with open(args.input_file, 'r', encoding="latin-1") as f: line_count = 0 for chunk in grouper(f, args.batch_size): json_points = [] for line in chunk: # zip_longest will backfill any missing values with None, so we need to handle this, otherwise we'll miss the last batch line_count += 1 if line and line.strip().endswith("ms"): values = {} tags = { 'project': args.project, 'hostname': args.hostname, } try: tags['operation'] = line.split("] ", 1)[1].split()[0] except IndexError as e: logger.error("Unable to get operation type - {} - {}".format(e, line)) break if tags['operation'] in ['command', 'query', 'getmore', 'insert', 'update', 'remove', 'aggregate', 'mapreduce']: thread = line.split("[", 1)[1].split("]")[0] # Alternately - print(split_line[3]) if tags['operation'] == 'command': tags['command'] = line.split("command: ")[1].split()[0] if "conn" in thread: tags['connection_id'] = thread split_line = line.split() values['duration_in_milliseconds'] = int(split_line[-1].rstrip('ms')) # TODO 2.4.x timestamps have spaces timestamp = parse(split_line[0]) if split_line[1].startswith("["): # TODO - Parse locks from 2.6 style loglines # 2.4 Logline: tags['namespace'] = split_line[3] for stat in reversed(split_line): if "ms" in stat: pass elif ":" in stat: key, value = stat.split(":", 1) values[key] = int(value) elif stat == "locks(micros)": pass else: break else: # 3.x logline: tags['namespace'] = split_line[5] # TODO - Should we be splitting on "locks:{" instead? pre_locks, locks = line.rsplit("locks:", 1) # Strip duration from locks locks = locks.rsplit(" ", 1)[0] # Add quotation marks around string, so that it is valid JSON locks = re.sub(r"(\w+):", "\"\g<1>\":", locks) locks_document = flatdict.FlatDict(json.loads(locks), delimiter="_") for key, value in locks_document.iteritems(): values["locks_{}".format(key)] = int(value) # We work backwards from the end, until we run out of key:value pairs # TODO - Can we assume these are always integers? for stat in reversed(pre_locks.split()): if ":" in stat: key, value = stat.split(":", 1) values[key] = int(value) else: break # TODO - Parse the full query plan for IXSCAN if 'planSummary: ' in line: tags['plan_summary'] = (line.split('planSummary: ', 1)[1].split()[0]) json_points.append(create_point(timestamp, "operations", values, tags)) else: logger.info("'{}' is not a recognised operation type - not parsing this line ({})".format(tags['operation'], line)) if json_points: # TODO - We shouldn't need to wrap this in try/except - should be handled by retry decorator try: # TODO - Have a dry-run mode write_points(logger, client, json_points, line_count) pass except Exception as e: logger.error("Retries exceeded. Giving up on this point.")
try: with BZ2File(args.dumpfile, 'r') as f: parser = parse_wiki.articles(f) skip = args.skip for i in range(skip): parser.next() time_preproc = 0 time_iserv = 0 last_time = time() articles_count = 0 this_round_count = 0 processed_articles = skip for docgroup in grouper(args.round, parser): t1 = time() bdata = index_pb.BuilderData() round_tokens = set() processed = 0 for doc in docgroup: if not doc: break (title, ns, sha1, text) = doc if ns != '0': continue if not text: continue # wtf if text[:9].lower() == ('#redirect'): continue
#!/usr/bin/env python
import sys

import utils

if __name__ == "__main__":
    host = sys.argv[1]
    port = int(sys.argv[2])
    collection = sys.argv[3]
    chunk_size = 10000
    added = 0
    for lines in utils.grouper(sys.stdin, chunk_size):
        lines = [x for x in lines if x is not None]
        objects = [utils.parse_line(line) for line in lines]
        utils.index_objects(objects, host, port, collection)
        added += chunk_size
        print >>sys.stderr, added
def main(): parser = argparse.ArgumentParser() parser.add_argument('-num_hidden_units', type=int, default=512) parser.add_argument('-num_lstm_layers', type=int, default=2) parser.add_argument('-dropout', type=float, default=0.2) parser.add_argument('-activation', type=str, default='tanh') parser.add_argument('-num_epochs', type=int, default=100) parser.add_argument('-model_save_interval', type=int, default=5) parser.add_argument('-batch_size', type=int, default=128) parser.add_argument('-word_vector', type=str, default='') args = parser.parse_args() questions_train = open('../data/preprocessed/questions_train2014.txt', 'r').read().decode('utf8').splitlines() questions_lengths_train = open('../data/preprocessed/questions_lengths_train2014.txt', 'r').read().decode('utf8').splitlines() answers_train = open('../data/preprocessed/answers_train2014.txt', 'r').read().decode('utf8').splitlines() images_train = open('../data/preprocessed/images_train2014.txt', 'r').read().decode('utf8').splitlines() max_answers = 1000 questions_train, answers_train, images_train = selectFrequentAnswers(questions_train,answers_train,images_train, max_answers) print 'Loaded questions, sorting by length...' questions_lengths_train, questions_train, answers_train = (list(t) for t in zip(*sorted(zip(questions_lengths_train, questions_train, answers_train)))) #encode the remaining answers labelencoder = preprocessing.LabelEncoder() labelencoder.fit(answers_train) nb_classes = len(list(labelencoder.classes_)) joblib.dump(labelencoder,'../models/labelencoder.pkl') max_len = 30 #25 is max for training, 27 is max for validation word_vec_dim = 300 model = Sequential() model.add(LSTM(output_dim = args.num_hidden_units, activation='tanh', return_sequences=True, input_shape=(max_len, word_vec_dim))) model.add(Dropout(args.dropout)) model.add(LSTM(args.num_hidden_units, return_sequences=False)) model.add(Dense(nb_classes, init='uniform')) model.add(Activation('softmax')) json_string = model.to_json() model_file_name = '../models/lstm_language_only_num_hidden_units_' + str(args.num_hidden_units) + '_num_lstm_layers_' + str(args.num_lstm_layers) + '_dropout_' + str(args.dropout) open(model_file_name + '.json', 'w').write(json_string) print 'Compiling model...' model.compile(loss='categorical_crossentropy', optimizer='rmsprop') print 'Compilation done...' #set up word vectors # Code to choose the word vectors, default is Goldberg but GLOVE is preferred if args.word_vector == 'glove': nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') else: nlp = English() print 'loaded ' + args.word_vector + ' word2vec features...' ## training # Moved few variables to args.parser (num_epochs, batch_size, model_save_interval) print 'Training started...' 
for k in xrange(args.num_epochs): progbar = generic_utils.Progbar(len(questions_train)) for qu_batch,an_batch,im_batch in zip(grouper(questions_train, args.batch_size, fillvalue=questions_train[0]), grouper(answers_train, args.batch_size, fillvalue=answers_train[0]), grouper(images_train, args.batch_size, fillvalue=images_train[0])): timesteps = len(nlp(qu_batch[-1])) #questions sorted in descending order of length X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps) Y_batch = get_answers_matrix(an_batch, labelencoder) loss = model.train_on_batch(X_q_batch, Y_batch) # fix for the Keras v0.3 issue #9 progbar.add(args.batch_size, values=[("train loss", loss[0])]) if k%args.model_save_interval == 0: model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k)) model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k+1))
learnset_url += "{0}_(Pok%C3%A9mon)/Generation_I_learnset".format(pname) html_file = urllib2.urlopen(learnset_url) learnset_html = html_file.read() html_file.close() bs = BeautifulSoup(learnset_html) x = [td.text for td in bs.findAll("td") if 0 < len(td.text) < 60] # if td.text in movename_to_num.values()] worked pretty well, but... # Just grabbing everything that appears anywhere and is a valid move # name will grab Psychic, when those characters only appeared to indicate # the type of a move and not the Move Psychic # So instead, group them into clumps... it seems to group very consistently grouped = list(grouper(x, 6)) # Pikachu had a weird move: Light Screen, which he learns at Level 50 in # Pokemon Yellow, but never in Red/Blue. So just grabbing the values in the # table that are valid moves would actually lead us to believe that Pikachu # can learn Light Screen, which he can, but it doesn't have a TM until Gen # 3. Instead, let's group the entries, drop the ones from Pokemon Yellow, # and then grab the remaining moves not_yellow = [entry for entry in grouped if not entry[0].endswith("Y")] # fix a problem that Vaporean == 106 was having valid_starts = ("T", "H", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9") valid = [entry for entry in not_yellow if entry[0].startswith(valid_starts)] moves = [standardize(entry[1]) for entry in valid
def train(self, sentences, total_words=None, word_count=0, sent_count=0, chunksize=100): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. """ logger.info("training model with %i workers on %i sentences and %i features, " "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" % (self.workers, self.sents_len, self.layer1_size, self.sg, self.hs, self.sample, self.negative)) if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") start, next_report = time.time(), [1.0] word_count = [word_count] sent_count = [sent_count] total_words = total_words or sum(v.count * v.sample_probability for v in itervalues(self.vocab)) total_sents = self.total_sents #it's now different from self.sents_len jobs = Queue(maxsize=2 * self.workers) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( lock = threading.Lock() # for shared state (=number of words trained so far, log reports...) def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" work = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL) # each thread must have its own work memory neu1 = matutils.zeros_aligned(self.layer1_size + 8, dtype=REAL) while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job if self.update_mode == 0: alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) else: alpha = self.alpha job_words = sum(train_sent_vec(self, self.sents[sent_no], sentence, alpha, work, neu1, self.sents_grad[sent_no]) for sent_no, sentence in job) with lock: word_count[0] += job_words sent_count[0] += chunksize elapsed = time.time() - start if elapsed >= next_report[0]: logger.info("PROGRESS: at %.2f%% sents, alpha %.05f, %.0f words/s" % (100.0 * sent_count[0] / total_sents, alpha, word_count[0] / elapsed if elapsed else 0.0)) next_report[0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() def prepare_sentences(): for sent_tuple in sentences: sentence = sent_tuple[0] sent_id = sent_tuple[1] sent_no = self.sent_no_hash[sent_id] sampled = [self.vocab.get(word, None) for word in sentence if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or self.vocab[word].sample_probability >= random.random_sample())] yield (sent_no, sampled) # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue for job_no, job in enumerate(utils.grouper(prepare_sentences(), chunksize)): logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(self.workers): jobs.put(None) # give the workers heads up that they can finish -- no more work! for thread in workers: thread.join() elapsed = time.time() - start logger.info("training on %i words took %.1fs, %.0f words/s" % (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0)) return word_count[0]
from utils import grouper

NUM_POKEMON = 151

# Download the html
url = r"http://bulbapedia.bulbagarden.net/"
url += "wiki/List_of_Pokémon_by_index_number_(Generation_I)"
html_file = urllib2.urlopen(url)
html = html_file.read()
html_file.close()

# Parse with BeautifulSoup, grab the types
bs = BeautifulSoup(html)
x = [td.text for td in bs.findAll("td") if 0 < len(td.text) < 60]
pokemon_types = dict()
for data in grouper(x, 5):
    hexx, weirdno, name, type1, type2 = data
    if "Trainer" in name:
        break  # just eyeballed the data to find this break point
    if "Missingno" in name:
        continue  # lots of glitch Pokemon, just skip them
    pokemon_types[name] = (type1, type2)
assert len(pokemon_types) == NUM_POKEMON

# Merge this with existing Pokemon data
basestats = dict()
basestats_file = open("base_stats.csv")
basestats_file.readline()  # skip the header
for line in basestats_file:
    number, name, hp, attack, defense, speed, special = \
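# The snippet above breaks off while merging the scraped types with a local
# base-stats CSV. A self-contained sketch of that merge using csv.DictReader
# (the column name "Name" is an assumption about base_stats.csv, not taken
# from the original file):
import csv


def merge_types(pokemon_types, stats_path="base_stats.csv"):
    merged = {}
    with open(stats_path, newline="") as fh:
        for row in csv.DictReader(fh):  # the header row supplies the field names
            name = row["Name"]  # assumed column name
            merged[name] = {
                "stats": row,
                "types": pokemon_types.get(name, (None, None)),
            }
    return merged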
for i in range(50):
    actual *= raiz_doceava_de_dos
    frecuencias.append(actual)

TONO = 2
SEMITONO = 1
escala = [TONO, TONO, SEMITONO, TONO, TONO, TONO, SEMITONO]
# Extend the scale so we can reach a few more notes
escala *= 2

notas = [frecuencias[0]]

# Skipping ahead two scale intervals at a time produces a third
actual = 0
for grupo in grouper(escala, 2, 0):
    actual += sum(grupo)
    notas.append(frecuencias[actual])

muestras_por_segundo = 44100
duracion = 0.5
muestras_totales = duracion * muestras_por_segundo

muestras = []

for frecuencia in notas:
    ciclos_por_muestra = frecuencia / muestras_por_segundo
    incremento = 2 * math.pi * ciclos_por_muestra
    fase = 0
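# The final loop above only sets up the per-note phase increment; a minimal
# sketch of how the synthesis could continue (an assumption, not part of the
# original): accumulate the phase and emit one sine sample per step, scaled to
# the 16-bit range a WAV writer expects.
import math


def sine_samples(frecuencia, duracion=0.5, muestras_por_segundo=44100, amplitud=0.5):
    incremento = 2 * math.pi * frecuencia / muestras_por_segundo  # radians per sample
    fase = 0.0
    muestras = []
    for _ in range(int(duracion * muestras_por_segundo)):
        muestras.append(int(32767 * amplitud * math.sin(fase)))
        fase += incremento
    return muestras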
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-model', type=str, required=True)
    parser.add_argument('-weights', type=str, required=True)
    parser.add_argument('-results', type=str, required=True)
    parser.add_argument('-word_vector', type=str, default='')
    args = parser.parse_args()

    model = model_from_json(open(args.model).read())
    model.load_weights(args.weights)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    questions_val = open('../data/preprocessed/questions_val2014.txt', 'r').read().decode('utf8').splitlines()
    questions_lengths_val = open('../data/preprocessed/questions_lengths_val2014.txt', 'r').read().decode('utf8').splitlines()
    answers_val = open('../data/preprocessed/answers_val2014_all.txt', 'r').read().decode('utf8').splitlines()
    images_val = open('../data/preprocessed/images_val2014.txt', 'r').read().decode('utf8').splitlines()
    vgg_model_path = '../features/coco/vgg_feats.mat'

    questions_lengths_val, questions_val, answers_val, images_val = (
        list(t) for t in zip(*sorted(zip(questions_lengths_val, questions_val,
                                         answers_val, images_val))))
    print 'Model compiled, weights loaded'

    labelencoder = joblib.load('../models/labelencoder.pkl')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print 'Loaded vgg features'
    image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    if args.word_vector == 'glove':
        nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    else:
        nlp = English()
    print 'loaded ' + args.word_vector + ' word2vec features...'

    nb_classes = 1000
    y_predict_text = []
    batchSize = 128
    widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets)

    for qu_batch, an_batch, im_batch in pbar(zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]),
                                                 grouper(answers_val, batchSize, fillvalue=answers_val[0]),
                                                 grouper(images_val, batchSize, fillvalue=images_val[0]))):
        timesteps = len(nlp(qu_batch[-1]))  # questions are sorted by length, so the last one in the batch is the longest
        X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
        if 'language_only' in args.model:
            X_batch = X_q_batch
        else:
            X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
            X_batch = [X_q_batch, X_i_batch]
        y_predict = model.predict_classes(X_batch, verbose=0)
        y_predict_text.extend(labelencoder.inverse_transform(y_predict))

    total = 0
    correct_val = 0.0
    f1 = open(args.results, 'w')

    for prediction, truth, question, image in zip(y_predict_text, answers_val, questions_val, images_val):
        temp_count = 0
        for _truth in truth.split(';'):
            if prediction == _truth:
                temp_count += 1

        if temp_count > 2:
            correct_val += 1
        else:
            correct_val += float(temp_count) / 3

        total += 1
        f1.write(question.encode('utf-8'))
        f1.write('\n')
        f1.write(image.encode('utf-8'))
        f1.write('\n')
        f1.write(prediction)
        f1.write('\n')
        f1.write(truth.encode('utf-8'))
        f1.write('\n')
        f1.write('\n')

    f1.write('Final Accuracy is ' + str(correct_val / total))
    f1.close()
    f1 = open('../results/overall_results.txt', 'a')
    f1.write(args.weights + '\n')
    f1.write(str(correct_val / total) + '\n\n')
    f1.close()
    print 'Final Accuracy on the validation set is', correct_val / total
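# The scoring loop above gives full credit when the predicted answer matches at
# least 3 of the ';'-separated ground-truth answers, and partial credit of
# matches/3 otherwise, i.e. the usual VQA accuracy min(matches/3, 1). The same
# rule as a small standalone function (a sketch, not taken from the script):
def vqa_accuracy(prediction, truth):
    """truth: the ';'-separated ground-truth answers for one question."""
    matches = sum(1 for t in truth.split(';') if t == prediction)
    return min(matches / 3.0, 1.0)

# e.g. vqa_accuracy('2', '2;2;two;2') -> 1.0, vqa_accuracy('two', '2;2;two;2') -> 0.333...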
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-model', type=str, required=True)
    parser.add_argument('-weights', type=str, required=True)
    parser.add_argument('-results', type=str, required=True)
    args = parser.parse_args()

    model = model_from_json(open(args.model).read())
    model.load_weights(args.weights)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    questions_val = open('../data/preprocessed/questions_val2014.txt', 'r').read().decode('utf8').splitlines()
    answers_val = open('../data/preprocessed/answers_val2014.txt', 'r').read().decode('utf8').splitlines()
    images_val = open('../data/preprocessed/images_val2014.txt', 'r').read().decode('utf8').splitlines()
    vgg_model_path = '../features/coco/vgg_feats.mat'
    print 'Model compiled, weights loaded...'

    labelencoder = joblib.load('../models/labelencoder.pkl')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print 'loaded vgg features'
    image_ids = open('../features/coco_vgg_IDMap.txt').read().splitlines()
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    nlp = English()
    print 'loaded word2vec features'

    nb_classes = 1000
    y_predict_text = []
    batchSize = 128
    widgets = ['Evaluating ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets)

    for qu_batch, an_batch, im_batch in pbar(zip(grouper(questions_val, batchSize, fillvalue=questions_val[0]),
                                                 grouper(answers_val, batchSize, fillvalue=answers_val[0]),
                                                 grouper(images_val, batchSize, fillvalue=images_val[0]))):
        X_q_batch = get_questions_matrix_sum(qu_batch, nlp)
        if 'language_only' in args.model:
            X_batch = X_q_batch
        else:
            X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
            X_batch = np.hstack((X_q_batch, X_i_batch))
        y_predict = model.predict_classes(X_batch, verbose=0)
        y_predict_text.extend(labelencoder.inverse_transform(y_predict))

    correct_val = 0
    incorrect_val = 0
    f1 = open(args.results, 'w')

    for prediction, truth, question, image in zip(y_predict_text, answers_val, questions_val, images_val):
        temp_count = 0
        for _truth in truth.split(';'):
            if prediction == _truth:
                temp_count += 1

        if temp_count > 2:
            correct_val += 1
        else:
            incorrect_val += 1

        f1.write(question.encode('utf-8'))
        f1.write('\n')
        f1.write(image.encode('utf-8'))
        f1.write('\n')
        f1.write(prediction)
        f1.write('\n')
        f1.write(truth.encode('utf-8'))
        f1.write('\n')
        f1.write('\n')

    f1.write('Final Accuracy is ' + str(float(correct_val) / (incorrect_val + correct_val)))
    f1.close()
    f1 = open('../results/overall_results.txt', 'a')
    f1.write(args.weights + '\n')
    f1.write(str(float(correct_val) / (incorrect_val + correct_val)) + '\n')
    f1.close()
    print 'Final Accuracy on the validation set is', float(correct_val) / (incorrect_val + correct_val)
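# Both evaluation scripts above pad the final batch up to batchSize by reusing
# the first element as grouper()'s fillvalue, so every batch fed to the model
# has the same shape; the surplus predictions are dropped later because zip()
# stops at the shortest sequence (the real answer list). A tiny illustration
# with made-up data:
from utils import grouper  # the helper sketched earlier

questions = ['q1', 'q2', 'q3', 'q4', 'q5']
batches = [list(b) for b in grouper(questions, 3, fillvalue=questions[0])]
# -> [['q1', 'q2', 'q3'], ['q4', 'q5', 'q1']]  (last batch padded with 'q1')

predictions = ['p' + q[1:] for batch in batches for q in batch]  # 6 predictions
answers = ['a1', 'a2', 'a3', 'a4', 'a5']                         # only 5 real answers
scored = list(zip(predictions, answers))  # the padded 6th prediction is discarded here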
def main():
    client = InfluxDBClient(host=args.influxdb_host, ssl=args.ssl, verify_ssl=False,
                            port=8086, database=args.database)
    logger = configure_logging('parse_iostat')
    iostat_timezone = timezone(args.timezone)

    with open(args.input_file, 'r') as f:
        if args.hostname:
            hostname = args.hostname  # use the hostname supplied on the command line
            f.__next__()  # Skip the "Linux..." line
        else:
            hostname = re.split(r'[()]', f.readline())[1]
            logger.info("Found hostname {}".format(hostname))
        f.__next__()  # Skip the blank line
        line_counter = 2
        for chunk_index, chunk in enumerate(grouper(parse_iostat(f), args.batch_size)):
            json_points = []
            for block in chunk:
                if block:
                    try:
                        for i, line in enumerate(block):
                            line_counter += 1
                            if i == 0:
                                timestamp = iostat_timezone.localize(line)
                                # TODO: Timezone?
                                # TODO: Better way of storing timestamp
                            elif i == 1:
                                # CPU metric headings
                                pass
                            elif i == 2:
                                system_stats = dict(zip(system_stat_headers, line.split()))
                                values = {}
                                for metric_name, value in system_stats.items():
                                    values[metric_name] = float(value)
                                json_points.append({
                                    "measurement": "iostat",
                                    "tags": {
                                        "project": args.project,
                                        "hostname": hostname
                                    },
                                    "time": timestamp.isoformat(),
                                    "fields": values
                                })
                            elif i == 4:
                                # Disk metric headings
                                pass
                            elif i >= 5 and line:
                                disk_stats = {}
                                device = line.split()[0]
                                disk_stats[device] = dict(zip(disk_stat_headers, line.split()[1:]))
                                for disk_name, metrics in disk_stats.items():
                                    values = {}
                                    for metric_name, value in metrics.items():
                                        # Nasty hack to deal with bad data from Morgan Stanley
                                        # if disk_name not in ['sda', 'sdb', 'dm-0', 'dm-1', 'dm-2']:
                                        #     print(block)
                                        #     raise ValueError
                                        values[metric_name] = float(value)
                                    json_points.append({
                                        "measurement": "iostat",
                                        "tags": {
                                            "project": args.project,
                                            "hostname": hostname,
                                            "device": disk_name,
                                        },
                                        "time": timestamp.isoformat(),
                                        "fields": values
                                    })
                    except ValueError as e:
                        print("Bad output seen - skipping")
                        print(e)
                        print(block)
            write_points(logger, client, json_points, line_counter)
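# write_points() is not shown above. A minimal sketch of what such a helper
# could look like with the influxdb client library (the logging and error
# handling are assumptions; client.write_points(points) is the library call
# the script ultimately needs):
from influxdb.exceptions import InfluxDBClientError, InfluxDBServerError


def write_points(logger, client, json_points, line_counter):
    if not json_points:
        return
    try:
        client.write_points(json_points)
        logger.info("Wrote %d points (input parsed up to line %d)", len(json_points), line_counter)
    except (InfluxDBClientError, InfluxDBServerError) as e:
        logger.error("Failed to write batch ending at line %d: %s", line_counter, e)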