def download_quandl_dataset(quandl_api_key, database, dataset, save_path, columns, tickers, start_date, end_date):
    """
    Download a dataset from Quandl and save it to `save_path`.
    Filter by columns, tickers, and date.

    :param quandl_api_key: The Quandl API key
    :param database: The Quandl database to download from
    :param dataset: The dataset to download
    :param save_path: The path to save the dataset
    :param columns: The columns to save
    :param tickers: The tickers to save
    :param start_date: The earliest date of rows to save
    :param end_date: The latest date of rows to save
    """
    scrape_url = 'https://www.quandl.com/api/v3/datatables/{}/{}?qopts.export=true&api_key={}'\
        .format(database, dataset, quandl_api_key)
    scrape_request = requests.get(scrape_url)
    bulk_download_url = scrape_request.json()['datatable_bulk_download']['file']['link']

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_wiki_file = os.path.join(tmp_dir, 'tmp.zip')

        bulk_download_request = requests.get(bulk_download_url, stream=True, cookies=scrape_request.cookies)
        total_size = int(bulk_download_request.headers.get('content-length', 0))
        block_size = 1024 * 1024
        with open(tmp_wiki_file, 'wb') as f:
            for data in tqdm(
                    bulk_download_request.iter_content(block_size),
                    total=math.ceil(total_size / block_size),
                    unit='MB', unit_scale=True, desc='Downloading Data'):
                f.write(data)

        with tqdm(total=5, desc='Transforming Data', unit='Action') as pbar:
            # Unzip downloaded data
            zip_ref = zipfile.ZipFile(tmp_wiki_file, 'r')
            zip_ref.extractall(tmp_dir)
            zip_ref.close()
            pbar.update(1)

            # Check if the zip file only contains one csv file
            # We're assuming that Quandl will always give us the data in a single csv file.
            # If it's different, we want to throw an error.
            csv_files = glob.glob(os.path.join(tmp_dir, '*.csv'))
            assert len(csv_files) == 1,\
                'Bulk download of Quandl Wiki data failed. Wrong number of csv files found. Found {} file(s).'\
                .format(len(csv_files))
            tmp_csv_file = csv_files[0]

            tmp_df = pd.read_csv(tmp_csv_file)
            pbar.update(1)

            tmp_df['date'] = pd.to_datetime(tmp_df['date'])
            pbar.update(1)

            # Remove unused data and save
            tmp_df = tmp_df[tmp_df['date'].isin(pd.date_range(start_date, end_date))]  # Filter unused dates
            tmp_df = tmp_df[tmp_df['ticker'].isin(tickers)]  # Filter unused tickers
            pbar.update(1)

            tmp_df.to_csv(save_path, columns=columns, index=False)  # Filter unused columns and save
            pbar.update(1)
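# Example (hedged): a minimal usage sketch for download_quandl_dataset. The API key,
# database/dataset names, columns, tickers, dates, and save path below are hypothetical
# illustrations, not values taken from the original project.
if __name__ == '__main__':
    download_quandl_dataset(
        quandl_api_key='YOUR_API_KEY',           # assumption: caller supplies a valid key
        database='WIKI', dataset='PRICES',       # assumption: any Quandl datatable works here
        save_path='wiki_prices.csv',
        columns=['ticker', 'date', 'adj_close'],
        tickers=['AAPL', 'MSFT'],
        start_date='2013-01-01', end_date='2017-12-31')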
def fix_suspects(input_dir, output_dir):
    skipped_items = []
    count = 0
    with open("wonky_unitdate_display_candidates.csv", mode="r") as f:
        example_dict = {}
        reader = csv.reader(f)
        items = list(reader)
        # reverse the input list so that xpaths remain valid as I edit multiple entries in one ead
        items.reverse()
        for filename, xpath, text_with_tags, text_without_tags, action in tqdm(items):
            example_dict[filename] = example_dict.get(filename, [])
            example_dict[filename].append((xpath, action, text_without_tags))

    for ead, dict_value_list in tqdm(example_dict.items()):
        tree = etree.parse(os.path.join(input_dir, ead))
        for xpath, action, text in dict_value_list:
            unittitle = tree.xpath(xpath)[0]
            # disparity = find_date_disparity(unittitle)
            # if disparity > 10 and action == "move_and_calcify" and ead != "geolsurv.xml":
            #     skipped_items.append([ead, xpath, text, disparity, action])
            # else:
            #     move_unitdates(unittitle, action)
            move_unitdates(unittitle, action)

        with open(os.path.join(output_dir, ead), mode="w") as f:
            f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True, encoding="utf-8"))

    # with open("skipped_items.csv", mode="wb") as f:
    #     writer = csv.writer(f)
    #     writer.writerows(skipped_items)

    print("Skipped {0} entries".format(len(skipped_items)))
def run():
    batch_size = 4000

    global signatures
    signatures = get_pickled_signatures()

    pool = avito_utils.PoolWrapper(processes=4)
    name = 'ssim'

    print 'processing train data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_%s_train.csv' % name)

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_%s_train.csv' % name)

    print 'processing train data took %0.5fs' % (time() - t0)

    print 'processing test data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_%s_test.csv' % name)

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_%s_test.csv' % name)

    print 'processing test data took %0.5fs' % (time() - t0)

    pool.close()
def augment_arrays(project):
    array_path = os.path.join(project['path'], 'array')
    augmented_path = os.path.join(project['path'], 'augmented')
    shutil.rmtree(augmented_path, ignore_errors=True)
    os.makedirs(augmented_path)

    if project['augmentations'] is None:
        print('No augmentations selected: copying train arrays as is.')
        files = os.listdir(array_path)
        for file in tqdm(files):
            shutil.copy(os.path.join(array_path, file), augmented_path)
    else:
        print('Generating image augmentations:')
        for img_idx, (array, label, label_name) in tqdm(enumerate(gen_arrays_from_dir(array_path))):
            split_label_name = '-'.join(label_name.split('-')[2:-1])
            for aug_idx, (array_aug, label_aug) in enumerate(
                    gen_augment_arrays(array, label, project['augmentations'],
                                       project['category_rounds'][split_label_name])):
                cat_idx = np.argmax(label_aug)
                cat = project['categories'][cat_idx]

                img_name = '{}-{:02d}-img-{}-{}'.format(img_idx, aug_idx, cat, cat_idx)
                label_name = '{}-{:02d}-label-{}-{}'.format(img_idx, aug_idx, cat, cat_idx)

                aug_path = os.path.join(augmented_path, img_name)
                label_path = os.path.join(augmented_path, label_name)

                np.save(aug_path, array_aug)
                np.save(label_path, label_aug)

    project['is_augmented'] = True
    return project
def find_duplicates(directories):
    for d in directories:
        if not os.path.exists(d):
            raise ValueError("Directory %s does not exist" % d)
        elif not os.path.isdir(d):
            raise ValueError("Expected %s to be a directory" % d)

    file_hashes = defaultdict(set)

    print("Scanning for files…")
    all_files = deque()
    for filename in tqdm(find_files(directories)):
        all_files.append(filename)

    print("Hashing %d files" % len(all_files))
    with ThreadPoolExecutor() as executor:
        for filename, digest in tqdm(
            executor.map(get_file_hash, all_files), total=len(all_files)
        ):
            file_hashes[digest].add(filename)

    for digest, filenames in file_hashes.items():
        if len(filenames) < 2:
            continue
        else:
            yield digest, filenames
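# Example (hedged): a minimal sketch of consuming the find_duplicates generator.
# The directory paths are hypothetical; each yielded item is (digest, set of paths).
if __name__ == '__main__':
    for digest, filenames in find_duplicates(['/data/photos', '/backup/photos']):
        print('Duplicate group %s:' % digest)
        for name in sorted(filenames):
            print('  %s' % name)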
def run():
    global mongo, scaler
    mongo = MongoWrapper(avito_utils.avito_db)
    scaler = prepare_scaler()

    batch_size = 8000
    name = 'imagemagick'

    pool = avito_utils.PoolWrapper()

    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_%s_train.csv' % name)

    print 'read train set, start processing...'
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        batch = process_batch(batch, pool)
        append_to_csv(batch, 'features_%s_train.csv' % name)

    print 'processing train set took %0.5fs' % (time() - t0)

    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_%s_test.csv' % name)

    print 'read test set, start processing...'
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        batch = process_batch(batch, pool)
        append_to_csv(batch, 'features_%s_test.csv' % name)

    print 'processing test set took %0.5fs' % (time() - t0)

    pool.close()
def to_html(self, outdir, template=None):
    pages_set = self.pages_set
    if template is None:
        template = textwrap.dedent("""\
            <html>
            <head>
            <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
            <title>Page {page}</title>
            <link rel="stylesheet" type="text/css" href="teletext.css" title="Default Style"/>
            <link rel="alternative stylesheet" type="text/css" href="teletext-noscanlines.css" title="No Scanlines"/>
            <script type="text/javascript" src="cssswitch.js"></script>
            </head>
            <body onload="set_style_from_cookie()">
            {body}
            </body>
            </html>
            """)

    for magazineno, magazine in tqdm(self.magazines.items(), desc='Magazines', unit='M'):
        for pageno, page in tqdm(magazine.pages.items(), desc='Pages', unit='P'):
            pagestr = f'{magazineno}{pageno:02x}'
            outfile = open(os.path.join(outdir, f'{pagestr}.html'), 'w')
            body = '\n'.join(
                subpage.to_html(pages_set)
                for n, subpage in sorted(page.subpages.items())
            )
            outfile.write(template.format(page=pagestr, body=body))
def test_word2id():
    """Convert every word in the eval (test) set to its corresponding id."""
    time0 = time.time()
    print('Processing eval data.')
    df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4],
                          names=['question_id', 'word_title', 'word_content'],
                          dtype={'question_id': object})
    print('test question number %d' % len(df_eval))

    # For questions without a title, use the content as the title instead
    na_title_indexs = list()
    for i in xrange(len(df_eval)):
        word_title = df_eval.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d test questions without title.' % len(na_title_indexs))
    for na_index in na_title_indexs:
        df_eval.at[na_index, 'word_title'] = df_eval.at[na_index, 'word_content']

    # For questions without content, use the title as the content instead
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_eval))):
        word_content = df_eval.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d test questions without content.' % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_eval.at[na_index, 'word_content'] = df_eval.at[na_index, 'word_title']

    # Convert the words to id form
    p = Pool()
    eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values))
    np.save('../data/wd_eval_title.npy', eval_title)
    eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values))
    np.save('../data/wd_eval_content.npy', eval_content)
    p.close()
    p.join()
    print('Finished changing the eval words to ids. Costed time %g s' % (time.time() - time0))
def train_word2id():
    """Convert every word in the training set to its corresponding id."""
    time0 = time.time()
    print('Processing train data.')
    df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 2, 4],
                           names=['question_id', 'word_title', 'word_content'],
                           dtype={'question_id': object})
    print('training question number %d ' % len(df_train))

    # For questions without content, use the title as the content instead
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_train))):
        word_content = df_train.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d train questions without content.' % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_train.at[na_index, 'word_content'] = df_train.at[na_index, 'word_title']

    # Questions without a title are dropped
    na_title_indexs = list()
    for i in xrange(len(df_train)):
        word_title = df_train.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d train questions without title.' % len(na_title_indexs))
    df_train = df_train.drop(na_title_indexs)
    print('After dropping, training question number(should be 2999952) = %d' % len(df_train))

    # Convert the words to id form
    p = Pool()
    train_title = np.asarray(p.map(get_id4words, df_train.word_title.values))
    np.save('../data/wd_train_title.npy', train_title)
    train_content = np.asarray(p.map(get_id4words, df_train.word_content.values))
    np.save('../data/wd_train_content.npy', train_content)
    p.close()
    p.join()
    print('Finished changing the training words to ids. Costed time %g s' % (time.time() - time0))
def merge_po_files(po_files, fuzzy=False):
    """Given a list of po files, replicate known translations from each
    file to the others.
    """
    known_translations = {}

    # Aggregate all known translations
    for po_file in tqdm(po_files, desc="Searching known translations"):
        po_file = polib.pofile(po_file)
        for entry in po_file:
            if 'fuzzy' not in entry.flags and entry.msgstr != '':
                known_translations[entry.msgid] = entry.msgstr

    # Propagate them
    done = 0
    for po_file in tqdm(po_files, desc="Replicating them"):
        po_file = polib.pofile(po_file)
        for entry in po_file:
            if entry.msgid in known_translations:
                entry.msgstr = known_translations[entry.msgid]
            elif fuzzy:
                best_match = find_best_match(list(known_translations.keys()), entry.msgid)
                if best_match is not None:
                    print("I think\n  {}\n  =\n  {}".format(entry.msgid, best_match))
                    entry.msgstr = known_translations[best_match]
                    entry.flags.append('fuzzy')
        po_file.save()
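# Example (hedged): a minimal sketch of calling merge_po_files on a set of .po files
# collected with glob. The directory layout is a hypothetical illustration.
if __name__ == '__main__':
    import glob
    merge_po_files(glob.glob('locales/**/*.po', recursive=True), fuzzy=False)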
def generate_code(self, Modal, bit, generate):
    batch_size = 128
    if generate == "label":
        num_data = Modal.shape[0]
        index = np.linspace(0, num_data - 1, num_data).astype(int)
        B = np.zeros([num_data, bit], dtype=np.float32)
        for iter in tqdm(xrange(num_data / batch_size + 1)):
            ind = index[iter * batch_size: min((iter + 1) * batch_size, num_data)]
            label = Modal[ind, :].astype(np.float32)
            label = label.reshape([label.shape[0], 1, label.shape[1], 1])
            Hsh_L = self.Hsh_L.eval(feed_dict={self.ph['label_input']: label})
            B[ind, :] = Hsh_L
    elif generate == "image":
        num_data = len(Modal)
        index = np.linspace(0, num_data - 1, num_data).astype(int)
        B = np.zeros([num_data, bit], dtype=np.float32)
        for iter in tqdm(xrange(num_data / batch_size + 1)):
            ind = index[iter * batch_size: min((iter + 1) * batch_size, num_data)]
            mean_pixel = np.repeat(self.meanpix[:, :, :, np.newaxis], len(ind), axis=3)
            image = Modal[ind, :, :, :].astype(np.float64)
            image = image - mean_pixel.astype(np.float64).transpose(3, 0, 1, 2)
            Hsh_I = self.Hsh_I.eval(feed_dict={self.ph['image_input']: image})
            B[ind, :] = Hsh_I
    else:
        num_data = Modal.shape[0]
        index = np.linspace(0, num_data - 1, num_data).astype(int)
        B = np.zeros([num_data, bit], dtype=np.float32)
        for iter in tqdm(xrange(num_data / batch_size + 1)):
            ind = index[iter * batch_size: min((iter + 1) * batch_size, num_data)]
            text = Modal[ind, :].astype(np.float32)
            text = text.reshape([text.shape[0], 1, text.shape[1], 1])
            Hsh_T = self.Hsh_T.eval(feed_dict={self.ph['text_input']: text})
            B[ind, :] = Hsh_T

    B = np.sign(B)
    return B
def read_raw_docs(lines: List[str], size: int, workers: int) -> np.ndarray:
    if size == -1:
        size = len(lines)
    lines = lines[:size]
    documents = np.empty(size, dtype=object)
    memory_impact = sum([sys.getsizeof(s) for s in lines])
    # jeopardy 32862372
    # recipes 187414159
    if memory_impact < 50000000:
        offset = 0
        linebins = np.array_split(lines, workers)  # this is the offending large memory line
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = {executor.submit(clean_text, linebins[i]): i for i in range(workers)}
            for future in tqdm(concurrent.futures.as_completed(futures),
                               desc='Tokenizing Documents', total=workers, leave=True):
                index = futures[future]
                for i, line in enumerate(future.result()):
                    documents[offset + i] = line
                offset += len(future.result())
    else:
        print('Use Large Memory Algorithm')
        offset = 0
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = {executor.submit(clean_line, lines[i]): i for i in range(size)}
            for future in tqdm(concurrent.futures.as_completed(futures),
                               desc='Tokenizing Documents', total=size, leave=True):
                documents[offset] = future.result()
                offset += 1
    return documents
def pro_progess(filepath="../data"):
    height = 299
    train_files = os.listdir(filepath + '/train')
    train = np.zeros((len(train_files), height, height, 3), dtype=np.uint8)
    # 1 = dog, 0 = cat (one label per training file)
    labels = [int(name[:3] == 'dog') for name in train_files]
    test_files = os.listdir(filepath + '/test')
    test = np.zeros((len(test_files), height, height, 3), dtype=np.uint8)

    for i in tqdm(range(len(train_files))):
        filename = filepath + '/train/' + train_files[i]
        img = cv2.imread(filename)
        img = cv2.resize(img, (height, height))
        train[i] = img[:, :, ::-1]

    for i in tqdm(range(len(test_files))):
        filename = filepath + '/test/' + test_files[i]
        img = cv2.imread(filename)
        img = cv2.resize(img, (height, height))
        test[i] = img[:, :, ::-1]

    print('Training Data Size = %.2f GB' % (sys.getsizeof(train) / 1024**3))
    print('Testing Data Size = %.2f GB' % (sys.getsizeof(test) / 1024**3))

    X_train, X_val, y_train, y_val = train_test_split(
        train, labels, shuffle=True, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val
def run():
    textfiles = glob.glob('anjuke_new_house/*txt')
    if len(textfiles) != 0:
        print ">> compress files under anjuke_new_house"
        f = zipfile.ZipFile('anjuke_new_house/anjuke_new_house.zip', 'w', zipfile.ZIP_DEFLATED)
        for textfile in tqdm(textfiles):
            f.write(textfile)
            os.remove(textfile)
        f.close()

    textfiles = glob.glob('anjuke_second_house/*txt')
    if len(textfiles) != 0:
        print ">> compress files under anjuke_second_house"
        f = zipfile.ZipFile('anjuke_second_house/anjuke_second_house.zip', 'w', zipfile.ZIP_DEFLATED)
        for textfile in tqdm(textfiles):
            f.write(textfile)
            os.remove(textfile)
        f.close()

    textfiles = glob.glob('anjuke_renting_house/*txt')
    if len(textfiles) != 0:
        print ">> compress files under anjuke_renting_house"
        f = zipfile.ZipFile('anjuke_renting_house/anjuke_renting_house.zip', 'w', zipfile.ZIP_DEFLATED)
        for textfile in tqdm(textfiles):
            f.write(textfile)
            os.remove(textfile)
        f.close()
def createDataTxt(imagePath, annotationPath, imagesInDir, split=False):
    JPG = '.jpg'
    TRAINING = 'training/'
    VALIDATION = 'validation/'

    if split:
        annotatedImages = os.listdir(annotationPath)
        # np.random.shuffle(annotatedImages)
        splitSize = ceil(len(annotatedImages) * 0.85)
        annotatedImagesTrain = annotatedImages[:splitSize]
        annotatedImagesValidation = annotatedImages[splitSize:]
    else:
        annotatedImagesTrain = os.listdir(join(annotationPath, TRAINING))
        annotatedImagesValidation = os.listdir(join(annotationPath, VALIDATION))

    with open(imagesInDir + 'train.txt', 'w') as file:
        for ann in tqdm(annotatedImagesTrain, desc='Writing train.txt for input dataset'):
            if isfile(join(imagePath, TRAINING, splitext(ann)[0]) + JPG):
                file.write(' '.join(
                    [join(imagePath, TRAINING, splitext(ann)[0]) + JPG,
                     join(annotationPath, TRAINING, ann)]) + '\n')

    with open(imagesInDir + 'val.txt', 'w') as file:
        for annv in tqdm(annotatedImagesValidation, desc='Writing valid.txt for input dataset'):
            if isfile(join(imagePath, VALIDATION, splitext(annv)[0]) + JPG):
                file.write(' '.join(
                    [join(imagePath, VALIDATION, splitext(annv)[0]) + JPG,
                     join(annotationPath, VALIDATION, annv)]) + '\n')

    return
def store_contents(data_path, save_path, preprocess, num_workers=None):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of files
          containing json encoded documents (must have `id` and `text` fields).
        save_path: Path to output sqlite db.
        preprocess: Path to file defining a custom `preprocess` function. Takes
          in and outputs a structured doc.
        num_workers: Number of parallel processes to use when reading docs.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    c = conn.cursor()
    c.execute("CREATE TABLE documents (id PRIMARY KEY, text);")

    workers = ProcessPool(num_workers, initializer=init, initargs=(preprocess,))
    files = [f for f in iter_files(data_path)]
    count = 0
    with tqdm(total=len(files)) as pbar:
        for pairs in tqdm(workers.imap_unordered(get_contents, files)):
            count += len(pairs)
            c.executemany("INSERT INTO documents VALUES (?,?)", pairs)
            pbar.update()
    logger.info('Read %d docs.' % count)
    logger.info('Committing...')
    conn.commit()
    conn.close()
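# Example (hedged): a minimal usage sketch for store_contents. The paths and worker
# count are hypothetical; passing preprocess=None assumes the `init` helper tolerates
# a missing custom preprocessing module (assumption, not confirmed by the code above).
if __name__ == '__main__':
    store_contents(data_path='data/docs',
                   save_path='data/docs.db',
                   preprocess=None,
                   num_workers=4)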
def preprocess_simple_predict():
    df = pd.read_csv('data/data_full.csv')
    df = df[df.is_fake == 0]
    res_df = df.ID.values

    df_target = df[df.target > 0].drop('ID,is_train,is_fake'.split(','), axis=1)
    target = df_target.target.values
    data = df_target.drop(['target', ], axis=1).values.astype(int)

    val_sum = {}
    for i, dat in tqdm(enumerate(data)):
        for d in dat:
            if d <= 0:
                continue
            if d not in val_sum:
                val_sum[d] = [0, 0]
            val_sum[d][0] += target[i]
            val_sum[d][1] += 1

    df['simple_predict'] = 0
    for i, row in tqdm(df.drop('ID,is_train,is_fake,target'.split(','), axis=1).iterrows()):
        summ = 0
        cnt = 0.000001
        for val in row:
            if val not in val_sum or val_sum[val][1] < 10:
                continue
            summ += val_sum[val][0]
            cnt += val_sum[val][1]
        df.loc[i, 'simple_predict'] = summ / cnt

    df[['ID', 'simple_predict']].to_csv('data/feat_simple_predict.csv', index=False)
def test_pandas_groupby_apply():
    """ Test pandas.DataFrame.groupby(...).progress_apply """
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except:
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 50, (500, 3)))
        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))

        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True))
        df.groupby(0).progress_apply(lambda x: None)

        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True))
        dfs.groupby(['a']).progress_apply(lambda x: None)

        our_file.seek(0)

        # don't expect final output since no `leave` and
        # high dynamic `miniters`
        nexres = '100%|##########|'
        if nexres in our_file.read():
            our_file.seek(0)
            raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format(
                nexres, our_file.read()))
def process_ngram(ngram_generator, limit=LIMIT):
    result = {}

    print 'Processing ngrams:'
    for ngram in tqdm(ngram_generator):
        if ngram not in result:
            result[ngram] = 1
        else:
            result[ngram] += 1

    print 'Removing results with n=1:'
    to_remove = set()
    for ngram in tqdm(result):
        if result[ngram] <= 1:
            to_remove.add(ngram)
    for ngram in to_remove:
        del result[ngram]

    sorted_result = sorted(result.items(),
                           lambda x, y: result[x[0]].__cmp__(result[y[0]]),
                           reverse=True)
    if limit:
        return sorted_result[:limit]
    else:
        return sorted_result
def getFeatures(self):
    files = glob.glob(self.objectPath + self.preProcessedData + '*.npy')
    split_length = None
    if self.windowSize != "None":
        split_length = self.windowSize * self.samplingFrequency

    split_based = open(self.objectPath + self.dataFeatures + self.featureExtracted, 'w', newline='')
    writer = csv.writer(split_based, delimiter=',')
    header_writen = False

    for file in tqdm(files):
        file_split = file.split('_')
        recording_class = file_split[2]
        recording = np.load(file)
        i = 0
        for channel in tqdm(recording):
            if self.windowSize == "None":
                split_length = len(channel)
            limit = int(len(channel) / split_length) * split_length
            channel = channel[0:limit]
            splits = np.split(channel, limit // split_length)
            j = 1
            for split in tqdm(splits):
                self.channel_data = split
                data_ = self.runPipeline()
                temp = [file_split[0], recording_class, self.channels[i], j]
                features = list(data_[0])
                if not header_writen:
                    writer.writerow(
                        ['filename', 'experiment_identifier', 'channel_name', 'split_number'] + list(data_[1])
                    )
                    header_writen = True
                writer.writerow(temp + features)
                # break
                j += 1
            # break
            i += 1
def clone(self, url=None, update_existing_config=False):
    """Clone a data store

    Parameters
    ----------
    url : str
        URL of the data store to clone
    update_existing_config : bool
        If True, updates the existing config file to point to the given file
        for the store configuration
    """
    from tqdm import tqdm
    try:
        makedirs(self.powdir)
        print('Cloning...', file=sys.stderr)
        with tqdm(file=sys.stderr, unit=' objects', miniters=0) as progress:
            self.repository_provider.clone(url, base=self.powdir, progress=progress)
        if not exists(self.config_file):
            self._init_config_file()
        self._init_store()
        print('Deserializing...', file=sys.stderr)
        with tqdm(unit=' ctx', file=sys.stderr) as ctx_prog, \
                tqdm(unit=' triples', file=sys.stderr, leave=False) as trip_prog:
            self._load_all_graphs(ctx_prog, trip_prog)
        print('Done!', file=sys.stderr)
    except BaseException as e:
        self._ensure_no_powdir()
        raise e
def predict_kfold(cls, X, y, n_folds=10, seed=0, textModel_params={},
                  kfolds=None, pool=None, use_tqdm=True):
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(x, **kwargs):
            return x

    le = preprocessing.LabelEncoder().fit(y)
    y = np.array(le.transform(y))
    hy = np.zeros(len(y), dtype=np.int)
    if kfolds is None:
        kfolds = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                 random_state=seed).split(X, y)
    args = [(X, y, tr, ts, textModel_params) for tr, ts in kfolds]
    if pool is not None:
        if use_tqdm:
            res = [x for x in tqdm(pool.imap_unordered(cls.train_predict_pool, args),
                                   desc='Params', total=len(args))]
        else:
            res = [x for x in pool.imap_unordered(cls.train_predict_pool, args)]
    else:
        if use_tqdm:
            args = tqdm(args)
        res = [cls.train_predict_pool(x) for x in args]

    for ts, _hy in res:
        hy[ts] = _hy
    return le.inverse_transform(hy)
def normalize_features(X_train, X_test):
    n_features = X_train.shape[1]
    feature_sums = np.sum(X_test, axis=1)
    nonblack_vectors = np.where(feature_sums > 0, 1, 0)
    # print nonblack_vectors.shape
    mask = []
    for x in range(X_test.shape[0]):
        mask.append([nonblack_vectors[x]] * n_features)
    mask = np.array(mask)

    X_test_nonblack = X_test[np.where(feature_sums > 0)]
    X = np.concatenate((X_train, X_test_nonblack))
    # print X, X.shape
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    for d in tqdm(range(len(X_train))):
        X_train[d] = (X_train[d] - mean) / std
    for d in tqdm(range(len(X_test))):
        X_test[d] = (X_test[d] - mean) / std

    # Make once fully black vectors fully black again
    X_test = X_test * mask
    return X_train, X_test
def scan_dir(path, dir_json):
    # Preprocess the total files count
    for root, dirs, files in tqdm(os.walk(path)):
        for name in files:
            path = os.path.join(root, name)
            if os.path.getsize(path) > (25 * 1024 * 1024):
                ext = os.path.splitext(name)[1]
                if ext in EXT:
                    movie_name.append(name)

    with tqdm(total=len(movie_name), leave=True, unit='B', unit_scale=True) as pbar:
        for name in movie_name:
            data = get_movie_info(name)
            pbar.update()
            if data is not None and data['Response'] == 'True':
                for key, val in data.items():
                    if val == "N/A":
                        data[key] = "-"  # Should N/A be replaced with `-`?
                movies.append(data)
            else:
                if data is not None:
                    movie_not_found.append(name)

    with open(dir_json, "w") as out:
        json.dump(movies, out, indent=2)
def compare_assemblies(assemblies, chunk_size=2000, identity_threshold=0.40):
    """
    compares a set of assemblies:
    assemblies is a dictionary with names of the assemblies as keys and
    fasta-files of the assemblies as values
    """
    similarities = {}

    print "make blast dbs"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_cmd = ["makeblastdb", "-in", subject, "-dbtype", "nucl", "-out", subject]
        with open("/dev/null") as null:
            blastdb_return = call(blast_db_cmd, stdout=null)

    print "Run the hell out of it"
    for scaff_name, scaff in tqdm(assemblies.iteritems()):
        similarities[scaff_name] = {}
        chopped_up_query = "tmp.fasta"
        nb_chunks = len(cut_up_fasta(scaff, chopped_up_query, chunk_size))
        for subject_name, subject in assemblies.iteritems():
            nics = find_NICs(chopped_up_query, subject, identity_threshold, blast_db=False)
            # print scaff_name, "vs", subject_name
            # use float division so the similarity is a fraction of chunks, not 0/1
            similarities[scaff_name][subject_name] = float(len(nics.keys())) / nb_chunks
        os.remove(chopped_up_query)

    print "clean up"
    for subject_name, subject in tqdm(assemblies.iteritems()):
        blast_db_files = [subject + ".nhr", subject + ".nin", subject + ".nsq"]
        for f in blast_db_files:
            os.remove(f)

    similars = DataFrame.from_dict(similarities)
    return similars
def run(*args):
    """Reset the in_stock Card property.

    It was set to True by default, but it should be False. Each card that was
    bought once or added from an inventory should then be set to True.
    """
    yes_answers = ["y", "Y", "o", "O", ""]
    go_all_cards = raw_input("Go with all cards ? [Y/n]")
    go_inventories = raw_input("Go with cards applied from inventories ? [Y/n]")

    if go_all_cards in yes_answers:
        print("Setting all cards to not in stock...")
        for card in tqdm(Card.objects.all()):
            card.in_stock = False
            card.save()

    if go_inventories in yes_answers:
        print("Registering cards applied from inventories...")
        for inv in tqdm(Inventory.objects.filter(applied=True)):
            print("Going with inv {}".format(inv.name))
            for card_set in inv.inventorycopies_set.all():
                card_set.card.in_stock = True
                card_set.card.save()

    print("All done.")
def run():
    batch_size = 4000

    print 'reading image hashes from image_hashes.csv...',
    t0 = time()
    global df_hashes
    df_hashes = pd.read_csv('image_hashes.csv')
    df_hashes.set_index('image_id', inplace=1)
    print 'took %0.5fs' % (time() - t0)

    pool = avito_utils.PoolWrapper(processes=4)

    print 'processing train data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_imagehash_train.csv')

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_imagehash_train.csv')

    print 'processing train data took %0.5fs' % (time() - t0)

    print 'processing test data...'
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_imagehash_test.csv')

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_imagehash_test.csv')

    print 'processing test data took %0.5fs' % (time() - t0)

    pool.close()
def make_tqdm_iterator(**kwargs):
    options = {
        "file": sys.stdout,
        "leave": True
    }
    options.update(kwargs)

    if session_type() == 'kernel':
        # from IPython import display
        # capture_stderr = StringIO()
        # with RedirectStdStreams(stderr=capture_stderr):
        #     try:
        #         iterator = tqdm_notebook(**options)
        #     except:
        #         failed = True
        #     else:
        #         failed = False
        #     err_out = capture_stderr.getvalue()
        # capture_stderr.close()
        # if failed or err_out.lower().find("widget javascript not detected") > -1:
        #     display.clear_output(wait=True)
        #     iterator = tqdm(**options)
        iterator = tqdm(**options)
    else:
        iterator = tqdm(**options)
    return iterator
def download_url(url, root, filename, md5):
    from six.moves import urllib
    root = os.path.expanduser(root)
    fpath = os.path.join(root, filename)

    try:
        os.makedirs(root)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    # downloads file
    if os.path.isfile(fpath) and check_integrity(fpath, md5):
        print('Using downloaded and verified file: ' + fpath)
    else:
        try:
            print('Downloading ' + url + ' to ' + fpath)
            urllib.request.urlretrieve(
                url, fpath,
                reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True))
            )
        except:
            if url[:5] == 'https':
                url = url.replace('https:', 'http:')
                print('Failed download. Trying https -> http instead.'
                      ' Downloading ' + url + ' to ' + fpath)
                urllib.request.urlretrieve(
                    url, fpath,
                    reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True))
                )
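# Example (hedged): a minimal sketch of calling download_url. The URL, target directory,
# and filename are hypothetical placeholders; passing md5=None assumes check_integrity
# skips checksum verification when no hash is given (assumption, not shown above).
download_url('http://example.com/data/train.tar.gz', root='./data',
             filename='train.tar.gz', md5=None)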
def test_ascii():
    """ Test ascii/unicode bar """
    # Test ascii autodetection
    with closing(StringIO()) as our_file:
        with tqdm(total=10, file=our_file, ascii=None) as t:
            assert t.ascii  # TODO: this may fail in the future

    # Test ascii bar
    with closing(StringIO()) as our_file:
        for _ in tqdm(_range(3), total=15, file=our_file, miniters=1,
                      mininterval=0, ascii=True):
            pass
        our_file.seek(0)
        res = our_file.read().strip("\r").split("\r")
    assert '7%|6' in res[1]
    assert '13%|#3' in res[2]
    assert '20%|##' in res[3]

    # Test unicode bar
    with closing(UnicodeIO()) as our_file:
        with tqdm(total=15, file=our_file, ascii=False, mininterval=0) as t:
            for _ in _range(3):
                t.update()
        our_file.seek(0)
        res = our_file.read().strip("\r").split("\r")
    assert "7%|\u258b" in res[1]
    assert "13%|\u2588\u258e" in res[2]
    assert "20%|\u2588\u2588" in res[3]
def paramsearch(binned, samples_per_knot, n_valid_samples, n_train_folds=3,
                n_valid_folds=1, n_test_folds=1, knot_range=(-1, 2),
                smoothness_range=(1e-2, 1e2), warpreg_range=(1e-2, 1e1),
                iter_range=(50, 300), warp_iter_range=(50, 300), outfile=None):
    """
    Performs nested cross-validation over shift-only, linear, and piecewise
    linear warping models, in order to tune all hyperparameters and compare
    performance.

    For each set of randomly sampled parameters, trials and units are randomly
    split `n_folds` times into train/test groups. An R-squared metric of
    across-trial reliability is measured on each test set; larger scores
    indicate warping functions that generalize better.

    Parameters
    ----------
    binned : ndarray
        trials x timepoints x neurons binned spikes
    samples_per_knot : int
        Number of cross-validation runs per knot.
    n_valid_samples : int
        Number of inner samples to optimize smoothness and warp complexity
        regularization parameters on validation set.
    n_train_folds : int
        Number of folds used for training.
    n_valid_folds : int
        Number of folds used for validation.
    n_test_folds : int
        Number of folds used for testing.
    knot_range : tuple of ints
        Specifies [minimum, maximum) number of knots in warping functions.
        A value of -1 denotes a shift-only warping model; a value of 0 denotes
        a linear warping model (no interior knots); etc.
    smoothness_range : tuple of floats
        Specifies [minimum, maximum) strength of regularization on template
        smoothness; larger values penalize roughness over time more
        stringently. The regularization strength for each model is randomly
        sampled from a log-uniform distribution over this interval.
    warpreg_range : tuple of floats
        Specifies [minimum, maximum) strength of regularization on the area
        between the warping functions and the identity line; larger values
        penalize warping more stringently. The regularization strength for
        each model is randomly sampled from a log-uniform distribution over
        this interval.
    iter_range : tuple of ints
        Specifies [minimum, maximum) number of iterations used to optimize
        each model, which are sampled log-uniformly over this interval and
        constrained to be integer-valued.
    warp_iter_range : tuple of ints
        Specifies [minimum, maximum) number of inner iterations to apply to
        update the warping functions on each step of optimization. These are
        also randomly sampled log-uniformly over the specified interval.
    outfile : None or str (optional)
        If provided, data are saved after each iteration to this filename.

    Returns
    -------
    results : dict
        Dictionary holding results:

        "knots" : (n_samples,) array holding number of knots in piecewise
            linear warping function for each evaluated model.
        "smoothness" : (n_samples, n_valid_samples) array holding sampled
            regularization strengths on warping templates, penalizing
            roughness.
        "warp_reg" : (n_samples, n_valid_samples) array holding sampled
            regularization strengths on warping function distance from
            identity.
        "iterations" : (n_samples, n_valid_samples) array holding number of
            model optimization steps.
        "warp_iterations" : (n_samples, n_valid_samples) array holding number
            of inner iteration steps for fitting warping functions.
        "train_rsq" : (n_samples, n_valid_samples) array holding model
            performance on the training set.
        "valid_rsq" : (n_samples, n_valid_samples) array holding model
            performance on the validation set.
        "test_rsq" : (n_samples,) array holding model performance on the test
            set.
        "loss_hists" : (n_samples, n_valid_samples, n_iterations + 1) array
            holding the learning curves for all models. The loss is computed
            over the combined train and validation set.

    Notes
    -----
    Only implemented for quadratic loss.
    """

    # Dataset dimensions (trials x timepoints x units).
    K, T, N = binned.shape

    # Randomly draw all parameter settings for each model.
    knots = np.tile(np.arange(*knot_range), samples_per_knot)
    n_samples = len(knots)
    smoothness = _sample_log_uniform(smoothness_range, size=(n_samples, n_valid_samples))
    warp_reg = _sample_log_uniform(warpreg_range, size=(n_samples, n_valid_samples))
    iterations = _sample_log_uniform(iter_range, size=(n_samples, n_valid_samples)).astype('int')
    warp_iterations = _sample_log_uniform(warp_iter_range, size=(n_samples, n_valid_samples)).astype('int')

    # Initialize arrays to store losses.
    train_rsq = np.empty((n_samples, n_valid_samples))
    valid_rsq = np.full((n_samples, n_valid_samples), -np.inf)
    test_rsq = np.empty(n_samples)
    loss_hists = np.full((n_samples, n_valid_samples, iter_range[1]), np.nan)

    progress_bar = tqdm(total=n_samples * n_valid_samples)

    for i, j in itertools.product(range(n_samples), range(n_valid_samples)):

        # Update train - validation - test sets.
        if j == 0:
            train_units, val_units, test_units = _crossval_partition(
                N, n_train_folds, n_valid_folds, n_test_folds)
            train_trials, val_trials, test_trials = _crossval_partition(
                K, n_train_folds, n_valid_folds, n_test_folds)

        # Create model instance.
        model_kw = {
            "smoothness_reg_scale": smoothness[i, j],
            "warp_reg_scale": warp_reg[i, j]
        }
        if knots[i] == -1:
            model = ShiftWarping(**model_kw)
        else:
            model = PiecewiseWarping(n_knots=knots[i], **model_kw)

        # Fit model.
        fit_kw = {
            "verbose": False,
            "iterations": iterations[i, j],
            "warp_iterations": warp_iterations[i, j],
            "neuron_idx": train_units,
            "trial_idx": train_trials,
        }
        model.fit(binned, **fit_kw)

        # Store optimization learning curve.
        loss_hists[i, j, :(iterations[i, j] + 1)] = model.loss_hist

        # Create baseline model (simple trial average).
        baseline_pred = np.tile(
            np.mean(binned[train_trials], axis=0, keepdims=True),
            (binned.shape[0], 1, 1))

        # Record loss on training set.
        pred = model.predict()
        train_rsq[i, j] = 1 - (
            _crossval_loss(pred, binned, train_trials, train_units) /
            _crossval_loss(baseline_pred, binned, train_trials, train_units))

        # Record loss on validation set.
        valid_rsq[i, j] = 1 - (
            _crossval_loss(pred, binned, val_trials, val_units) /
            _crossval_loss(baseline_pred, binned, val_trials, val_units))

        # Save loss on test set if validation loss is optimal.
        if np.argmax(valid_rsq[i]) == j:
            test_rsq[i] = 1 - (
                _crossval_loss(pred, binned, test_trials, test_units) /
                _crossval_loss(baseline_pred, binned, test_trials, test_units))

        # Save results.
        if j == n_valid_samples - 1:
            results = {
                "knots": knots[:(i + 1)],
                "smoothness": smoothness[:(i + 1)],
                "warp_reg": warp_reg[:(i + 1)],
                "iterations": iterations[:(i + 1)],
                "warp_iterations": warp_iterations[:(i + 1)],
                "train_rsq": train_rsq[:(i + 1)],
                "valid_rsq": valid_rsq[:(i + 1)],
                "test_rsq": test_rsq[:(i + 1)],
                "loss_hists": loss_hists[:(i + 1)],
            }
            if outfile is not None:
                np.savez(outfile, **results)

        # Update progress bar.
        progress_bar.update(1)

    return results
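# Example (hedged): a minimal sketch of calling paramsearch. The binned array here
# is a randomly generated placeholder (trials x timebins x neurons) and the sample
# counts and output filename are hypothetical; real data would come from binning
# spike times as described in the docstring above.
if __name__ == '__main__':
    binned = np.random.poisson(1.0, size=(50, 100, 30)).astype(float)
    results = paramsearch(binned,
                          samples_per_knot=10,
                          n_valid_samples=5,
                          outfile='paramsearch_results.npz')
    print(results['test_rsq'])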
def train_one(
    config: Config,
    train_batches: List[Tuple],
    dev_insts: List[Instance],
    dev_batches: List[Tuple],
    model_name: str,
    test_insts: List[Instance] = None,
    test_batches: List[Tuple] = None,
    config_name: str = None,
    result_filename: str = None,
) -> NNCRF:
    model = NNCRF(config)
    model.train()
    optimizer = get_optimizer(config, model)
    lr_scheduler = SlantedTriangular(optimizer, config.num_epochs,
                                     num_steps_per_epoch=len(train_batches), ratio=16)
    epoch = config.num_epochs
    best_dev_f1 = -1
    saved_test_metrics = None
    for i in range(1, epoch + 1):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        # if config.optimizer.lower() == "sgd":
        #     optimizer = lr_decay(config, optimizer, i)
        lr_scheduler.step(epoch=i)
        for index in tqdm(np.random.permutation(len(train_batches)), f"Training epoch {i}", len(train_batches)):
            model.train()
            loss = model(*train_batches[index])
            epoch_loss += loss.item()
            # print(f"Batch loss: {loss.item()}")
            loss.backward()
            optimizer.step()
            model.zero_grad()
            lr_scheduler.step_batch()
        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" % (i, epoch_loss / len(train_batches), end_time - start_time),
              flush=True)

        model.eval()
        # metric is [precision, recall, f_score]
        dev_metrics = evaluate_model(config, model, dev_batches, "dev", dev_insts)
        if test_insts is not None:
            test_metrics = evaluate_model(config, model, test_batches, "test", test_insts)
        if dev_metrics[2] > best_dev_f1:
            print("saving the best model...")
            best_dev_f1 = dev_metrics[2]
            if test_insts is not None:
                saved_test_metrics = test_metrics
            torch.save(model.state_dict(), model_name)
            # Save the corresponding config as well.
            if config_name:
                f = open(config_name, "wb")
                pickle.dump(config, f)
                f.close()
            if result_filename:
                write_results(result_filename, test_insts)
        model.zero_grad()

    if test_insts is not None:
        print(f"The best dev F1: {best_dev_f1}")
        print(f"The corresponding test: {saved_test_metrics}")
    return model
def handle_update(args):
    config = Configuration(use_config_cache=args.config_cache)

    failed_pulls = []
    failed_clones = []

    assert_lfs_installed()

    if not args.no_config:
        logger.info("Updating orchestra configuration")
        if not git_pull(config.orchestra_dotdir):
            failed_pulls.append(f"orchestra configuration ({config.orchestra_dotdir})")

    logger.info("Updating binary archives")
    os.makedirs(config.binary_archives_dir, exist_ok=True)
    progress_bar = tqdm(config.binary_archives_remotes.items(), unit="archives")
    for name, url in progress_bar:
        binary_archive_path = os.path.join(config.binary_archives_dir, name)
        progress_bar.set_postfix_str(f"{name}")
        if os.path.exists(binary_archive_path):
            logger.debug(f"Pulling binary archive {name}")
            if not pull_binary_archive(name, config):
                failed_pulls.append(f"Binary archive {name} ({os.path.join(config.binary_archives_dir, name)})")
        else:
            logger.info(f"Trying to clone binary archive from remote {name} ({url})")
            if not clone_binary_archive(name, url, config):
                failed_clones.append(f"Binary archive {name} ({url})!")

    logger.info("Resetting ls-remote cached info")
    ls_remote_cache = os.path.join(config.cache_dir, "remote_refs_cache.json")
    if os.path.exists(ls_remote_cache):
        os.remove(ls_remote_cache)

    logger.info("Updating ls-remote cached info")
    failed_ls_remotes = config.remote_heads_cache.rebuild_cache(parallelism=args.parallelism)

    to_pull = []
    for _, component in config.components.items():
        if not component.clone:
            continue
        source_path = os.path.join(config.sources_dir, component.name)
        if not os.path.exists(source_path):
            continue
        to_pull.append(component)

    if to_pull:
        logger.info("Updating repositories")
        progress_bar = tqdm(to_pull, unit="components")
        for component in progress_bar:
            source_path = os.path.join(config.sources_dir, component.name)
            logger.debug(f"Pulling {component.name}")
            progress_bar.set_postfix_str(f"{component.name}")
            if not is_root_of_git_repo(source_path):
                failed_pulls.append(f"Repository {component.name}: Directory {source_path} is not a git repo")
                continue
            if not git_pull(source_path):
                failed_pulls.append(f"Repository {component.name}")

    if failed_pulls:
        formatted_failed_pulls = "\n".join([f"  - {repo}" for repo in failed_pulls])
        # Note: f-strings don't account for indentation, using a template is more practical
        failed_git_pull_template = dedent(
            """
            Could not git pull --ff-only the following repositories:
            {formatted_failed_pulls}

            Suggestions:
                - check your network connection
                - commit your work
                - `git pull --rebase`, to pull remote changes and apply your commits on top
                - `git push` your changes to the remotes
            """
        )
        failed_git_pull_suggestion = failed_git_pull_template.format(formatted_failed_pulls=formatted_failed_pulls)
        logger.error(failed_git_pull_suggestion)

    if failed_clones:
        formatted_failed_clones = "\n".join([f"  - {repo}" for repo in failed_clones])
        # Note: f-strings don't account for indentation, using a template is more practical
        failed_git_clone_template = dedent(
            """
            Could not clone the following repositories:
            {formatted_failed_clones}

            Suggestions:
                - check your network connection
                - check your ssh and git configuration (try manually cloning the repositories)
            """
        )
        failed_git_clone_suggestion = failed_git_clone_template.format(formatted_failed_clones=formatted_failed_clones)
        logger.error(failed_git_clone_suggestion)

    if failed_ls_remotes:
        formatted_failed_ls_remotes = "\n".join([f"  - {repo}" for repo in failed_ls_remotes])
        # Note: f-strings don't account for indentation, using a template is more practical
        failed_git_clone_template = dedent(
            """
            Could not find the following repositories in any remote:
            {formatted_failed_ls_remotes}

            You will not be able to install components that depend on them.
            """
        )
        failed_ls_remote_suggestion = failed_git_clone_template.format(
            formatted_failed_ls_remotes=formatted_failed_ls_remotes
        )
        logger.info(failed_ls_remote_suggestion)

    if failed_pulls or failed_clones or failed_ls_remotes:
        return 1
    else:
        return 0
def main():
    fic_ids, fandom, headers, restart, idlist_is_csv, only_first_chap, output_dirpath = get_args()
    os.chdir(os.getcwd())
    storycolumns = [
        'fic_id', 'title', 'author', 'author_key', 'rating', 'category',
        'fandom', 'relationship', 'character', 'additional tags', 'language',
        'published', 'status', 'status date', 'words', 'comments', 'kudos',
        'bookmarks', 'hits', 'chapter_count', 'series', 'seriespart',
        'seriesid', 'summary', 'preface_notes', 'afterword_notes'
    ]
    chaptercolumns = [
        'fic_id', 'title', 'summary', 'preface_notes', 'afterword_notes',
        'chapter_num', 'chapter_title', 'paragraph_count'
    ]
    textcolumns = ['fic_id', 'chapter_id', 'para_id', 'text']

    if not os.path.exists(workdir(output_dirpath, fandom)):
        os.mkdir(workdir(output_dirpath, fandom))
    if not os.path.exists(contentdir(output_dirpath, fandom)):
        os.mkdir(contentdir(output_dirpath, fandom))

    with open(storiescsv(output_dirpath, fandom), 'a') as f_out:
        storywriter = csv.writer(f_out)
        with open(chapterscsv(output_dirpath, fandom), 'a') as ch_out:
            chapterwriter = csv.writer(ch_out)
            with open(errorscsv(output_dirpath, fandom), 'a') as e_out:
                errorwriter = csv.writer(e_out)

                # does the csv already exist? if not, let's write a header row.
                if os.stat(storiescsv(output_dirpath, fandom)).st_size == 0:
                    print('Writing a header row for the csv.')
                    storywriter.writerow(storycolumns)
                if os.stat(chapterscsv(output_dirpath, fandom)).st_size == 0:
                    print('Writing a header row for the csv.')
                    chapterwriter.writerow(chaptercolumns)

                if idlist_is_csv:
                    csv_fname = fic_ids[0]
                    total_lines = 0

                    # Count fics remaining
                    with open(csv_fname, 'r') as f_in:
                        reader = csv.reader(f_in)
                        for row in reader:
                            if not row:
                                continue
                            total_lines += 1

                    # Scrape fics
                    with open(csv_fname, 'r+') as f_in:
                        reader = csv.reader(f_in)
                        if restart == '':
                            for row in tqdm(reader, total=total_lines, ncols=70):
                                if not row:
                                    continue
                                write_fic_to_csv(fandom, row[0], only_first_chap,
                                                 storywriter, chapterwriter, errorwriter,
                                                 storycolumns, chaptercolumns,
                                                 headers, output_dirpath,
                                                 write_whole_fics=True)
                        else:
                            found_restart = False
                            for row in tqdm(reader, total=total_lines, ncols=70):
                                if not row:
                                    continue
                                found_restart = process_id(row[0], restart, found_restart)
                                if found_restart:
                                    write_fic_to_csv(fandom, row[0], only_first_chap,
                                                     storywriter, chapterwriter, errorwriter,
                                                     storycolumns, chaptercolumns,
                                                     headers, output_dirpath=output_dirpath,
                                                     write_whole_fics=True)
                                else:
                                    print('Skipping already processed fic')
                else:
                    for fic_id in fic_ids:
                        write_fic_to_csv(fandom, fic_id, only_first_chap,
                                         storywriter, chapterwriter, errorwriter,
                                         storycolumns, chaptercolumns,
                                         headers, output_dirpath=output_dirpath,
                                         write_whole_fics=True)
from sk_dataloader_inter import DatasetCells, CellTrainData
import pandas as pd
import torch
from tqdm import tqdm
from PIL import Image
from imgaug import augmenters as iaa
import numpy as np
from skimage import exposure

data = DatasetCells(pd.read_csv("../data/GenData/train_input_ids.csv"),
                    pd.read_csv("../data/GenData/train_labels_ids.csv"),
                    pd.read_csv("../data/GenData/train_inter_ids.csv"))

print("Adjusting Exposure..")
for j, sample in tqdm(enumerate(data), total=len(data)):
    img, label, inter = sample
    new_img = exposure.adjust_gamma(img, gamma=0.4, gain=0.9)
    new_img = Image.fromarray(new_img)
    label = Image.fromarray(label)
    inter = Image.fromarray(inter)
    new_img.save("../data/GenData/TrainData/images/" + str("%04d" % j) + "_exposure_.png")
    label.save("../data/GenData/TrainData/labels/" + str("%04d" % j) + "_exposure_.png")
    inter.save("../data/GenData/TrainData/watershed/" + str("%04d" % j) + "_exposure_.png")

print("Finished..")
def evaluate_coco(test_loader, model):
    """
    Evaluate.

    :param test_loader: DataLoader for test data
    :param model: model
    """
    fig_test, ax_test = plt.subplots(figsize=(18, 15))

    # Make sure it's in eval mode
    model.eval()

    # Lists to store detected and true boxes, labels, scores
    det_boxes = list()
    det_labels = list()
    det_scores = list()
    true_boxes = list()
    true_labels = list()

    # For CoCo
    results = []

    with torch.no_grad():
        # Batches
        for i, (images, boxes, labels, index) in enumerate(tqdm(test_loader, desc='Evaluating')):
            images = images.to(device)

            # Forward prop.
            predicted_locs, predicted_scores = model(images)

            # Detect objects in SSD output
            det_boxes_batch, det_labels_batch, det_scores_batch = model.detect_objects(
                predicted_locs, predicted_scores, min_score=0.1, max_overlap=0.45, top_k=50)
            # Evaluation MUST be at min_score=0.01, max_overlap=0.45, top_k=200
            # for fair comparison with the paper's results and other repos

            # Store this batch's results for mAP calculation
            boxes = [b.to(device) for b in boxes]
            labels = [l.to(device) for l in labels]

            for box_t, label_t, score_t, ids in zip(
                    det_boxes_batch, det_labels_batch, det_scores_batch, index):
                for box, label, score in zip(box_t, label_t, score_t):
                    bb = box.cpu().numpy().tolist()
                    # if score.item() > 0.1:
                    results.append({
                        'image_id': ids.item(),
                        'category_id': label.item(),
                        'bbox': [bb[0] * input_size[1], bb[1] * input_size[0],
                                 (bb[2] - bb[0]) * input_size[1], (bb[3] - bb[1]) * input_size[0]],
                        'score': score.item()})

    rstFile = os.path.join(checkpoint_root, './COCO_TEST_det_{:s}.json'.format(checkpoint_name))
    write_result_coco(results, rstFile)

    # rstFile = os.path.join('./jobs/2019-03-26_16h07m_[SSDPed_512x640][KAISTPed_train-all-02]video_make_test_full/SSDPed_512x640_epoch_0022_det.json')

    try:
        cocoDt = cocoGt.loadRes(rstFile)
        imgIds = sorted(cocoGt.getImgIds())
        cocoEval = COCOeval(cocoGt, cocoDt, annType)
        cocoEval.params.imgIds = imgIds
        cocoEval.params.catIds = [1]
        cocoEval.evaluate(0)
        cocoEval.accumulate()
        curPerf = cocoEval.summarize(0)

        cocoEval.draw_figure(ax_test, rstFile.replace('json', 'jpg'))
        # writer.add_scalars('LAMR/fppi', {'test': curPerf}, epoch)

        print('Recall: {:}'.format(1 - cocoEval.eval['yy'][0][-1]))
    except:
        import torchcv.utils.trace_error
        print('[Error] cannot evaluate by cocoEval. ')
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 2 18:32:22 2018

@author: DC
"""
from tqdm import tqdm
import pandas as pd
import os

os.chdir("F:\\758Bdata")
sample = pd.DataFrame()
data = pd.read_csv('dentist new.csv')
text = data['text'].tolist()
text.remove(text[0])

repeat = []
for i in tqdm(range(len(text))):
    temp = text[0]
    text.remove(text[0])
    if temp in text:
        repeat.append(temp)

repeat_pd = pd.DataFrame(repeat, columns=['text'])
repeat_data = pd.merge(repeat_pd, data, how='inner', on='text')
repeat_data.drop(repeat_data.columns[1], axis=1, inplace=True)
def get_answers(self, question: str, paragraphs: list):
    answers = None
    start_time = time.time()
    dataset, examples, features = convert_format(question, paragraphs, self.tokenizer, self.args)
    print("Time taken by convert_format: ", time.time() - start_time)

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.batch_size)

    if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(self.model)
    else:
        model = self.model

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(self.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            feature_indices = batch[3]
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    temp_dir = 'temp'
    if not os.path.isdir(temp_dir):
        os.mkdir(temp_dir)
    output_prediction_file = os.path.join(temp_dir, "predictions_.json")
    output_nbest_file = os.path.join(temp_dir, "nbest_predictions_.json")
    output_null_log_odds_file = os.path.join(temp_dir, "null_odds_.json")

    # return all_results
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        self.args.n_best_size,
        self.args.max_answer_length,
        self.args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        self.args.verbose_logging,
        self.args.version_2_with_negative,
        self.args.null_score_diff_threshold,
        self.tokenizer,
    )

    return predictions[QUESTION_ID]
users = msg['users']

msg = total_batch
send_msg(s, msg)  # send total_batch of train dataset

# ## Real training process

# In[21]:

for e in range(epochs):
    for u in range(users):
        client_weights = recv_msg(s)
        ecg_client.load_state_dict(client_weights)
        ecg_client.eval()
        for i, data in enumerate(tqdm(train_loader, ncols=100, desc='Epoch ' + str(e + 1) + '_' + str(u))):
            x, label = data
            x = x.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            output = ecg_client(x)
            client_output = output.clone().detach().requires_grad_(True)
            msg = {
                'client_output': client_output,
                'label': label
            }
            send_msg(s, msg)
            client_grad = recv_msg(s)
            output.backward(client_grad)
            optimizer.step()
def heldout_transform(model, binned, data, transformed_neurons=None,
                      progress_bar=True, **fit_kw):
    """
    Transform each neuron's activity by holding it out of model fitting and
    applying warping functions fit to the remaining neurons.

    Parameters
    ----------
    model : ShiftWarping or AffineWarping instance
        Model to fit
    binned : numpy.ndarray
        Array holding binned spike times (trials x num_timebins x neurons)
    data : SpikeData instance
        Raw spike times.
    transformed_neurons (optional) : array-like or ``None``
        Indices of neurons that are transformed. If None, all neurons are
        transformed.
    fit_kw (optional) : dict
        Additional keyword arguments are passed to ``model.fit(...)``.

    Returns
    -------
    aligned_data : SpikeData instance
        Transformed version of ``data`` where each neuron/unit is
        independently aligned.

    Raises
    ------
    ValueError: If ``binned`` and ``data`` have inconsistent dimensions.

    Notes
    -----
    Since a different model is fit for each neuron, the warping functions are
    not necessarily consistent across neurons in the returned data array.
    Thus, each neuron should be considered as having its own time axis.
    """

    # broadcast keywords into dict, with model instances as keys
    fit_kw['verbose'] = False

    # data dimensions
    n_neurons = data.n_neurons
    n_trials = data.n_trials
    if (n_trials != binned.shape[0]) or (n_neurons != binned.shape[-1]):
        raise ValueError('Dimension mismatch. Binned data and spike data do '
                         'not have the same number of neurons or trials.')

    # Allocate storage for held out spike times.
    trials, spiketimes, neurons = [], [], []

    # Determine neurons to hold out and fit.
    if transformed_neurons is None:
        transformed_neurons = range(n_neurons)

    # Set up progress bar.
    if progress_bar:
        transformed_neurons = tqdm(transformed_neurons)

    # Hold out each neuron, fit models, and apply transform to heldout cell.
    for n in transformed_neurons:

        # Define training set.
        trainset = list(set(range(n_neurons)) - {n})

        # Fit model.
        model.fit(binned[:, :, trainset], **fit_kw)

        # Apply warping to test set.
        w = model.transform(data.select_neurons([n]))

        # Store result.
        trials.extend(w.trials)
        spiketimes.extend(w.spiketimes)
        neurons.extend(np.full(len(w.trials), n).tolist())

    # Package result into a SpikeData instance.
    return SpikeData(trials, spiketimes, neurons, data.tmin, data.tmax)
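# Example (hedged): a minimal sketch of calling heldout_transform. It assumes `binned`
# (a trials x timebins x neurons array) and `data` (a SpikeData instance) already exist,
# as described in the docstring above; the ShiftWarping regularization value and the
# iteration count (a fit keyword, as used in paramsearch above) are hypothetical.
model = ShiftWarping(smoothness_reg_scale=10.0)
aligned_data = heldout_transform(model, binned, data, iterations=30)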
    first_half = make_inference(I0, middle, exp=exp - 1)
    second_half = make_inference(middle, I1, exp=exp - 1)
    return [*first_half, middle, *second_half]

if args.montage:
    left = w // 4
    w = w // 2
if args.UHD:
    ph = ((h - 1) // 64 + 1) * 64
    pw = ((w - 1) // 64 + 1) * 64
else:
    ph = ((h - 1) // 32 + 1) * 32
    pw = ((w - 1) // 32 + 1) * 32
padding = (0, pw - w, 0, ph - h)
pbar = tqdm(total=tot_frame)
skip_frame = 1
if args.montage:
    lastframe = lastframe[:, left:left + w]
write_buffer = Queue(maxsize=500)
read_buffer = Queue(maxsize=500)
_thread.start_new_thread(build_read_buffer, (args, read_buffer, videogen))
_thread.start_new_thread(clear_write_buffer, (args, write_buffer))

I1 = torch.from_numpy(np.transpose(lastframe, (2, 0, 1))).to(
    device, non_blocking=True).unsqueeze(0).float() / 255.
I1 = F.pad(I1, padding)

while True:
    frame = read_buffer.get()
    if frame is None:
        break
def train_stego(*, stegoanalyser: nn.Module, train_iterator: DataBatchIterator,
                val_iterator: DataBatchIterator, text_iterator: Iterator, n_epoch: int,
                stegoanalyser_opt: Optimizer, callbacks: Sequence[Callable] = None,
                logger: TBLogger, encoder: SigmoidTorchEncoder):
    criterion = F.binary_cross_entropy_with_logits
    callbacks = callbacks or []

    for epoch in tqdm(range(n_epoch)):
        stegoanalyser_losses = []

        with train_iterator as iterator:
            for real_batch, _ in iterator:
                batch_size = len(real_batch)
                labels = np.random.choice([0, 1], (batch_size, 1, 1, 1))
                encoded_images = []
                for image, label in zip(real_batch, labels):
                    if label == 1:
                        msg = bytes_to_bits(next(text_iterator))
                        key = generate_random_key(image.shape[1:], len(msg))
                        image = encoder.encode(transform_encoder(image), msg, key)
                        image = inverse_transform_encoder(image)
                    encoded_images.append(image)

                encoded_images = torch.stack(encoded_images)
                labels = torch.from_numpy(labels).float()

                # train stegoanalyzer
                stegoanalyser_opt.zero_grad()
                stegoanalyser_losses.append(
                    process_batch(encoded_images.detach(), labels, stegoanalyser, criterion))
                stegoanalyser_opt.step()

        with val_iterator as iterator:
            accuracy = []
            for real_batch, _ in iterator:
                batch_size = len(real_batch)
                labels = np.random.choice([0, 1], batch_size)
                encoded_images = []
                for image, label in zip(real_batch, labels):
                    if label == 1:
                        msg = bytes_to_bits(next(text_iterator))
                        key = generate_random_key(image.shape[1:], len(msg))
                        image = encoder.encode(transform_encoder(image), msg, key)
                        image = inverse_transform_encoder(image)
                    encoded_images.append(image)

                encoded_images = torch.stack(encoded_images)

                # evaluate stegoanalyzer
                out = inference_step(encoded_images, stegoanalyser).cpu().detach()
                out = torch.sigmoid(out) > 0.5
                out = out.reshape(len(encoded_images)).numpy()
                accuracy_score = sklearn.metrics.accuracy_score(labels, out)
                accuracy.append(accuracy_score)

            mean_accuracy = np.mean(accuracy)
            print(f'validation accuracy score {mean_accuracy}')

        losses = {'Stegoanalyser loss': np.mean(stegoanalyser_losses), 'Val accuracy': mean_accuracy}
        logger.policies(losses, epoch)

        # run callbacks
        for callback in callbacks:
            callback(epoch)
from model_architecture_v_6 import * transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) train_data = [] val_data = [] test_data = [] with h5py.File('../split_aug_v4.h5', 'r') as gData: images = np.array(gData['train']['images']) labels = np.array(gData['train']['labels']) for i in tqdm(range(len(images))): train_data.append(tuple([transform(images[i]), labels[i]])) images = np.array(gData['val']['images']) labels = np.array(gData['val']['labels']) for i in tqdm(range(len(images))): val_data.append(tuple([transform(images[i]), labels[i]])) images = np.array(gData['test']['images']) labels = np.array(gData['test']['labels']) for i in tqdm(range(len(images))): test_data.append(tuple([transform(images[i]), labels[i]])) torch.manual_seed(300) random.shuffle(train_data) random.shuffle(val_data)
# rating for method1
r1 = Rating()
# rating for method2
r2 = Rating()
r1_result = np.zeros((ngames, 3))
r2_result = np.zeros((ngames, 3))
start1 = time.time()
boardSize = 3
# two parameters for mcts
cp = 1
N = 100
# time control for idtt
time_limit = 5
idtt_depth = 5
# start playing games
for i in tqdm.tqdm(list(range(ngames))):
    # 3 experiments using alphabeta with random or dijkstra evaluation: method can be 'random' or 'dijkstra'
    # uncomment the following code to run alphabeta VS alphabeta (random vs dijkstra, or dijkstra vs dijkstra):
    # if i%2==0:
    #     # method1 moves first
    #     result=alphabeta_randomVSdijkstra(method1='dijkstra',method2='dijkstra',depth1=3,depth2=4,size=boardSize,print_all=False,first=True)
    # else:
    #     # method2 moves first
    #     result=alphabeta_randomVSdijkstra(method1='dijkstra',method2='dijkstra',depth1=3,depth2=4,size=boardSize,print_all=False,first=False)
    #----------------------------------------------------------------------------------------------------------------------------------------------
    # experiments using idtt and alphabeta
    # uncomment to run idtt VS alphabeta (method can be 'random' or 'dijkstra')
    # if i%2==0:
    #     # method1 moves first
    #     result=idtt_alphabeta(method='dijkstra',idtt_depth=idtt_depth,depth2=3,size=boardSize,print_all=False,first=True,time_limit=time_limit)
def valid(self, dataloader, writer, ep): torch.cuda.empty_cache() self.model.eval() epoch_logs = { "gen_latent_loss": [], "gen_ref_loss": [], "disc_latent_loss": [], "disc_ref_loss": [], "style_latent_loss": [], "style_ref_loss": [], "diversity_latent_loss": [], "diversity_ref_loss": [], "cycle_latent_loss": [], "cycle_ref_loss": [] } for indx, data in tqdm(enumerate(dataloader)): img, og_domain, x1, x2, domain = data img = img.to(self.device) x1 = x1.to(self.device) x2 = x2.to(self.device) z1 = torch.normal( torch.tensor([0.5]).repeat(self.batch, self.latent_dim), 1).to(self.device) z2 = torch.normal( torch.tensor([0.5]).repeat(self.batch, self.latent_dim), 1).to(self.device) disc_loss, gen_loss, style_loss, diversity_loss, cycle_loss = self.model( img, domain, og_domain, z=(z1, z2), train=False) disc_loss2, gen_loss2, style_loss2, diversity_loss2, cycle_loss2 = self.model( img, domain, og_domain, x=(x1, x2), train=False) epoch_logs["gen_latent_loss"].append((gen_loss).item()) epoch_logs["gen_ref_loss"].append((gen_loss2).item()) epoch_logs["disc_latent_loss"].append((disc_loss).item()) epoch_logs["disc_ref_loss"].append((disc_loss2).item()) epoch_logs["style_latent_loss"].append((style_loss).item()) epoch_logs["style_ref_loss"].append((style_loss2).item()) epoch_logs["diversity_latent_loss"].append((diversity_loss).item()) epoch_logs["diversity_ref_loss"].append((diversity_loss2).item()) epoch_logs["cycle_latent_loss"].append((cycle_loss).item()) epoch_logs["cycle_ref_loss"].append((cycle_loss2).item()) for x, y in epoch_logs.items(): writer.add_scalar(f"val/{x}", np.array(y[-1:]), ep + indx) epoch_logs["gen_latent_loss"] = np.mean(epoch_logs["gen_latent_loss"]) epoch_logs["gen_ref_loss"] = np.mean(epoch_logs["gen_ref_loss"]) epoch_logs["disc_latent_loss"] = np.mean( epoch_logs["disc_latent_loss"]) epoch_logs["disc_ref_loss"] = np.mean(epoch_logs["disc_ref_loss"]) epoch_logs["style_latent_loss"] = np.mean( epoch_logs["style_latent_loss"]) epoch_logs["style_ref_loss"] = np.mean(epoch_logs["style_ref_loss"]) epoch_logs["diversity_latent_loss"] = np.mean( epoch_logs["diversity_latent_loss"]) epoch_logs["diversity_ref_loss"] = np.mean( epoch_logs["diversity_ref_loss"]) epoch_logs["cycle_latent_loss"] = np.mean( epoch_logs["cycle_latent_loss"]) epoch_logs["cycle_ref_loss"] = np.mean(epoch_logs["cycle_ref_loss"]) return epoch_logs
import json from tqdm import tqdm df = pd.read_excel('mapping_parsed_final_test_tableau.xlsx') #df = df.drop(['AUM'], axis =1) #print(df.head()) file = open('apikey.txt', 'r') api = file.read() lat = [] long = [] print('Geocoding locations..................................') for row in tqdm(range(0, df.shape[0])): response = urllib.request.urlopen( 'https://maps.googleapis.com/maps/api/geocode/json?address=' + df.iloc[row]['Area'].replace(' ', '+') + ',+' + df.iloc[row]['District'].replace(' ', '+') + ',' + df.iloc[row]['Region'].replace(' ', '+') + ',+' + 'Hong+Kong' + '&key=' + api) geocode_result = json.load(response) x = geocode_result['results'][0]['geometry']['location']['lat'] y = geocode_result['results'][0]['geometry']['location']['lng'] lat.append(x) long.append(y) print('All coordinates successfully generated via geocoding') lat = pd.Series(lat)
def run_experiment(self): """ Runs experiment train and evaluation iterations, saving the model and best val model and val model accuracy after each epoch :return: The summary current_epoch_losses from starting epoch to total_epochs. """ total_losses = { "train_acc": [], "train_loss": [], "val_acc": [], "val_loss": [] } # initialize a dict to keep the per-epoch metrics for i, epoch_idx in enumerate( range(self.starting_epoch, self.num_epochs)): epoch_start_time = time.time() current_epoch_losses = { "train_acc": [], "train_loss": [], "val_acc": [], "val_loss": [] } self.current_epoch = epoch_idx with tqdm.tqdm( total=len(self.train_data) ) as pbar_train: # create a progress bar for training for idx, (x, y) in enumerate(self.train_data): # get data batches loss, accuracy = self.run_train_iter( x=x, y=y) # take a training iter step current_epoch_losses["train_loss"].append( loss) # add current iter loss to the train loss list current_epoch_losses["train_acc"].append( accuracy) # add current iter acc to the train acc list pbar_train.update(1) pbar_train.set_description( "loss: {:.4f}, accuracy: {:.4f}".format( loss, accuracy)) with tqdm.tqdm( total=len(self.val_data) ) as pbar_val: # create a progress bar for validation for x, y in self.val_data: # get data batches loss, accuracy = self.run_evaluation_iter( x=x, y=y) # run a validation iter current_epoch_losses["val_loss"].append( loss) # add current iter loss to val loss list. current_epoch_losses["val_acc"].append( accuracy) # add current iter acc to val acc lst. pbar_val.update(1) # add 1 step to the progress bar pbar_val.set_description( "loss: {:.4f}, accuracy: {:.4f}".format( loss, accuracy)) val_mean_accuracy = np.mean(current_epoch_losses['val_acc']) if val_mean_accuracy > self.best_val_model_acc: # if current epoch's mean val acc is greater than the saved best val acc then self.best_val_model_acc = val_mean_accuracy # set the best val model acc to be current epoch's val accuracy self.best_val_model_idx = epoch_idx # set the experiment-wise best val idx to be the current epoch's idx for key, value in current_epoch_losses.items(): total_losses[key].append( np.mean(value) ) # get mean of all metrics of current epoch metrics dict, to get them ready for storage and output on the terminal. save_statistics(experiment_log_dir=self.experiment_logs, filename='summary.csv', stats_dict=total_losses, current_epoch=i, continue_from_mode=True if (self.starting_epoch != 0 or i > 0) else False) # save statistics to stats file. 
# load_statistics(experiment_log_dir=self.experiment_logs, filename='summary.csv') # How to load a csv file if you need to out_string = "_".join([ "{}_{:.4f}".format(key, np.mean(value)) for key, value in current_epoch_losses.items() ]) # create a string to use to report our epoch metrics epoch_elapsed_time = time.time( ) - epoch_start_time # calculate time taken for epoch epoch_elapsed_time = "{:.4f}".format(epoch_elapsed_time) print("Epoch {}:".format(epoch_idx), out_string, "epoch time", epoch_elapsed_time, "seconds") self.state['model_epoch'] = epoch_idx self.save_model( model_save_dir=self.experiment_saved_models, # save model and best val idx and best val acc, using the model dir, model name and model idx model_save_name="train_model", model_idx=epoch_idx, best_validation_model_idx=self.best_val_model_idx, best_validation_model_acc=self.best_val_model_acc) self.save_model( model_save_dir=self.experiment_saved_models, # save model and best val idx and best val acc, using the model dir, model name and model idx model_save_name="train_model", model_idx='latest', best_validation_model_idx=self.best_val_model_idx, best_validation_model_acc=self.best_val_model_acc) ################################################################ ##### Plot Gradient Flow at each Epoch during Training ###### print( "Generating Gradient Flow Plot at epoch {}".format(epoch_idx)) plt = self.plot_grad_flow(self.model.named_parameters()) if not os.path.exists( os.path.join(self.experiment_saved_models, 'gradient_flow_plots')): os.mkdir( os.path.join(self.experiment_saved_models, 'gradient_flow_plots')) # plt.legend(loc="best") plt.savefig( os.path.join(self.experiment_saved_models, 'gradient_flow_plots', "epoch{}.pdf".format(str(epoch_idx)))) ################################################################ print("Generating test set evaluation metrics") self.load_model( model_save_dir=self.experiment_saved_models, model_idx=self.best_val_model_idx, # load best validation model model_save_name="train_model") current_epoch_losses = { "test_acc": [], "test_loss": [] } # initialize a statistics dict with tqdm.tqdm( total=len(self.test_data)) as pbar_test: # ini a progress bar for x, y in self.test_data: # sample batch loss, accuracy = self.run_evaluation_iter( x=x, y=y ) # compute loss and accuracy by running an evaluation step current_epoch_losses["test_loss"].append( loss) # save test loss current_epoch_losses["test_acc"].append( accuracy) # save test accuracy pbar_test.update(1) # update progress bar status pbar_test.set_description( "loss: {:.4f}, accuracy: {:.4f}".format( loss, accuracy)) # update progress bar string output test_losses = { key: [np.mean(value)] for key, value in current_epoch_losses.items() } # save test set metrics in dict format save_statistics( experiment_log_dir=self.experiment_logs, filename='test_summary.csv', # save test set metrics on disk in .csv format stats_dict=test_losses, current_epoch=0, continue_from_mode=False) return total_losses, test_losses
    else:
        print("wrong path or files")
    return picture

wsi_set = get_picture(wsi_path)
csvfile = open('normal_bounding_boxes_in_tumor_wsi.csv', 'w')
fieldnames = ['wsi', 'bounding_boxes', 'patch_index']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
wsioptions = WSIOps()
total_index = 0
for i in tqdm(range(len(wsi_set))):
    #for i in tqdm(range(len(wsi_set))):
    wsi_mask, mask_image = wsioptions.read_wsi_mask(base_path + mask_path + (os.path.splitext(wsi_set[i]))[0] + "_Mask.tif")
    wsi_image, rgb_image, _, _, _ = wsioptions.read_wsi_tumor(base_path + wsi_path + wsi_set[i],
                                                              base_path + mask_path + (os.path.splitext(wsi_set[i]))[0] + "_Mask.tif")
    bounding_boxes, rgb_contour, image_open = wsioptions.find_roi_bbox(rgb_image)
    #bounding_boxes = wsioptions.find_roi_bbox_tumor_gt_mask(mask_image)
    print('%s bounding boxes' % os.path.splitext(wsi_set[i])[0], bounding_boxes)
    # writer.writerow({'wsi':os.path.splitext(wsi_set[i])[0], 'bounding_boxes':bounding_boxes})
    # print('saved successfully!!')
    level_used = wsi_mask.level_count - 1
    patchex = PatchExtractor()
    patch_index = patchex.extract_negative_patches_from_tumor_wsi(wsi_image, mask_image, image_open, level_used,
                                                                  bounding_boxes,
                                                                  patch_save_dir='normal_patches/',
                                                                  patch_prefix=(os.path.splitext(wsi_set[i]))[0] + '_',
                                                                  patch_index=0)
    print('patches so far:', total_index)
    print('newly generated patches:', patch_index)
def simple_use_plan_06(kwargs) -> AllocationPlan:
    atcs = kwargs["atcs"]
    devices = kwargs["ds"]
    max_hop = kwargs["max_hop"]
    congestion_scope = kwargs["congestion_scope"]
    t_len = len(atcs)
    y_len = len(atcs[0])
    x_len = len(atcs[0][0])
    allocation_plan = create_blank_allocation_plan(atcs, devices)
    for time, cloudlets in enumerate(tqdm(atcs)):
        ds = list(filter(lambda d: d.is_poweron(time), devices))
        reqapp = ["1", "2", "3"]

        # split the devices by requested app
        dsa1 = list(filter(lambda d: d.appret(reqapp[0]), ds))
        dsa2 = list(filter(lambda d: d.appret(reqapp[1]), ds))
        dsa3 = list(filter(lambda d: d.appret(reqapp[2]), ds))

        # determine the apps in descending order of request count
        ds_app_len = [len(dsa1), len(dsa2), len(dsa3)]
        max_len = [0, 0, 0]
        for i in range(0, 3):
            max_len[i] = str(np.argmax(ds_app_len))
            ds_app_len[int(max_len[i])] = 0

        # measure the congestion for each app
        congestion_mapa1 = simple_create_congestion_map(time, x_len, y_len, dsa1, congestion_scope)
        congestion_mapa2 = simple_create_congestion_map(time, x_len, y_len, dsa2, congestion_scope)
        congestion_mapa3 = simple_create_congestion_map(time, x_len, y_len, dsa3, congestion_scope)

        # set device priorities from each app's congestion and the available cloudlets nearby
        set_app_pri(atcs, congestion_mapa1, time, dsa1, x_len, y_len, max_len, congestion_scope)
        set_app_pri(atcs, congestion_mapa2, time, dsa2, x_len, y_len, max_len, congestion_scope)
        set_app_pri(atcs, congestion_mapa3, time, dsa3, x_len, y_len, max_len, congestion_scope)

        ds = sorted(ds, key=lambda d: d.ds_pri, reverse=False)
        for d in ds:
            # step1: check whether keeping the previous placement is still appropriate and, if so, try it
            now_pos = d.get_pos(time)
            if d.startup_time != time:
                prev_pos = d.get_allocation_point(time - 1)
                if distance(prev_pos, now_pos) <= max_hop:
                    # try to place the device at the same location as before
                    if cloudlets[prev_pos.y][prev_pos.x].can_append_device(d, True):
                        allocate(d, time, prev_pos, allocation_plan, cloudlets)
                        continue
            # step2
            if d.is_poweron(time + max_hop):
                next_pos = d.get_pos(time + max_hop)
            else:
                next_pos = d.get_pos(d.shutdown_time - 1)
            for hop in range(max_hop, 30):
                nps = near_points(now_pos, hop, Point(x_len - 1, y_len - 1), Point(0, 0))
                nps = sorted(nps, key=lambda p: distance(p, next_pos))
                tp, index = search(nps, True, key=lambda p: cloudlets[p.y][p.x].can_append_device(d, True))
                if index == -1:
                    continue
                allocate(d, time, tp, allocation_plan, cloudlets)
                break
            else:
                # the device could not be allocated anywhere
                allocation = Allocation(now_pos.x, now_pos.y, -1)
                allocation_plan[d.name][time] = allocation
                d.set_allocation_point(time, allocation)
                print("allocation failed", d.name, time)
    return allocation_plan
def train(self, dataloader, writer, ep): torch.cuda.empty_cache() self.model.train() epoch_logs = { "gen_latent_loss": [], "gen_ref_loss": [], "disc_latent_loss": [], "disc_ref_loss": [], "style_latent_loss": [], "style_ref_loss": [], "diversity_latent_loss": [], "diversity_ref_loss": [], "cycle_latent_loss": [], "cycle_ref_loss": [] } for indx, data in tqdm(enumerate(dataloader)): img, og_domain, x1, x2, domain = data img = img.to(self.device) x1 = x1 x2 = x2 z1 = torch.normal( torch.tensor([0.5]).repeat(self.batch, self.latent_dim), 1) z2 = torch.normal( torch.tensor([0.5]).repeat(self.batch, self.latent_dim), 1) disc_loss, gen_loss, style_loss, diversity_loss, cycle_loss = self.model( img, domain, og_domain, z=(z1, z2)) self.model.dsc_optim.zero_grad() disc_loss.backward(retain_graph=True) self.model.map_optim.zero_grad() self.model.style_optim.zero_grad() self.model.gen_optim.zero_grad() gen_loss.backward() self.model.dsc_optim.step() self.model.gen_optim.step() self.model.map_optim.step() self.model.style_optim.step() disc_loss2, gen_loss2, style_loss2, diversity_loss2, cycle_loss2 = self.model( img, domain, og_domain, x=(x1, x2)) self.model.dsc_optim.zero_grad() disc_loss2.backward(retain_graph=True) self.model.gen_optim.zero_grad() gen_loss2.backward() self.model.dsc_optim.step() self.model.gen_optim.step() epoch_logs["gen_latent_loss"].append((gen_loss).item()) epoch_logs["gen_ref_loss"].append((gen_loss2).item()) epoch_logs["disc_latent_loss"].append((disc_loss).item()) epoch_logs["disc_ref_loss"].append((disc_loss2).item()) epoch_logs["style_latent_loss"].append((style_loss).item()) epoch_logs["style_ref_loss"].append((style_loss2).item()) epoch_logs["diversity_latent_loss"].append((diversity_loss).item()) epoch_logs["diversity_ref_loss"].append((diversity_loss2).item()) epoch_logs["cycle_latent_loss"].append((cycle_loss).item()) epoch_logs["cycle_ref_loss"].append((cycle_loss2).item()) for x, y in epoch_logs.items(): writer.add_scalar(f"train/{x}", np.array(y[-1:]), ep + indx) epoch_logs["gen_latent_loss"] = np.mean(epoch_logs["gen_latent_loss"]) epoch_logs["gen_ref_loss"] = np.mean(epoch_logs["gen_ref_loss"]) epoch_logs["disc_latent_loss"] = np.mean( epoch_logs["disc_latent_loss"]) epoch_logs["disc_ref_loss"] = np.mean(epoch_logs["disc_ref_loss"]) epoch_logs["style_latent_loss"] = np.mean( epoch_logs["style_latent_loss"]) epoch_logs["style_ref_loss"] = np.mean(epoch_logs["style_ref_loss"]) epoch_logs["diversity_latent_loss"] = np.mean( epoch_logs["diversity_latent_loss"]) epoch_logs["diversity_ref_loss"] = np.mean( epoch_logs["diversity_ref_loss"]) epoch_logs["cycle_latent_loss"] = np.mean( epoch_logs["cycle_latent_loss"]) epoch_logs["cycle_ref_loss"] = np.mean(epoch_logs["cycle_ref_loss"]) return epoch_logs
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--text_a', type=str, default='', help="input text_a.") parser.add_argument('--text_b', type=str, default='', help="input text_b.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)) model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels = num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) 
elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: #model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) # Load a trained model and config that you have fine-tuned print('for eval only......................') output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): #eval_examples = processor.get_dev_examples(args.data_dir) #eval_examples = {'text_a':"He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .",'text_b':"The foodservice pie business does not fit our long-term growth strategy .",'label':'1','guid':'12345'} eval_examples = {'text_a':args.text_a,'text_b':args.text_b,'label':'1','guid':'1234'} print(eval_examples) #import pdb;pdb.set_trace() eval_features = convert_examples_to_features_pred(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with 
torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() # convert logits to probability logits_prob = np.exp(logits)/(1+np.exp(logits)) print("================================") print("label is : {}".format(np.argmax(logits,axis=1))) print("confidence score : {}".format(np.max(logits_prob,axis=1))) print("================================") sys.exit(1) label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss/nb_tr_steps if args.do_train else None result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
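# Hedged alternative sketch: for a two-logit classification head, a row-wise softmax
# yields class probabilities that sum to 1, whereas the element-wise sigmoid used above
# treats each logit independently. The names below are illustrative only.
import numpy as np

def softmax_probs(logits: np.ndarray) -> np.ndarray:
    # Numerically stable softmax over the class dimension.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=1, keepdims=True)

# Example usage on a (batch x num_labels) logits array:
# probs = softmax_probs(logits); labels = probs.argmax(axis=1); confidence = probs.max(axis=1)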
def main():
    cut_off = 0
    parser = ArgumentParser()
    parser.add_argument(
        "--scrape-file-loc",
        help="location of scrape file to be split into sentences",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--output-folder", help="location of output folder", type=str, required=True
    )
    parser.add_argument(
        "--lang",
        help="language code :\
            'Kannada':'kn',\
            'Tamil':'ta',\
            'Marathi':'mr',\
            'Telugu':'te',\
            'Bengali':'bn',\
            'Gujarati':'gu',\
            'Malayalam':'ml',\
            'Punjabi':'pa',\
            'Assamese':'asm',\
            'Odia':'or',\
            'Urdu':'ur'",
        type=str,
        required=True,
    )
    args = parser.parse_args()
    lang = args.lang
    look_up_dict = {
        "English": "en",
        "Hindi": "hi",
        "Kannada": "kn",
        "Tamil": "ta",
        "Marathi": "mr",
        "Telugu": "te",
        "Bengali": "bn",
        "Gujarati": "gu",
        "Malayalam": "ml",
        "Punjabi": "pa",
        "Assamese": "asm",
        "Odia": "or",
        "Urdu": "ur",
    }
    # Note: Inverting the above dictionary
    look_up_dict = {v: k for k, v in look_up_dict.items()}
    if lang in look_up_dict.keys():
        scrape_loc = args.scrape_file_loc
        csv_file_loc = (
            args.output_folder
        )  # list_fl = '_'.join([look_up_dict[lang],n_month,n_year]) + '.csv'
        tokenize_loc = csv_file_loc + "//" + "tokenize_file_" + os.path.basename(os.path.normpath(scrape_loc))
        # submit_aligner = csv_file_loc + '\\' + 'submit_aligner'
        if not os.path.exists(scrape_loc):
            print(f"Path doesn't exist: {scrape_loc}")
            return
        create_directory(csv_file_loc)
        create_directory(tokenize_loc)
    else:
        print("Please enter the correct language code")
        return
    total_sen_pd = pd.DataFrame(columns=[look_up_dict[lang] + "_sen"])
    fl_list = sorted(glob.glob(os.path.join(scrape_loc, "*.txt")))
    fl_list_rename = [
        os.path.join(
            scrape_loc,
            "_".join(
                [
                    os.path.basename(i).split(".")[0].split()[0].zfill(5),
                    *os.path.basename(i).split(".")[0].split()[1:6],
                ]
            ),
        )
        + "."
        + os.path.basename(i).split(".")[-1]
        for i in fl_list
    ]
    for org, chg in zip(fl_list, fl_list_rename):
        os.rename(org, chg)
    fl_list = sorted(glob.glob(os.path.join(scrape_loc, "*.txt")))
    old_count = 0
    for k, fl in tqdm(enumerate(fl_list), total=len(fl_list)):
        if k < cut_off:
            continue
        # print(os.path.basename(fl))
        # Read scrape content
        tok_flname = tokenize_loc + "//tok_" + os.path.basename(fl)
        with open(fl, mode="r", encoding="utf-16") as file_r:
            content = file_r.read()
        # print(content)
        # Cleaning scrape content
        paragraph = content.split("\n")
        content = []
        for para in paragraph:
            para = para.strip()
            para = " ".join(para.split())
            if len(para.split()) >= 4:
                if lang == "en":
                    content.append(para)
                else:
                    try:
                        if detect(para) != "en":
                            content.append(para)
                    except:
                        content.append(para)
        # Tokenizing paragraphs into sentences
        sentences = []
        if lang != 'en':
            for entry in content:
                [sentences.append(tok_sen) for tok_sen in sentence_split(entry, lang)]
        else:
            for entry in content:
                [sentences.append(tok_sen) for tok_sen in sent_tokenize(entry)]
        # Removing duplicates
        dump_1 = (
            pd.DataFrame(sentences, columns=["sen"])
            .drop_duplicates()
            .loc[:, "sen"]
            .values.tolist()
        )
        sentences = dump_1
        # Write sentence tokens
        with open(tok_flname, mode="w", encoding="utf-16") as file_w:
            for sen in sentences:
                sen = sen.strip()
                sen = sen.strip('"')
                if len(sen.split()) >= 4:
                    file_w.write(sen + "\n")
                    total_sen_pd = total_sen_pd.append(
                        {look_up_dict[lang] + "_sen": sen.strip()}, ignore_index=True
                    )
        # print(f'Number of sentences found: {total_sen_pd.shape[0]-old_count}')
        old_count = total_sen_pd.shape[0]
    print(f"Total number of sentences found: {total_sen_pd.shape[0]}")
    total_sen_pd.drop_duplicates(inplace=True)
    print(
        f"Total number of sentences after removing duplicates: {total_sen_pd.shape[0]}"
    )
sys.stdout.flush() total_sen_pd.to_csv( csv_file_loc + "//" + "total_" + lang + "_sen_" + os.path.basename(os.path.normpath(scrape_loc)) + ".csv", index=False, encoding="utf-16", ) with open( csv_file_loc + "//" + "total_" + lang + "_sen_" + os.path.basename(os.path.normpath(scrape_loc)) + ".txt", mode="w", encoding="utf-16", ) as write_total: for line in total_sen_pd[look_up_dict[lang] + "_sen"].values.tolist(): write_total.write(line.strip() + "\n")
template = cv2.imread(template_path, cv2.IMREAD_UNCHANGED).astype( np.float32) / 255.0 template_mask = get_mask_from_image(template) category = os.path.splitext(template_name)[0] templates.append((template, template_mask, category)) MAX_PROCESS = args.max_process processes = [] load_path = args.random_data binary_annotation_lines_queue = mp.Queue() multiclass_annotation_lines_queue = mp.Queue() binary_annotation_lines = [] multiclass_annotation_lines = [] nb_imgs_generated = 0 t0_temp = time.time() pbar = tqdm(total=args.total_images) for idx, img_name in enumerate(img_names): p = mp.Process(target=generate_sample, args=(( targets_path, img_name, templates, probabilities_vector, positions_list, images_out_path, idx, binary_annotation_lines_queue, multiclass_annotation_lines_queue, data_out_path, load_path))) p.daemon = True p.start() processes.append(p) if len(processes) == MAX_PROCESS: for p in processes: p.join() pbar.update(1) binary_annotation_lines += binary_annotation_lines_queue.get() multiclass_annotation_lines += multiclass_annotation_lines_queue.get() nb_imgs_generated += 1
def simple_use_plan_02(kwargs) -> AllocationPlan:
    atcs = kwargs["atcs"]
    devices = kwargs["ds"]
    max_hop = kwargs["max_hop"]
    congestion_scope = kwargs["congestion_scope"]
    reqapp = ["1", "2", "3"]
    t_len = len(atcs)
    y_len = len(atcs[0])
    x_len = len(atcs[0][0])
    # create an empty allocation plan for the cloudlets
    allocation_plan = create_blank_allocation_plan(atcs, devices)
    for time, cloudlets in enumerate(tqdm(atcs)):
        # collect the devices that are still powered on
        ds = list(filter(lambda d: d.is_poweron(time), devices))
        #ds = sorted(ds,lambda d: d.app_name,reversed=True)
        dsa1 = list(filter(lambda d: d.appret(reqapp[0]), ds))
        dsa2 = list(filter(lambda d: d.appret(reqapp[1]), ds))
        dsa3 = list(filter(lambda d: d.appret(reqapp[2]), ds))
        a1_len = len(dsa1)
        a2_len = len(dsa2)
        a3_len = len(dsa3)
        # create the congestion map for each app
        congestion_mapa1 = simple_create_congestion_map(time, x_len, y_len, dsa1, congestion_scope)
        congestion_mapa2 = simple_create_congestion_map(time, x_len, y_len, dsa2, congestion_scope)
        congestion_mapa3 = simple_create_congestion_map(time, x_len, y_len, dsa3, congestion_scope)
        # print_congestion(congestion_map, x_len, y_len)
        dsa1 = sorted(dsa1, key=lambda d: congestion_mapa1[d.get_pos(time).y][d.get_pos(time).x], reverse=True)
        dsa2 = sorted(dsa2, key=lambda d: congestion_mapa2[d.get_pos(time).y][d.get_pos(time).x], reverse=True)
        dsa3 = sorted(dsa3, key=lambda d: congestion_mapa3[d.get_pos(time).y][d.get_pos(time).x], reverse=True)
        # if(a1_len > a2_len):
        #     if(a1_len > a3_len):
        #         if(a2_len > a3_len):
        #             ds = dsa2 + dsa1 + dsa3
        #         else:
        #             ds = dsa3 + dsa1 + dsa2
        #     else:
        #         ds = dsa1 + dsa3 + dsa2
        # else:
        #     if(a1_len > a3_len):
        #         ds = dsa1 + dsa2 + dsa3
        #     else:
        #         if(a2_len > a3_len):
        #             ds = dsa2 + dsa3 + dsa1
        #         else:
        #             ds = dsa3 + dsa2 + dsa1
        #ds = dsa2 + dsa1 + dsa3
        ds = dsa1 + dsa3 + dsa2
        # congestion_map = simple_create_congestion_map(time, x_len, y_len, ds, congestion_scope)
        # ds_high = list(filter(lambda d: congestion_map[d.get_pos(time).y][d.get_pos(time).x] > 3), ds)
        # ds_low = list(filter(lambda d: congestion_map[d.get_pos(time).y][d.get_pos(time).x] < 3), ds)
        # ds = ds_high + ds_low
        for d in ds:
            # step1: check whether keeping the previous placement is still appropriate and, if so, try it
            now_pos = d.get_pos(time)
            if d.startup_time != time:
                prev_pos = d.get_allocation_point(time - 1)
                if distance(prev_pos, now_pos) <= max_hop:
                    # try to place the device at the same location as before
                    if cloudlets[prev_pos.y][prev_pos.x].can_append_device(d, True):
                        allocate(d, time, prev_pos, allocation_plan, cloudlets)
                        continue
            # step2
            if d.is_poweron(time + max_hop):
                next_pos = d.get_pos(time + max_hop)
            else:
                next_pos = d.get_pos(d.shutdown_time - 1)
            for hop in range(max_hop, 30):
                nps = near_points(now_pos, hop, Point(x_len - 1, y_len - 1), Point(0, 0))
                nps = sorted(nps, key=lambda p: distance(p, next_pos))
                tp, index = search(nps, True, key=lambda p: cloudlets[p.y][p.x].can_append_device(d, True))
                if index == -1:
                    continue
                allocate(d, time, tp, allocation_plan, cloudlets)
                break
            else:
                # the device could not be allocated anywhere
                allocation = Allocation(now_pos.x, now_pos.y, -1)
                allocation_plan[d.name][time] = allocation
                d.set_allocation_point(time, allocation)
                print("allocation failed", d.name, time)
    return allocation_plan
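# Minimal, self-contained sketch of the placement strategy shared by the plans above:
# keep the previous cloudlet when it is within max_hop of the device's current position
# and still has capacity, otherwise scan rings of increasing hop distance and take the
# first free point closest to the device's expected future position. Point, manhattan,
# and ring_points below are simplified stand-ins for the project's own Point/distance/
# near_points helpers, not the real implementations.
from collections import namedtuple

Point = namedtuple("Point", ["x", "y"])

def manhattan(a, b):
    return abs(a.x - b.x) + abs(a.y - b.y)

def ring_points(center, hop, width, height):
    # All grid points exactly `hop` steps away (Chebyshev ring), clipped to the grid.
    for dy in range(-hop, hop + 1):
        for dx in range(-hop, hop + 1):
            if max(abs(dx), abs(dy)) != hop:
                continue
            x, y = center.x + dx, center.y + dy
            if 0 <= x < width and 0 <= y < height:
                yield Point(x, y)

def place(now_pos, prev_pos, next_pos, max_hop, width, height, is_free):
    # Step 1: keep the previous placement when it is close enough and still free.
    if prev_pos is not None and manhattan(prev_pos, now_pos) <= max_hop and is_free(prev_pos):
        return prev_pos
    # Step 2: widen the search ring by ring, preferring points near the future position.
    for hop in range(max_hop, 30):
        candidates = sorted(ring_points(now_pos, hop, width, height),
                            key=lambda p: manhattan(p, next_pos))
        for p in candidates:
            if is_free(p):
                return p
    return None  # allocation failed

# Example: place(Point(5, 5), Point(4, 5), Point(8, 5), max_hop=2, width=20, height=20,
#                is_free=lambda p: p.x % 2 == 0)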
def kmean_anchors(path='./data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True): """ Creates kmeans-evolved anchors from training dataset Arguments: path: path to dataset *.yaml, or a loaded dataset n: number of anchors img_size: image size used for training thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0 gen: generations to evolve anchors using genetic algorithm verbose: print all results Return: k: kmeans evolved anchors Usage: from utils.autoanchor import *; _ = kmean_anchors() """ thr = 1. / thr prefix = colorstr('autoanchor: ') def metric(k, wh): # compute metrics r = wh[:, None] / k[None] x = torch.min(r, 1. / r).min(2)[0] # ratio metric # x = wh_iou(wh, torch.tensor(k)) # iou metric return x, x.max(1)[0] # x, best_x def anchor_fitness(k): # mutation fitness _, best = metric(torch.tensor(k, dtype=torch.float32), wh) return (best * (best > thr).float()).mean() # fitness def print_results(k): k = k[np.argsort(k.prod(1))] # sort small to large x, best = metric(k, wh0) bpr, aat = (best > thr).float().mean(), ( x > thr).float().mean() * n # best possible recall, anch > thr print( f'{prefix}thr={thr:.2f}: {bpr:.4f} best possible recall, {aat:.2f} anchors past thr' ) print( f'{prefix}n={n}, img_size={img_size}, metric_all={x.mean():.3f}/{best.mean():.3f}-mean/best, ' f'past_thr={x[x > thr].mean():.3f}-mean: ', end='') for i, x in enumerate(k): print('%i,%i' % (round(x[0]), round(x[1])), end=', ' if i < len(k) - 1 else '\n') # use in *.cfg return k if isinstance(path, str): # *.yaml file with open(path) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) # model dict from utils.datasets import LoadImagesAndLabels dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True) else: dataset = path # dataset # Get label wh shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True) wh0 = np.concatenate( [l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)]) # wh # Filter i = (wh0 < 3.0).any(1).sum() if i: print( f'{prefix}WARNING: Extremely small objects found. {i} of {len(wh0)} labels are < 3 pixels in size.' 
) wh = wh0[(wh0 >= 2.0).any(1)] # filter > 2 pixels # wh = wh * (np.random.rand(wh.shape[0], 1) * 0.9 + 0.1) # multiply by random scale 0-1 # Kmeans calculation print(f'{prefix}Running kmeans for {n} anchors on {len(wh)} points...') s = wh.std(0) # sigmas for whitening k, dist = kmeans(wh / s, n, iter=30) # points, mean distance k *= s wh = torch.tensor(wh, dtype=torch.float32) # filtered wh0 = torch.tensor(wh0, dtype=torch.float32) # unfiltered k = print_results(k) # Plot # k, d = [None] * 20, [None] * 20 # for i in tqdm(range(1, 21)): # k[i-1], d[i-1] = kmeans(wh / s, i) # points, mean distance # fig, ax = plt.subplots(1, 2, figsize=(14, 7), tight_layout=True) # ax = ax.ravel() # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.') # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # plot wh # ax[0].hist(wh[wh[:, 0]<100, 0],400) # ax[1].hist(wh[wh[:, 1]<100, 1],400) # fig.savefig('wh.png', dpi=200) # Evolve npr = np.random f, sh, mp, s = anchor_fitness( k), k.shape, 0.9, 0.1 # fitness, generations, mutation prob, sigma pbar = tqdm(range(gen), desc=f'{prefix}Evolving anchors with Genetic Algorithm:' ) # progress bar for _ in pbar: v = np.ones(sh) while (v == 1 ).all(): # mutate until a change occurs (prevent duplicates) v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0) kg = (k.copy() * v).clip(min=2.0) fg = anchor_fitness(kg) if fg > f: f, k = fg, kg.copy() pbar.desc = f'{prefix}Evolving anchors with Genetic Algorithm: fitness = {f:.4f}' if verbose: print_results(k) return print_results(k)
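# Hedged worked example of the width/height ratio metric that kmean_anchors uses above:
# for each label box and each anchor, min(r, 1/r) is computed per dimension, the worse
# of the two dimensions is kept, and a label counts toward the best possible recall when
# its best anchor exceeds 1/thr. The boxes and anchors below are illustrative only.
import numpy as np

wh = np.array([[30.0, 60.0], [100.0, 90.0]])   # label widths/heights
k = np.array([[25.0, 50.0], [120.0, 100.0]])   # candidate anchors
thr = 4.0

r = wh[:, None] / k[None]                      # (labels x anchors x 2) ratios
x = np.minimum(r, 1.0 / r).min(axis=2)         # worst-dimension ratio per pair
best = x.max(axis=1)                           # best anchor ratio per label
bpr = (best > 1.0 / thr).mean()                # best possible recall
print(x.round(3), best.round(3), bpr)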
hdf5_path = "/home/mil/gupta/ifood18/data/h5data/test_data.h5py" # open a hdf5 file and create earrays hdf5_file = h5py.File(hdf5_path, mode='w') hdf5_file.create_dataset("data", test_shape, np.float32) hdf5_file.create_dataset("mean", test_shape[1:], np.float32) #hdf5_file.create_dataset("labels", (len(test_data),), np.int32) #hdf5_file["labels"][...] = val_labels from tqdm import tqdm # a numpy array to save the mean of the images mean = np.zeros(test_shape[1:], np.float32) # loop over train addresses for i in tqdm(range(len(test_data))): addr = os.path.join(test_data_path,test_data[i]) #print("image addres is :",addr) img = cv2.imread(addr) #print(img) img = cv2.resize(img, (image_size, image_size), interpolation=cv2.INTER_CUBIC) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) #current_image = ia.imresize_single_image(addr, (256, 256)) image_aug = img.transpose(2,0,1) hdf5_file["data"][i, ...] = image_aug[None] mean += image_aug / float(len(test_data))
def undistort_points(config, dataframe, camera_pair, destfolder): cfg_3d = auxiliaryfunctions.read_config(config) img_path, path_corners, path_camera_matrix, path_undistort = auxiliaryfunctions_3d.Foldernames3Dproject( cfg_3d) ''' path_undistort = destfolder filename_cam1 = Path(dataframe[0]).stem filename_cam2 = Path(dataframe[1]).stem #currently no interm. saving of this due to high speed. # check if the undistorted files are already present if os.path.exists(os.path.join(path_undistort,filename_cam1 + '_undistort.h5')) and os.path.exists(os.path.join(path_undistort,filename_cam2 + '_undistort.h5')): print("The undistorted files are already present at %s" % os.path.join(path_undistort,filename_cam1)) dataFrame_cam1_undistort = pd.read_hdf(os.path.join(path_undistort,filename_cam1 + '_undistort.h5')) dataFrame_cam2_undistort = pd.read_hdf(os.path.join(path_undistort,filename_cam2 + '_undistort.h5')) else: ''' if True: # Create an empty dataFrame to store the undistorted 2d coordinates and likelihood dataframe_cam1 = pd.read_hdf(dataframe[0]) dataframe_cam2 = pd.read_hdf(dataframe[1]) scorer_cam1 = dataframe_cam1.columns.get_level_values(0)[0] scorer_cam2 = dataframe_cam2.columns.get_level_values(0)[0] stereo_file = auxiliaryfunctions.read_pickle( os.path.join(path_camera_matrix, 'stereo_params.pickle')) path_stereo_file = os.path.join(path_camera_matrix, 'stereo_params.pickle') stereo_file = auxiliaryfunctions.read_pickle(path_stereo_file) mtx_l = stereo_file[camera_pair]['cameraMatrix1'] dist_l = stereo_file[camera_pair]['distCoeffs1'] mtx_r = stereo_file[camera_pair]['cameraMatrix2'] dist_r = stereo_file[camera_pair]['distCoeffs2'] R1 = stereo_file[camera_pair]['R1'] P1 = stereo_file[camera_pair]['P1'] R2 = stereo_file[camera_pair]['R2'] P2 = stereo_file[camera_pair]['P2'] # Create an empty dataFrame to store the undistorted 2d coordinates and likelihood dataFrame_cam1_undistort, scorer_cam1, bodyparts = auxiliaryfunctions_3d.create_empty_df( dataframe_cam1, scorer_cam1, flag='2d') dataFrame_cam2_undistort, scorer_cam2, bodyparts = auxiliaryfunctions_3d.create_empty_df( dataframe_cam2, scorer_cam2, flag='2d') for bpindex, bp in tqdm(enumerate(bodyparts)): # Undistorting the points from cam1 camera points_cam1 = np.array([ dataframe_cam1[scorer_cam1][bp]['x'].values[:], dataframe_cam1[scorer_cam1][bp]['y'].values[:] ]) points_cam1 = points_cam1.T points_cam1 = np.expand_dims(points_cam1, axis=1) points_cam1_remapped = cv2.undistortPoints(src=points_cam1, cameraMatrix=mtx_l, distCoeffs=dist_l, P=P1, R=R1) dataFrame_cam1_undistort.iloc[:][scorer_cam1, bp, 'x'] = points_cam1_remapped[:, 0, 0] dataFrame_cam1_undistort.iloc[:][scorer_cam1, bp, 'y'] = points_cam1_remapped[:, 0, 1] dataFrame_cam1_undistort.iloc[:][ scorer_cam1, bp, 'likelihood'] = dataframe_cam1[scorer_cam1][ bp]['likelihood'].values[:] # Undistorting the points from cam2 camera points_cam2 = np.array([ dataframe_cam2[scorer_cam2][bp]['x'].values[:], dataframe_cam2[scorer_cam2][bp]['y'].values[:] ]) points_cam2 = points_cam2.T points_cam2 = np.expand_dims(points_cam2, axis=1) points_cam2_remapped = cv2.undistortPoints(src=points_cam2, cameraMatrix=mtx_r, distCoeffs=dist_r, P=P2, R=R2) dataFrame_cam2_undistort.iloc[:][scorer_cam2, bp, 'x'] = points_cam2_remapped[:, 0, 0] dataFrame_cam2_undistort.iloc[:][scorer_cam2, bp, 'y'] = points_cam2_remapped[:, 0, 1] dataFrame_cam2_undistort.iloc[:][ scorer_cam2, bp, 'likelihood'] = dataframe_cam2[scorer_cam2][ bp]['likelihood'].values[:] # Save the undistorted files 
dataFrame_cam1_undistort.sort_index(inplace=True) dataFrame_cam2_undistort.sort_index(inplace=True) return (dataFrame_cam1_undistort, dataFrame_cam2_undistort, stereo_file[camera_pair], path_stereo_file)
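# Minimal sketch of the cv2.undistortPoints call used above, on synthetic data. The
# camera matrix and (zero) distortion coefficients are illustrative only; with no
# distortion and P set to the camera matrix, the points map back to themselves.
import numpy as np
import cv2

pts = np.array([[320.0, 240.0], [100.0, 50.0]]).reshape(-1, 1, 2)  # (N, 1, 2) layout
K = np.array([[800.0, 0.0, 320.0],
              [0.0, 800.0, 240.0],
              [0.0, 0.0, 1.0]])
dist = np.zeros(5)
undistorted = cv2.undistortPoints(src=pts, cameraMatrix=K, distCoeffs=dist, P=K)
print(undistorted[:, 0, :])  # equals the input pixel coordinates here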
def distance_matrix(s, max_dist=None, max_length_diff=None, window=None, max_step=None, penalty=None, psi=None, block=None, parallel=False, use_c=False, use_nogil=False, show_progress=False): """Distance matrix for all sequences in s. :param s: Iterable of series :param window: see :meth:`distance` :param max_dist: see :meth:`distance` :param max_step: see :meth:`distance` :param max_length_diff: see :meth:`distance` :param penalty: see :meth:`distance` :param psi: see :meth:`distance` :param block: Only compute block in matrix. Expects tuple with begin and end, e.g. ((0,10),(20,25)) will only compare rows 0:10 with rows 20:25. :param parallel: Use parallel operations :param use_c: Use c compiled Python functions (it is recommended to use use_nogil) :param use_nogil: Use pure c functions :param show_progress: Show progress using the tqdm library """ if parallel and (not use_c or not use_nogil): try: import multiprocessing as mp logger.info('Using multiprocessing') except ImportError: parallel = False mp = None else: mp = None dist_opts = { 'max_dist': max_dist, 'max_step': max_step, 'window': window, 'max_length_diff': max_length_diff, 'penalty': penalty, 'psi': psi } s = SeriesContainer.wrap(s) dists = None if max_length_diff is None: max_length_diff = np.inf large_value = np.inf logger.info('Computing distances') if use_c: for k, v in dist_opts.items(): if v is None: dist_opts[k] = 0.0 if use_c and use_nogil: logger.info("Compute distances in pure C") dist_opts['block'] = block if parallel: logger.info("Use parallel computation") dists = dtw_c.distance_matrix_nogil_p(s, **dist_opts) else: logger.info("Use serial computation") dists = dtw_c.distance_matrix_nogil(s, **dist_opts) if use_c and not use_nogil: logger.info("Compute distances in Python compiled C") if parallel: logger.info("Use parallel computation") dists = np.zeros((len(s), len(s))) + large_value if block is None: idxs = np.triu_indices(len(s), k=1) else: idxsl_r = [] idxsl_c = [] for r in range(block[0][0], block[0][1]): for c in range(max(r + 1, block[1][0]), min(len(s), block[1][1])): idxsl_r.append(r) idxsl_c.append(c) idxs = (np.array(idxsl_r), np.array(idxsl_c)) with mp.Pool() as p: dists[idxs] = p.map(_distance_c_with_params, [(s[r], s[c], dist_opts) for c, r in zip(*idxs)]) # pbar = tqdm(total=int((len(s)*(len(s)-1)/2))) # for r in range(len(s)): # dists[r,r+1:len(s)] = p.map(distance, [(s[r],s[c], dist_opts) for c in range(r+1,len(cur))]) # pbar.update(len(s) - r - 1) # pbar.close() else: logger.info("Use serial computation") dist_opts['block'] = block dists = dtw_c.distance_matrix(s, **dist_opts) if not use_c: logger.info("Compute distances in Python") if parallel: logger.info("Use parallel computation") dists = np.zeros((len(s), len(s))) + large_value if block is None: idxs = np.triu_indices(len(s), k=1) else: idxsl_r = [] idxsl_c = [] for r in range(block[0][0], block[0][1]): for c in range(max(r + 1, block[1][0]), min(len(s), block[1][1])): idxsl_r.append(r) idxsl_c.append(c) idxs = (np.array(idxsl_r), np.array(idxsl_c)) with mp.Pool() as p: dists[idxs] = p.map(_distance_with_params, [(s[r], s[c], dist_opts) for c, r in zip(*idxs)]) # pbar = tqdm(total=int((len(s)*(len(s)-1)/2))) # for r in range(len(s)): # dists[r,r+1:len(s)] = p.map(distance, [(s[r],s[c], dist_opts) for c in range(r+1,len(cur))]) # pbar.update(len(s) - r - 1) # pbar.close() else: logger.info("Use serial computation") dists = np.zeros((len(s), len(s))) + large_value if block is None: it_r = range(len(s)) else: it_r = range(block[0][0], 
block[0][1]) if show_progress: it_r = tqdm(it_r) for r in it_r: if block is None: it_c = range(r + 1, len(s)) else: it_c = range(max(r + 1, block[1][0]), min(len(s), block[1][1])) for c in it_c: if abs(len(s[r]) - len(s[c])) <= max_length_diff: dists[r, c] = distance(s[r], s[c], **dist_opts) return dists
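# Hedged usage sketch for the `block` argument documented above: only pairs (r, c) with
# r in rows 0..9 and c in rows 20..24 (and c > r) are compared; all other entries keep
# the infinite initial fill. The series below are synthetic and illustrative only.
import numpy as np

series = [np.sin(np.linspace(0, 2 * np.pi, 50 + i)) for i in range(30)]
dists = distance_matrix(series, block=((0, 10), (20, 25)), show_progress=True)
print(np.isfinite(dists).sum())  # 10 rows * 5 columns = 50 computed entries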
def main(region, rawregion, data, parameter, npts):
    # region
    minlon, maxlon, minlat, maxlat, mindep, maxdep = [
        float(item) for item in region.split("/")
    ]
    rawminlon, rawmaxlon, rawminlat, rawmaxlat, rawmindep, rawmaxdep = [
        float(item) for item in rawregion.split("/")
    ]
    if (rawminlon > rawmaxlon):
        # interpolation: the points in dimension 0 must be strictly ascending
        rawminlon, rawmaxlon = rawmaxlon, rawminlon
        rawminlat, rawmaxlat = rawmaxlat, rawminlat
    # data
    data = np.load(data)
    # latnpts and lonnpts should be the same when plotting vertically
    lonnpts, latnpts, depnpts = [int(item) for item in npts.split("/")]
    plot_vertically = True
    if (mindep == maxdep):
        plot_vertically = False
    hnpts, vnpts = None, None
    if (plot_vertically):
        if (lonnpts != latnpts):
            raise Exception(
                "latnpts and lonnpts should be the same when plotting vertically"
            )
        hnpts = latnpts
        vnpts = depnpts
        print("plot vertically")
    else:
        hnpts = lonnpts
        vnpts = latnpts
        print("plot horizontally")
    print("preparing mesh:")
    lon_list, lat_list, dep_list = prepare_mesh(data, rawminlon, rawmaxlon,
                                                rawminlat, rawmaxlat,
                                                rawmindep, rawmaxdep)
    # get mesh to plot
    print("interp values:")
    lat_mesh, lon_mesh, dep_mesh = None, None, None
    plot_values = np.zeros((hnpts, vnpts))
    array_to_interpolate = np.zeros((hnpts, vnpts, 3))
    if (plot_vertically):
        lat_mesh = np.linspace(minlat, maxlat, hnpts)
        lon_mesh = np.linspace(minlon, maxlon, hnpts)
        dep_mesh = np.linspace(mindep, maxdep, vnpts)
        for ih in tqdm.tqdm(range(hnpts)):
            for iv in range(vnpts):
                # plot_values[ih, iv] = interp_value(
                #     lat_mesh[ih], lon_mesh[ih], dep_mesh[iv], x_mesh, y_mesh, z_mesh, data)
                array_to_interpolate[ih, iv, :] = [
                    lon_mesh[ih], lat_mesh[ih], dep_mesh[iv]
                ]
    else:
        lat_mesh = np.linspace(minlat, maxlat, vnpts)
        lon_mesh = np.linspace(minlon, maxlon, hnpts)
        for ih in tqdm.tqdm(range(hnpts)):
            for iv in range(vnpts):
                # plot_values[ih, iv] = interp_value(
                #     lat_mesh[iv], lon_mesh[ih], mindep, x_mesh, y_mesh, z_mesh, data)
                array_to_interpolate[ih, iv, :] = [
                    lon_mesh[ih], lat_mesh[iv], mindep
                ]
    # build up the interpolation function
    interpolating_function = RegularGridInterpolator(
        (lon_list, lat_list, dep_list), data, method="nearest")
    plot_values = interpolating_function(array_to_interpolate)

    # * plot figures
    print("start to plot")
    plt.figure()
    # get vmin and vmax
    vmin_round = round(np.min(plot_values), 2)
    if (vmin_round < np.min(plot_values)):
        vmin = vmin_round
    else:
        vmin = vmin_round - 0.01
    vmax_round = round(np.max(plot_values), 2)
    if (vmax_round > np.max(plot_values)):
        vmax = vmax_round
    else:
        vmax = vmax_round + 0.01
    # ! set vmin and vmax here
    # vmin = -0.03
    # vmax = 0.03
    v = np.arange(vmin, vmax, 0.01)
    if (plot_vertically):
        # decide whether to use lat or lon on the horizontal axis
        lat_diff = np.abs(maxlat - minlat)
        lon_diff = np.abs(maxlon - minlon)
        plot_on = None
        if (lat_diff >= lon_diff):
            mesh_plot_h, mesh_plot_v = np.meshgrid(lat_mesh, dep_mesh, indexing="ij")
            plot_on = "latitude"
        else:
            mesh_plot_h, mesh_plot_v = np.meshgrid(lon_mesh, dep_mesh, indexing="ij")
            plot_on = "longitude"
        plt.contourf(mesh_plot_h,
                     mesh_plot_v,
                     plot_values,
                     resolution,
                     cmap=plt.cm.seismic_r,
                     vmin=vmin,
                     vmax=vmax)
        plt.colorbar(ticks=v, label="perturbation")
        plt.gca().invert_yaxis()
        plt.xlabel(
            f"{plot_on}(°) between (lon: {minlon}°, lat: {minlat}°) and (lon: {maxlon}°, lat: {maxlat}°)"
        )
        plt.ylabel("depth(km)")
        plt.title(f"parameter: {parameter}")
        plt.show()
    else:
        mesh_plot_h, mesh_plot_v = np.meshgrid(lon_mesh, lat_mesh, indexing="ij")
        plt.contourf(mesh_plot_h,
                     mesh_plot_v,
                     plot_values,
                     resolution,
                     cmap=plt.cm.seismic_r,
                     vmin=vmin,
                     vmax=vmax)
        plt.colorbar(ticks=v, label="perturbation")
        plt.gca().invert_yaxis()
        plt.xlabel(f"longitude(°) between {minlon}° and {maxlon}°")
        plt.ylabel(f"latitude(°) between {minlat}° and {maxlat}°")
        plt.title(f"depth: {mindep}km, parameter: {parameter}")
        plt.show()
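# Minimal, self-contained sketch of the interpolation step above: RegularGridInterpolator
# is built on the raw (lon, lat, dep) grid and evaluated at the plotting mesh. Grid sizes
# and values here are illustrative stand-ins for the loaded model data.
import numpy as np
from scipy.interpolate import RegularGridInterpolator

lon_list = np.linspace(100.0, 110.0, 11)
lat_list = np.linspace(20.0, 30.0, 11)
dep_list = np.linspace(0.0, 100.0, 21)
values = np.random.rand(lon_list.size, lat_list.size, dep_list.size)

interpolating_function = RegularGridInterpolator((lon_list, lat_list, dep_list),
                                                 values, method="nearest")
query = np.array([[105.0, 25.0, 50.0],
                  [101.3, 22.7, 10.0]])  # (npoints x 3) lon/lat/dep triples
print(interpolating_function(query))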
def create_terms(self, obo_path, shortname, attrs=["def", "synonym", "subset", "alt_id", "dbxref"]):
    self.cache = {}
    ids = []
    with open(obo_path) as h:  # filter the ids because GODag iterates over alt_ids too
        for l in h.readlines():
            if l.startswith("id: " + shortname.upper() + ":"):
                ids.append(l.split(" ")[1].strip())
    ids = set(ids)
    # For SO the optional attributes are overridden
    attrs = ["def", "subset", "dbxref", "alt_id"]
    go_dag = GODag(obo_path, load_obsolete=True, optional_attrs=attrs)
    finished = False
    pbar = iter(tqdm(ids))
    while not finished:
        with transaction.atomic():
            for _ in range(2000):
                try:
                    go = next(pbar)
                    if go not in go_dag:
                        continue
                    term = go_dag[go]
                    if not Term.objects.filter(ontology=self.ontology, identifier=go).exists():
                        dbTerm = Term(
                            name=term.name,
                            definition=term.defn if hasattr(term, "defn") else "",
                            identifier=go,
                            is_obsolete="T" if term.is_obsolete else "F",
                            ontology=self.ontology)
                        dbTerm.save()
                        if term.namespace:
                            termdbref = TermDbxref(
                                term=dbTerm,
                                dbxref=Ontology.dbmap[term.namespace],
                                rank=1)
                            termdbref.save()
                        for subset in term.subset:
                            if subset in Ontology.dbmap:
                                termdbref = TermDbxref(
                                    term=dbTerm,
                                    dbxref=Ontology.dbmap[subset],
                                    rank=1)
                                termdbref.save()
                        if hasattr(term, "synonym"):
                            for synonym in term.synonym:
                                TermSynonym.objects.get_or_create(
                                    term=dbTerm, synonym=synonym[0][:255])
                        for alt_id in term.alt_ids:
                            TermSynonym.objects.get_or_create(
                                term=dbTerm, synonym=alt_id[:255])
                        self.cache[go] = dbTerm
                    else:
                        self.cache[go] = Term.objects.filter(
                            ontology=self.ontology, identifier=go).get()
                        print("repeated: " + go)
                except StopIteration:
                    finished = True