def main(): """ Iterates through the list of cyano datasets and colocalizes them with the specified environmentl variables. Colocalized datasets are stored in the "COLOCALIZED_DIR" as csv files. """ def saveColocalizedCSV(df): df.to_csv(f"{COLOCALIZED_DIR}{os.path.basename(cyanoFile)}", index=False) cyanoFiles = cyano_csv_files(DATA_DIR) api = pycmap.API(token=API_KEY) makedir(COLOCALIZED_DIR) envs = environmental_datasets() envs = add_env_temporal_coverage(api, envs) for cyanoFile in cyanoFiles: df = pd.read_csv(cyanoFile) df = add_env_columns(df, envs) dfs = [df.loc[i].to_frame().T for i in range(len(df))] colocalizedDF = pd.DataFrame({}) with concurrent.futures.ThreadPoolExecutor() as executor: futureObjs = executor.map(match, dfs, [api] * len(dfs), [envs] * len(dfs), [cyanoFile] * len(dfs), [len(dfs)] * len(dfs)) for fo in futureObjs: if len(colocalizedDF) < 1: colocalizedDF = fo else: colocalizedDF = pd.concat([colocalizedDF, fo], ignore_index=True) saveColocalizedCSV(colocalizedDF)
def generate_summary(self, title):
    makedir(RESULTS_FOLDER)
    self._compare_action_against_performance(title)
    if HUMAN_DATA_COMPARE:
        self._compare_score_against_human_data(title)
        self._compare_action_against_human_data(title, 1)
    self._plot_learning_curve(title)
def main(): """ Iterates through the list of colocalized cyano datasets and compile them into a single csv file. The compiled file is stored in the "COMPILED_DIR" as a csv file. """ print(""" ########################################################## # # # # # Compiling Colocalized Cyano Datasets # # # # # ########################################################## """) cyanoFiles = glob.glob(f"{COLOCALIZED_DIR}*.csv") makedir(COMPILED_DIR) dfCompiled = pd.DataFrame({}) for cyanoFile in cyanoFiles: print(f"Compiling {cyanoFile}") data = unify(cyanoFile) if len(dfCompiled) < 1: dfCompiled = data else: dfCompiled = pd.concat([dfCompiled, data], ignore_index=True) dfCompiled.to_csv(f"{COMPILED_DIR}compiled.csv", index=False)
def new_simulation(self):
    makedir(RESULTS_FOLDER)  # if the folder already exists, the function will catch the exception
    self.current_df = pd.DataFrame(columns=COLUMNS)
    self.current_detail_df = pd.DataFrame(columns=DETAIL_COLUMNS)
    datetime_str = '{:%d-%H-%M-%S}'.format(datetime.datetime.now())
    self.file_name = (lambda x: RESULTS_FOLDER + x + ' ' + datetime_str)
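# Most snippets in this section rely on a small makedir / common.makedir helper that
# creates a directory and silently tolerates the "already exists" case, as the comment
# in new_simulation() notes. The helper itself is not shown here; a minimal sketch of
# what such a function might look like, assuming it only needs to wrap os.makedirs, is:
import os

def makedir(directory):
    """Create `directory` (including parents) if it does not already exist."""
    try:
        os.makedirs(directory)
    except OSError:
        # Directory already exists (or cannot be created); ignore, matching the
        # "catch the exception" behaviour referenced above.
        pass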
def downloadBiduPicture(keyword, startpage, endpage):
    # Create the folder for this keyword
    common.makedir(str(keyword))
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.61 Safari/537.36'
    }
    startpage, endpage = int(startpage), int(endpage)
    # Loop over the requested result pages
    while startpage < endpage:
        url = ('https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ie=utf-8&oe=utf-8&word='
               + str(keyword) + '&pn=' + str(startpage) + '&rn=60')
        text = requests.get(url=url, headers=headers).text
        # print(text)
        pictureList = getBaiduUrlList(text)
        for count in range(0, len(pictureList)):
            try:
                name = str(keyword) + '_' + str(startpage) + '_' + str(count)
                type = '.jpg'
                # Download in a separate thread
                mythread = threading.Thread(target=common.downloadPicture,
                                            args=(pictureList[count], name, type))
                mythread.start()
                # Cap the number of active threads at 64
                if threading.activeCount() >= 64:
                    mythread.join()
                print('thread name:' + str(mythread.name))
                print(str(threading.activeCount()) + ' active threads')
            except Exception:
                print('Picture ' + str(count) + ' timed out, moving on to the next one')
        # Advance to the next page
        startpage += 1
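# The Baidu and Google downloaders in this section delegate the actual fetch to
# common.downloadPicture(url, name, type), which is not shown here. A minimal sketch of
# what such a helper might look like (an assumed implementation, not the original code),
# using requests with a timeout to match the "timed out" handling above:
import requests

def downloadPicture(url, name, type):
    """Fetch `url` and write it to disk as `name` + `type` (e.g. 'cat_0_3.jpg')."""
    response = requests.get(url, timeout=10)  # assumed timeout value
    response.raise_for_status()
    with open(name + type, 'wb') as f:
        f.write(response.content)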
def run_fuse(self):
    """
    Run fusion.
    """
    assert os.path.exists(self.options.depth_dir)
    common.makedir(self.options.out_dir)

    files = self.read_directory(self.options.depth_dir)
    timer = common.WallTimer()
    Rs = self.get_views()

    for filepath in files:
        # As rendering might be slower, we wait for rendering to finish.
        # This allows to run rendering and fusing in parallel (more or less).
        depths = common.read_hdf5(filepath)

        timer.reset()
        tsdf = self.fusion(depths, Rs)
        tsdf = tsdf[0]

        vertices, triangles = libmcubes.marching_cubes(-tsdf, 0)
        vertices /= self.options.resolution
        vertices -= 0.5

        off_file = os.path.join(self.options.out_dir, ntpath.basename(filepath)[:-3])
        exporter.export_off(vertices, triangles, off_file)
        print('[Data] wrote %s (%f seconds)' % (off_file, timer.elapsed()))
def cross_mode_summary(self, mode_lst=[mode for mode, _ in MODE_MAP.items()],
                       subject_lst=None, subject_info=None):
    makedir(RESULTS_FOLDER)
    if subject_lst is not None:
        MODE_MAP[MODE_IDENTIFIER] = [None, None, 'black']
    self._plot_p_mb(mode_lst, subject_lst, subject_info)
def get_in_files(self):
    if self.options.in_dir is not None:
        assert os.path.exists(self.options.in_dir)
        common.makedir(self.options.out_dir)
        files = self.read_directory(self.options.in_dir)
    else:
        files = [self.options.in_file]
    return files
def main(args):
    corpus_name = os.path.basename(args.corpus)
    if args.model:
        model_dir = args.model
    else:
        model_dir = os.path.join('.', corpus_name + '.model')
    makedir(model_dir)

    langs_path = os.path.join(model_dir, 'lang_index')
    domains_path = os.path.join(model_dir, 'domain_index')
    index_path = os.path.join(model_dir, 'paths')

    # display paths
    logging.info("corpus path: {0}".format(args.corpus))
    logging.info("model path: {0}".format(model_dir))
    logging.info("writing langs to: {0}".format(langs_path))
    logging.info("writing domains to: {0}".format(domains_path))
    logging.info("writing index to: {0}".format(index_path))

    indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain,
                            proportion=args.proportion,
                            langs=args.lang, domains=args.domain)

    # Compute mappings between files, languages and domains
    lang_dist = indexer.dist_lang
    lang_index = indexer.lang_index
    lang_info = ' '.join("{0}({1})".format(k, lang_dist[v]) for k, v in lang_index.items())
    logging.info("langs({0}): {1}".format(len(lang_dist), lang_info))

    domain_dist = indexer.dist_domain
    domain_index = indexer.domain_index
    domain_info = ' '.join("{0}({1})".format(k, domain_dist[v]) for k, v in domain_index.items())
    logging.info("domains({0}): {1}".format(len(domain_dist), domain_info))

    logging.info("identified {0} files".format(len(indexer.items)))

    # output the language index
    with open(langs_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerows((l, lang_dist[lang_index[l]])
                         for l in sorted(lang_index.keys(), key=lang_index.get))

    # output the domain index
    with open(domains_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerows((d, domain_dist[domain_index[d]])
                         for d in sorted(domain_index.keys(), key=domain_index.get))

    # output items found
    with open(index_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerows((d, l, p) for (d, l, n, p) in indexer.items)
def run(self): """ Run the tool, i.e. scale all found OFF files. """ assert os.path.exists(self.options.in_dir) common.makedir(self.options.out_dir) common.makedir('1_s_t') files = self.read_directory(self.options.in_dir) for filepath in files: mesh = common.Mesh.from_off(filepath) # Get extents of model. min, max = mesh.extents() total_min = np.min(np.array(min)) total_max = np.max(np.array(max)) # Set the center (although this should usually be the origin already). centers = ((min[0] + max[0]) / 2, (min[1] + max[1]) / 2, (min[2] + max[2]) / 2) # Scales all dimensions equally. sizes = (total_max - total_min, total_max - total_min, total_max - total_min) translation = (-centers[0], -centers[1], -centers[2]) scales = (1 / (sizes[0] + 2 * self.options.padding * sizes[0]), 1 / (sizes[1] + 2 * self.options.padding * sizes[1]), 1 / (sizes[2] + 2 * self.options.padding * sizes[2])) mesh.translate(translation) mesh.scale(scales) print scales, translation # print('[Data] %s extents before %f - %f, %f - %f, %f - %f' % (os.path.basename(filepath), min[0], max[0], min[1], max[1], min[2], max[2])) # min, max = mesh.extents() # print('[Data] %s extents after %f - %f, %f - %f, %f - %f' % (os.path.basename(filepath), min[0], max[0], min[1], max[1], min[2], max[2])) # May also switch axes if necessary. mesh.switch_axes(0, 2) mesh.to_off( os.path.join(self.options.out_dir, os.path.basename(filepath))) scipy.io.savemat( os.path.join('1_s_t', os.path.basename(filepath)).replace( '.off', '.mat'), { 'translation': translation, 'scales': scales, 'sizes': sizes })
def run(self): """ Run simplification. """ common.makedir(self.options.out_dir) files = self.get_in_files() for filepath in files: os.system( 'LC_NUMERIC=C meshlabserver -i %s -o %s -s %s' % (filepath, os.path.join(self.options.out_dir, ntpath.basename(filepath)), self.simplification_script))
def main(): """ Iterates through the list of datasets containing measurements of cyanobacteria. The measurements are retrieved and stored in individual csv files on local disk. """ api = pycmap.API(token=API_KEY) makedir(DATA_DIR) cyanos = cyano_datasets() for dataset in cyanos: print("\n********************************") print("Downloading ", dataset, " ...") print("********************************\n") data = retrieve(api, dataset, DEPTH1, DEPTH2) data.to_csv(f"{DATA_DIR}{dataset[0]}.csv", index=False)
def run(self): """ Run simplification. """ assert os.path.exists(self.options.in_dir) common.makedir(self.options.out_dir) files = self.read_directory(self.options.in_dir) for filepath in files: os.system( 'meshlabserver -i %s -o %s -s %s' % (filepath, os.path.join(self.options.out_dir, ntpath.basename(filepath)), self.simplification_script))
def get_in_files(self):
    if self.options.in_dir is not None:
        assert os.path.exists(self.options.in_dir)
        common.makedir(self.options.out_dir)
        files = self.read_directory(self.options.in_dir)
    else:
        files = [self.options.in_file]

    if not self.options.overwrite:
        def file_filter(filepath):
            outpath = self.get_outpath(filepath)
            return not os.path.exists(outpath)
        files = list(filter(file_filter, files))

    return files
def _compare_action_against_human_data(self, title, num_comp=PCA_COMPONENTS):
    if not ACTION_COMPARE:
        return
    SAMPLE_ACTION_SEQUENCES = 50
    NUMBER_OF_SAMPLE_SUBJECTS = 10 if not HEAD_AND_TAIL_SUBJECTS else 9
    NUMBER_OF_TAIL_SUBJECTS = None if not HEAD_AND_TAIL_SUBJECTS else 9
    makedir(RESULTS_FOLDER + 'Action_Summary/')
    file_name = lambda x: self.file_name('Action_Summary/' + x)
    sample_df = self.human_data_df.copy()
    # self._aggregated_analysis(sample_df, lambda episode: self._get_entropy_series(episode), file_name, title, num_comp)
    sample_df = pd.DataFrame(columns=[
        'trial_' + str(trial_num) for trial_num in range(self.trial_separation)
    ])
    feature_seq = []
    sample_detail_data = self.current_detail  # random.sample(self.current_detail, NUMBER_OF_SAMPLE_SUBJECTS)
    for subject_index, detail_df in enumerate(sample_detail_data):
        # extract the action sequences of the last SAMPLE_ACTION_SEQUENCES episodes
        for index, episode in enumerate(
                range(len(self.current_data[0]))[-SAMPLE_ACTION_SEQUENCES:]):
            action_sequence = list(
                map(int, (detail_df['action'])[episode * self.trial_separation:
                                               (episode + 1) * self.trial_separation].tolist()))
            sample_df.loc[SAMPLE_ACTION_SEQUENCES * subject_index + index] = action_sequence
            if HEAD_AND_TAIL_SUBJECTS:
                feature_seq.append(self.human_data_df['Performance'].loc[subject_index])
            else:
                feature_seq.append(subject_index)
    feature_series_func = lambda dummy_var: feature_seq
    self._aggregated_analysis(
        sample_df,
        feature_series_func,
        file_name,
        title,
        num_comp,
        head_subjects=NUMBER_OF_SAMPLE_SUBJECTS,
        tail_subjects=NUMBER_OF_TAIL_SUBJECTS,
        num_sequences=SAMPLE_ACTION_SEQUENCES,
        feature_label='Subject ID' if not HEAD_AND_TAIL_SUBJECTS
                      else 'Negative Log Likelihood Performance',
        in_all_episodes=False,
        in_selected_episodes=False,
        simple_analysis=True)
def run(self): """ Run rotation. """ assert os.path.exists(self.options.in_dir) common.makedir(self.options.out_dir) files = self.read_directory(self.options.in_dir) for filepath in files: #added LC_NUMERIC=C os.system('LC_NUMERIC=C meshlabserver -i %s -o %s -s %s' % ( filepath, os.path.join(self.options.out_dir, ntpath.basename(filepath)), self.rotation_script ))
def run(self): """ Run the tool, i.e. scale all found OFF files. """ common.makedir(self.options.out_dir) if self.options.t_dir is not None: common.makedir(self.options.t_dir) files = self.get_in_files() if self.options.n_proc == 0: for filepath in files: self.run_file(filepath) else: with Pool(self.options.n_proc) as p: p.map(self.run_file, files)
def run(self): """ Run simplification. """ assert os.path.exists(self.options.in_dir) common.makedir(self.options.out_dir) files = self.read_directory(self.options.in_dir) print files for filepath in files[0:1]: print filepath command = '/Applications/meshlab.app/Contents/MacOS/meshlabserver -i %s -o %s -s %s' % ( filepath, os.path.join(self.options.out_dir, ntpath.basename(filepath)), self.simplification_script) print command os.system(command)
def main(args):
    if args.temp:
        buckets_dir = args.temp
    else:
        buckets_dir = os.path.join(args.model, 'buckets')
    makedir(buckets_dir)

    bucketlist_path = os.path.join(args.model, 'bucketlist')
    index_path = os.path.join(args.model, 'paths')

    # display paths
    logger.info("index path: %s", index_path)
    logger.info("bucketlist path: %s", bucketlist_path)
    logger.info("buckets path: %s", buckets_dir)

    with open(index_path) as f:
        reader = csv.reader(f)
        items = list(reader)

    # Tokenize
    logger.info("will tokenize %d files" % len(items))
    if args.scanner:
        from scanner import Scanner
        tokenizer = Scanner.from_file(args.scanner)
        logger.info("using provided scanner: %s", args.scanner)
    elif args.prager:
        tokenizer = PragerTokenizer(args.order, use_words=args.words)
        logger.info("using Prager tokenization: order[{0}] use_words[{1}]".format(args.order, args.words))
    else:
        tokenizer = NGramTokenizer(args.min_order, args.max_order)
        logger.info("using n-gram tokenizer: order {0}-{1}".format(args.min_order, args.max_order))

    b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs,
                         args.chunksize, args.sample_count, args.sample_size)

    # output the paths to the buckets
    with open(bucketlist_path, 'w') as f:
        for d in b_dirs:
            f.write(d + '\n')
def run_render(self):
    """
    Run rendering.
    """
    assert os.path.exists(self.options.in_dir)
    common.makedir(self.options.depth_dir)

    files = self.read_directory(self.options.in_dir)
    timer = common.WallTimer()
    Rs = self.get_views()

    for filepath in files:
        timer.reset()
        mesh = common.Mesh.from_off(filepath)
        depths = self.render(mesh, Rs)

        depth_file = os.path.join(self.options.depth_dir, os.path.basename(filepath) + '.h5')
        common.write_hdf5(depth_file, np.array(depths))
        print('[Data] wrote %s (%f seconds)' % (depth_file, timer.elapsed()))
def _compare_score_against_human_data(self, title):
    if not SOCRE_COMPARE:
        return
    makedir(RESULTS_FOLDER + 'Score_Summary/')
    file_name = lambda x: self.file_name('Score_Summary/' + x)
    summary_df = self.human_data_df.copy()

    # create a target for CCA
    target_df = pd.DataFrame()
    target_df['score'] = [df['score'].mean() for df in self.current_data]
    cca = CCA(n_components=1)
    cca.fit(summary_df, target_df)

    # combine them for PCA
    for column_id in ANALYSIS_EXTRA_COLUMNS:
        summary_df[column_id] = [df[column_id].mean() for df in self.current_data]
    pca = PCA(n_components=PCA_COMPONENTS)
    pca.fit(summary_df)

    with open(file_name('Score Statistics Summary ' + title), 'x') as f:
        self._write_pca_summary(pca, f)
        f.write('\nCCA:\n X weights:\n')
        f.write(' ' + ' '.join(map(str, cca.x_weights_)))
        f.write('\n Y weights\n')
        f.write(' ' + ' '.join(map(str, cca.y_weights_)))

    # generate historical CCA
    cca_trace_df = pd.DataFrame(columns=HUMAN_DATA_COLUMN)
    for index in range(self.current_data[0].shape[0])[3:]:
        target_df = pd.DataFrame()
        target_df['score'] = [df['score'].loc[:index].mean() for df in self.current_data]
        cca.fit(self.human_data_df, target_df)
        cca_trace_df.loc[index] = [abs(x[0]) for x in cca.x_weights_]
    cca_trace_df.plot(figsize=FIG_SIZE, grid=True, title='CCA progression summary ' + title)
    save_plt_figure(file_name('CCA progression summary ' + title))
def downloadGooglePicture(keyword, startpage, endpage):
    # Create the folder for this keyword
    common.makedir(str(keyword))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
    }
    for startpage in range(int(startpage), int(endpage)):
        url = 'https://chartsapi.gdgdocs.org/search?tbm=isch&q=' + str(keyword) + '&ijn=' + str(startpage)
        # print(url)
        text = requests.get(url=url, headers=headers).content
        # Decode the binary response into a str
        text = text.decode()
        # print(text)
        # JSON parsing fails here, so extract all image URLs with a regex instead
        pattern = re.compile(r',"ou":"(.*?)","ow"', re.S)
        pictureList = re.findall(pattern, text)
        # print(pictureList)
        for count in range(0, len(pictureList)):
            # Undo the URL escaping: '=' comes back encoded as \u003d
            pictureList[count] = common.urlCode(pictureList[count])
            # print(pictureList[count])
            name = str(keyword) + '_' + str(startpage) + "_" + str(count)
            type = '.jpg'
            try:
                # Start the download thread
                mythread = threading.Thread(target=common.downloadPicture,
                                            args=(pictureList[count], name, type))
                mythread.start()
                # Cap the number of active threads at 64
                if threading.activeCount() >= 64:
                    mythread.join()
                # Print the thread name and the number of active threads
                print('thread name:' + str(mythread.name))
                print(str(threading.activeCount()) + ' active threads')
            except Exception:
                print('Failed to create thread')
def run(self): """ Run the tool. """ common.makedir(self.options.out_dir) files = self.get_in_files() if self.options.mode == 'render': method = self.run_render elif self.options.mode == 'fuse': method = self.run_fuse elif self.options.mode == 'sample': method = self.run_sample else: print('Invalid model, choose render or fuse.') exit() if self.options.n_proc == 0: for filepath in files: method(filepath) else: with Pool(self.options.n_proc) as p: p.map(method, files)
def main(args):
    if args.temp:
        buckets_dir = args.temp
    else:
        buckets_dir = os.path.join(args.model, 'buckets')
    makedir(buckets_dir)

    bucketlist_path = os.path.join(args.model, 'bucketlist')
    index_path = os.path.join(args.model, 'paths')

    # display paths
    logger.info("index path: %s", index_path)
    logger.info("bucketlist path: %s", bucketlist_path)
    logger.info("buckets path: %s", buckets_dir)

    with open(index_path) as f:
        reader = csv.reader(f)
        items = list(reader)

    # Tokenize
    logger.info("will tokenize %d files" % len(items))
    if args.scanner:
        from scanner import Scanner
        tokenizer = Scanner.from_file(args.scanner)
        logger.info("using provided scanner: %s", args.scanner)
    elif args.prager:
        tokenizer = PragerTokenizer(args.order, use_words=args.words)
        logger.info("using Prager tokenization: order[{0}] use_words[{1}]".format(args.order, args.words))
    else:
        tokenizer = NGramTokenizer(args.min_order, args.max_order)
        logger.info("using n-gram tokenizer: order {0}-{1}".format(args.min_order, args.max_order))

    b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs,
                         args.chunksize, args.sample_count, args.sample_size)

    # output the paths to the buckets
    with open(bucketlist_path, 'w') as f:
        for d in b_dirs:
            f.write(d + '\n')
def translate(source_sentences, phrase_table_fnames, weights, translation_type,
              permutations_per_sentence, derivations_count, output_dir):
    sentence_fsts_dir = os.path.join(output_dir, 'sentence_fsts')
    common.makedir(sentence_fsts_dir)
    if 'monotone' == translation_type:
        encode_sentences_to_fsts_monotone(source_sentences, sentence_fsts_dir)
    else:
        encode_sentences_to_fsts_lattice(source_sentences, permutations_per_sentence,
                                         weights, sentence_fsts_dir)

    phrase_table_fsts_dir = os.path.join(output_dir, 'phrase_table_fsts')
    common.makedir(phrase_table_fsts_dir)
    encode_phrase_tables_to_fsts(source_sentences, phrase_table_fnames, weights,
                                 phrase_table_fsts_dir)

    translation_fsts_dir = os.path.join(output_dir, 'translation_fsts')
    common.makedir(translation_fsts_dir)
    make_translation_fsts(sentence_fsts_dir, phrase_table_fsts_dir, translation_type,
                          derivations_count, translation_fsts_dir)

    translations_dir = os.path.join(output_dir, 'translations')
    common.makedir(translations_dir)
    get_best_translations(translation_fsts_dir, translation_type, translations_dir)
help="use DOMAIN - can be specified multiple times (uses all domains found if not specified)") parser.add_argument("-l","--lang", metavar="LANG", action='append', help="use LANG - can be specified multiple times (uses all langs found if not specified)") parser.add_argument("--min_domain", type=int, default=MIN_DOMAIN, help="minimum number of domains a language must be present in" ) parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR") args = parser.parse_args() corpus_name = os.path.basename(args.corpus) if args.model: model_dir = args.model else: model_dir = os.path.join('.', corpus_name+'.model') makedir(model_dir) langs_path = os.path.join(model_dir, 'lang_index') domains_path = os.path.join(model_dir, 'domain_index') index_path = os.path.join(model_dir, 'paths') # display paths if not SILENT: print "corpus path:", args.corpus print "model path:", model_dir print "writing langs to:", langs_path print "writing domains to:", domains_path print "writing index to:", index_path indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion, langs = args.lang, domains = args.domain)
group.add_argument("--sample_count", type=int,
                   help="number of samples for sampling-based tokenization", default=None)

args = parser.parse_args()

if args.sample_count and args.line:
    parser.error("sampling in line mode is not implemented")

if args.temp:
    tmp_dir = args.temp
else:
    tmp_dir = os.path.join(args.model, 'buckets')
makedir(tmp_dir)

# We generate a new directory at each invocation, otherwise we run the
# risk of conflicting with a previous run without warning.
buckets_dir = tempfile.mkdtemp(suffix='tokenize', dir=tmp_dir)

bucketlist_path = args.output if args.output else os.path.join(args.model, 'bucketlist')
index_path = os.path.join(args.model, 'paths')

# display paths
print("index path:", index_path)
print("bucketlist path:", bucketlist_path)
print("buckets path:", buckets_dir)

if args.line:
    tr_type = sys.argv[6]
    perms_fname = None
    if 'monotone' == tr_type:
        out_dir = sys.argv[7]
    elif len(sys.argv) < 9 or 'lattice' != tr_type:
        print(error)
        sys.exit()
    else:
        perms_fname = sys.argv[7]
        out_dir = sys.argv[8]
    return sys.argv[1], sys.argv[2], int(sys.argv[3]),\
        sys.argv[4], int(sys.argv[5]), tr_type, perms_fname, out_dir


if __name__ == '__main__':
    if len(sys.argv) < 8:
        print(error)
        sys.exit()
    source_sentences_fname, phrase_tables_dir, sentences_count, weights_fname, derivations_count,\
        translation_type, permutations_fname, output_dir = parse_arguments()
    source_sentences, phrase_table_fnames, weights, permutations_per_sentence =\
        ir.read_input(source_sentences_fname, phrase_tables_dir, sentences_count,
                      weights_fname, permutations_fname)
    # Create output folder
    common.makedir(output_dir)
    tp.translate(source_sentences, phrase_table_fnames, weights, translation_type,
                 permutations_per_sentence, derivations_count, output_dir)
parser.add_argument("-s", "--scanner", metavar='SCANNER', help="use SCANNER for tokenizing") parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS) parser.add_argument("--max_order", type=int, help="highest n-gram order to use") parser.add_argument("--word", action='store_true', default=False, help="use 'word' tokenization (currently str.split)") parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE) parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets") parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR") args = parser.parse_args() if args.temp: buckets_dir = args.temp else: buckets_dir = os.path.join(args.model, 'buckets') makedir(buckets_dir) bucketlist_path = os.path.join(args.model, 'bucketlist') index_path = os.path.join(args.model, 'paths') # display paths print "index path:", index_path print "bucketlist path:", bucketlist_path print "buckets path:", buckets_dir with open(index_path) as f: reader = csv.reader(f) items = list(reader) if sum(map(bool,(args.scanner, args.max_order, args.word))) > 1: parser.error('can only specify one of --word, --scanner and --max_order')
group = parser.add_argument_group('sampling')
group.add_argument("--sample_size", type=int,
                   help="size of sample for sampling-based tokenization", default=140)
group.add_argument("--sample_count", type=int,
                   help="number of samples for sampling-based tokenization", default=None)

args = parser.parse_args()

if args.sample_count and args.line:
    parser.error("sampling in line mode is not implemented")

if args.temp:
    tmp_dir = args.temp
else:
    tmp_dir = os.path.join(args.model, 'buckets')
makedir(tmp_dir)

# We generate a new directory at each invocation, otherwise we run the
# risk of conflicting with a previous run without warning.
buckets_dir = tempfile.mkdtemp(suffix='tokenize', dir=tmp_dir)

bucketlist_path = args.output if args.output else os.path.join(args.model, 'bucketlist')
index_path = os.path.join(args.model, 'paths')

# display paths
print("index path:", index_path)
print("bucketlist path:", bucketlist_path)
print("buckets path:", buckets_dir)

if args.line:
    print("treating each LINE as a document")
# Try to determine the set of features to consider
if args.features:
    # Use a pre-determined feature list
    feat_path = args.features
elif os.path.exists(m_path('DFfeats')):
    # Use LDfeats
    feat_path = m_path('DFfeats')
else:
    raise ValueError("no suitable feature list")

# Where temp files go
if args.temp:
    buckets_dir = args.temp
else:
    buckets_dir = m_path('buckets')
makedir(buckets_dir)

all_langs = set()
pairs = []
for p in args.pairs:
    try:
        lang1, lang2 = p.split(',')
    except ValueError:
        # Did not unpack to two values
        parser.error("{0} is not a lang-pair".format(p))
    all_langs.add(lang1)
    all_langs.add(lang2)
    pairs.append((lang1, lang2))

if args.output:
    makedir(args.output)
max_order = 4
min_domain = 1
model = None
no_domain_ig = False
proportion = 1.0
sample_count = None
sample_size = 140
temp = None
word = False

if __name__ == "__main__":
    data_path = "../data"
    corpus_name = os.path.basename(data_path)
    model_dir = os.path.join('.', corpus_name + '.model')
    makedir(model_dir)

    # Initialize the corpus indexer.
    # Inputs: data path, minimum number of domains, training proportion, languages, domains, line mode
    print("Starting to index the corpus ......")
    indexer = CorpusIndexer(data_path, min_domain=1, proportion=1.0,
                            langs=None, domains=None, line_level=False)

    # Compute the mappings between files, languages and domains
    lang_dist = indexer.dist_lang
    lang_index = indexer.lang_index
    lang_info = ' '.join(
        ("{0}({1})".format(k, lang_dist[v]) for k, v in lang_index.items()))
group.add_argument("--sample_count", type=int,
                   help="number of samples for sampling-based tokenization", default=None)

args = parser.parse_args()

if args.sample_count and args.line:
    parser.error("sampling in line mode is not implemented")

if args.temp:
    buckets_dir = args.temp
else:
    buckets_dir = os.path.join(args.model, 'buckets')
makedir(buckets_dir)

bucketlist_path = args.output if args.output else os.path.join(args.model, 'bucketlist')
index_path = os.path.join(args.model, 'paths')

# display paths
print("index path:", index_path)
print("bucketlist path:", bucketlist_path)
print("buckets path:", buckets_dir)

if args.line:
    print("treating each LINE as a document")

with open(index_path) as f:
    reader = csv.reader(f)
)
parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR")

args = parser.parse_args()

if args.df_feats and args.ld_feats:
    parser.error("--df_feats and --ld_feats are mutually exclusive")

corpus_name = os.path.basename(args.corpus)
if args.model:
    model_dir = args.model
else:
    model_dir = os.path.join(".", corpus_name + ".model")
makedir(model_dir)

langs_path = os.path.join(model_dir, "lang_index")
domains_path = os.path.join(model_dir, "domain_index")
index_path = os.path.join(model_dir, "paths")

# display paths
print("corpus path:", args.corpus)
print("model path:", model_dir)

indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain,
                        proportion=args.proportion,
                        langs=args.lang, domains=args.domain)

# Compute mappings between files, languages and domains
lang_dist = indexer.dist_lang
                        type=bool, default=True,
                        help='Whether to print logging information about mesh scales.')
    return parser


if __name__ == '__main__':
    parser = get_parser()
    options = parser.parse_args()

    scale_tools = Scale(options)
    fusion_tools = Fusion(options)

    assert os.path.exists(options.in_dir)
    common.makedir(options.scale_dir)
    common.makedir(options.depth_dir)
    common.makedir(options.out_dir)

    files_unfiltered = scale_tools.read_directory(options.in_dir)
    files = [file for file in files_unfiltered if '.off' in file]
    print('= Found %s OFFs in %s' % (len(files), options.in_dir))
    print(files)

    timer = common.Timer()
    Rs = fusion_tools.get_views()

    for idx, filepath in enumerate(files):
        print('=== Processing %d/%d OFFs...' % (idx + 1, len(files)))
        off_file_out = os.path.join(options.out_dir, ntpath.basename(filepath)).replace(
args = parser.parse_args()

if args.sample_count and args.line:
    parser.error("sampling in line mode is not implemented")

if args.df_feats and args.ld_feats:
    parser.error("--df_feats and --ld_feats are mutually exclusive")

corpus_name = os.path.basename(args.corpus)
if args.model:
    model_dir = args.model
else:
    model_dir = os.path.join('.', corpus_name + '.model')
makedir(model_dir)

# display paths
print("corpus path:", args.corpus)
print("model path:", model_dir)

indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain,
                        proportion=args.proportion,
                        langs=args.lang, domains=args.domain,
                        line_level=args.line)

# Compute mappings between files, languages and domains
lang_dist = indexer.dist_lang
lang_index = indexer.lang_index
lang_info = ' '.join("{0}({1})".format(k, lang_dist[v]) for k, v in lang_index.items())
print("langs({0}): {1}".format(len(lang_dist), lang_info))

domain_dist = indexer.dist_domain
def run_fuse(self): """ Run fusion. """ assert os.path.exists(self.options.depth_dir) common.makedir(self.options.out_dir) common.makedir('4_tsdf') files = self.read_directory(self.options.depth_dir) timer = common.Timer() Rs = self.get_views() for filepath in files: # As rendering might be slower, we wait for rendering to finish. # This allows to run rendering and fusing in parallel (more or less). depths = common.read_hdf5(filepath) timer.reset() tsdf = self.fusion(depths, Rs) tsdf = tsdf[0] vertices, triangles = libmcubes.marching_cubes(-tsdf, 0) # vertices, triangles, _, _ = measure.marching_cubes_lewiner(-tsdf, 0) print tsdf.shape np.save(os.path.join('4_tsdf', ntpath.basename(filepath)[:-3]).replace('.off', ''), -tsdf) vertices /= self.options.resolution vertices -= 0.5 off_file = os.path.join(self.options.out_dir, ntpath.basename(filepath)[:-3]) libmcubes.export_off(vertices, triangles, off_file) print('[Data] wrote %s (%f seconds)' % (off_file, timer.elapsed())) mesh = common.Mesh.from_off(off_file) s_t = scipy.io.loadmat(off_file.replace('2_watertight', '1_s_t').replace('.off', '.mat')) # scales_ori = (1./s_t['scales'][0][0], 1./s_t['scales'][0][1], 1./s_t['scales'][0][2]) # translation_ori = (-s_t['translation'][0][0], -s_t['translation'][0][1], -s_t['translation'][0][2]) sizes_ori = (s_t['sizes'][0][0], s_t['sizes'][0][1], s_t['sizes'][0][2]) # print scales, translation min, max = mesh.extents() total_min = np.min(np.array(min)) total_max = np.max(np.array(max)) # Set the center (although this should usually be the origin already). centers = ( (min[0] + max[0]) / 2, (min[1] + max[1]) / 2, (min[2] + max[2]) / 2 ) # Scales all dimensions equally. sizes = ( total_max - total_min, total_max - total_min, total_max - total_min ) translation = ( -centers[0], -centers[1], -centers[2] ) mesh.translate(translation) mesh.scale((sizes_ori[0]/sizes[0], sizes_ori[1]/sizes[1], sizes_ori[2]/sizes[2])) mesh.to_off(off_file)