def __init__(self, Data_dir, stage, ratio=(1, 0, 0), seed=1000):
    random.seed(seed)
    self.Data_dir = Data_dir
    Data_list0 = read_txt('./lookuptxt/', 'ADNI_1.5T_GAN_NL.txt')
    # Data_list1 = read_txt('./lookuptxt/', 'ADNI_1.5T_GAN_MCI.txt')
    Data_list2 = read_txt('./lookuptxt/', 'ADNI_1.5T_GAN_AD.txt')
    Data_list3 = read_txt('./lookuptxt/', 'ADNI_3T_NL.txt')
    # Data_list4 = read_txt('./lookuptxt/', 'ADNI_3T_MCI.txt')
    Data_list5 = read_txt('./lookuptxt/', 'ADNI_3T_AD.txt')
    self.Data_list_lo = Data_list0 + Data_list2
    self.Data_list_hi = Data_list3 + Data_list5
    self.Label_list = [0] * len(Data_list0) + [1] * len(Data_list2)
    self.stage = stage
    self.length = len(self.Data_list_lo)
    self.patchsampler = PatchGenerator(patch_size=47)
    idxs = list(range(self.length))
    random.shuffle(idxs)
    split1 = int(self.length * ratio[0])
    split2 = int(self.length * (ratio[0] + ratio[1]))
    if self.stage in ('train_p', 'train_w'):
        self.index_list = idxs[:split1]
    elif self.stage == 'valid':
        self.index_list = idxs[split1:split2]
    elif self.stage == 'test':
        self.index_list = idxs[split2:]
    elif self.stage == 'all':
        self.index_list = idxs
    else:
        raise ValueError('invalid stage setting')
def __init__(self, Data_dir, class1, class2, stage, ratio=(0.6, 0.2, 0.2), seed=1000, shuffle=True):
    random.seed(seed)
    self.Data_dir = Data_dir
    Data_list0 = read_txt('./lookuptxt/', class1 + '.txt')
    Data_list1 = read_txt('./lookuptxt/', class2 + '.txt')
    self.Data_list = Data_list0 + Data_list1
    self.Label_list = [0] * len(Data_list0) + [1] * len(Data_list1)
    self.stage = stage
    self.length = len(self.Data_list)
    idxs = list(range(self.length))
    if shuffle:
        random.shuffle(idxs)
    split1 = int(self.length * ratio[0])
    split2 = int(self.length * (ratio[0] + ratio[1]))
    if self.stage == 'train':
        self.index_list = idxs[:split1]
    elif self.stage == 'valid':
        self.index_list = idxs[split1:split2]
    elif self.stage == 'test':
        self.index_list = idxs[split2:]
    elif self.stage == 'all':
        self.index_list = idxs
    else:
        raise ValueError('invalid stage setting')
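# Hedged sketch of the split arithmetic used by the two constructors above
# (assumption: the ratio entries sum to 1; this helper is illustrative and not
# part of the original classes). For length=10 and ratio=(0.6, 0.2, 0.2) the
# slices are idxs[:6], idxs[6:8], idxs[8:].
import random

def split_indices(length, ratio=(0.6, 0.2, 0.2), seed=1000):
    idxs = list(range(length))
    random.Random(seed).shuffle(idxs)
    split1 = int(length * ratio[0])
    split2 = int(length * (ratio[0] + ratio[1]))
    return idxs[:split1], idxs[split1:split2], idxs[split2:]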
def load_test_data(img_shape=(224, 224)):
    class_map = {'normal': 0, 'pneumonia': 1, 'COVID-19': 2}
    dataset_path = Paths.DATASET_BASE_PATH + 'data/test/'
    csv_content = utils.read_txt(Paths.DATASET_BASE_PATH + 'COVID-Net/test_COVIDx.txt')
    _x_test_paths = []
    _y_test = []
    for c in csv_content:
        c = c.split(' ')
        _y_test.append(class_map[c[-1].replace('\n', '')])
        _x_test_paths.append(dataset_path + c[-2])
    dataset_path = '/media/share/pedro/2020-Covid/data/'
    csv_content = utils.read_txt('/media/share/pedro/2020-Covid/data/test.txt')
    for c in csv_content:
        _y_test.append(class_map['COVID-19'])
        _x_test_paths.append(dataset_path + c.replace('\n', ''))
    _y_test = np.asarray(_y_test)
    _x_test = utils.load_images(_x_test_paths, img_shape) / 255.
    return _x_test, _y_test
def get_txt_content(ans_txt_path_list: list, pending_txt_dir: str) -> list:
    txt_content = []
    for a_txt_path in ans_txt_path_list:
        txt_name = os.path.basename(a_txt_path)
        p_txt_path = os.path.join(pending_txt_dir, txt_name)
        a_txt_content = utils.read_txt(a_txt_path)
        if os.path.exists(p_txt_path):
            p_txt_content = utils.read_txt(p_txt_path)
            txt_content.append((a_txt_content, p_txt_content))
        else:
            txt_content.append((a_txt_content, []))
            # warning text: "{} does not exist, please check"
            st.warning("{}不存在,请检查".format(a_txt_path))
    return txt_content
def load_and_cache_examples(args, tokenizer, logger, mode="train"):
    """SemEval2010Task8 doesn't have a dev set."""
    assert mode in ["train", "test", "dev"]
    if not os.path.exists(args.data_cache_dir):
        os.mkdir(args.data_cache_dir)
    cached_examples_file = os.path.join(
        args.data_cache_dir,
        "cached_{}_{}_{}_{}".format(args.dataset, mode, args.entity_position_encoding, str(args.max_seq_length)),
    )
    if os.path.exists(cached_examples_file):
        logger.info("Loading features from cached file %s", cached_examples_file)
        examples = torch.load(cached_examples_file)
    else:
        logger.info("Creating features for %s %s set" % (args.dataset, mode))
        if args.dataset == 'kbp37':
            _, train_sentences, train_relations = read_txt(os.path.join(KBP37RawPath, "train.txt"))
            _, dev_sentences, dev_relations = read_txt(os.path.join(KBP37RawPath, "dev.txt"))
            _, test_sentences, test_relations = read_txt(os.path.join(KBP37RawPath, "test.txt"))
            if not args.kbp37_split_dev:
                train_sentences.extend(dev_sentences)
                train_relations.extend(dev_relations)
        else:
            _, train_sentences, train_relations = read_txt(os.path.join(SemEval2010Task8RawPath, "train.txt"))
            _, test_sentences, test_relations = read_txt(os.path.join(SemEval2010Task8RawPath, "test.txt"))
        relation2id_path = KBP37Relation2IdPath if args.dataset == "kbp37" else SemEval2010Relation2IdPath
        if os.path.exists(relation2id_path):
            with open(relation2id_path, 'r', encoding='utf8') as f:
                relation2id = json.load(f)
        else:
            relation2id, _ = static_relations(train_relations)
            with open(relation2id_path, 'w', encoding='utf8') as f:
                json.dump(relation2id, f)
        if mode == 'train':
            sentences, relations = train_sentences, train_relations
        elif mode == 'test':
            sentences, relations = test_sentences, test_relations
        else:
            if args.dataset == 'kbp37':
                sentences, relations = dev_sentences, dev_relations
            else:
                raise ValueError("SemEval2010Task8 doesn't have a dev set!")
        examples = create_examples_for_xlnet(sentences, relations, tokenizer, relation2id,
                                             args.entity_position_encoding == "entity_tag",
                                             args.entity_position_encoding == "token_type_ids",
                                             args.max_seq_length)
        torch.save(examples, cached_examples_file)
    return examples
def _create_dataset(input_dir, filenames, output_path):
    count = 0
    writer = tf.python_io.TFRecordWriter(output_path + '.cache')
    random.shuffle(filenames)
    for i, filename in enumerate(filenames):
        wave_path = input_dir + filename[0]
        txt_path = input_dir + filename[1]
        stem = os.path.splitext(os.path.split(filename[0])[-1])[0]
        wave = utils.read_wave(wave_path)
        text = utils.read_txt(txt_path)
        if len(wave) >= len(text):
            data = tf.train.Example(features=tf.train.Features(feature={
                'uid': tf.train.Feature(bytes_list=tf.train.BytesList(value=[stem.encode('utf-8')])),
                'audio/data': tf.train.Feature(float_list=tf.train.FloatList(value=wave.reshape([-1]).tolist())),
                'audio/shape': tf.train.Feature(int64_list=tf.train.Int64List(value=wave.shape)),
                'text': tf.train.Feature(int64_list=tf.train.Int64List(value=text)),
            }))
            writer.write(data.SerializeToString())
        else:
            glog.error("length of label(%d) is greater than feature(%d) at %s." % (len(text), len(wave), stem))
        count = i + 1
        if count % 50 == 0:
            glog.info('processed %d/%d files.' % (count, len(filenames)))
    if count % 1000 != 0:
        glog.info('processed %d/%d files.' % (count, len(filenames)))
    writer.close()  # flush and close the cache file before replacing the target
    if os.path.exists(output_path):
        os.remove(output_path)
    if os.path.exists(output_path + '.cache'):
        os.renames(output_path + '.cache', output_path)
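# Hedged read-back sketch (assumption: TensorFlow 1.x, matching the tf.python_io
# writer above; not part of the original code). It iterates the written records
# and recovers fields through the Example protobuf, without building a graph.
import tensorflow as tf

def inspect_records(tfrecord_path):
    for raw in tf.python_io.tf_record_iterator(tfrecord_path):
        example = tf.train.Example.FromString(raw)
        uid = example.features.feature['uid'].bytes_list.value[0].decode('utf-8')
        text = list(example.features.feature['text'].int64_list.value)
        print(uid, len(text))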
def run(args):
    doc = read_txt(args.path_to_doc)
    doc_tokens = [
        process_text(entry,
                     lower=not args.cased,
                     remove_stopwords=args.remove_stopwords,
                     remove_punctuation=args.remove_punctuation)
        for entry in doc
    ]
    all_tokens = []
    for entry_tokens in doc_tokens:
        all_tokens += entry_tokens
    rare_tokens, selected_tokens = get_rare_tokens(all_tokens, args.min_freq, args.max_tokens, return_non_rare=True)
    if args.remove_rare:
        doc_tokens = [filter_tokens(entry_tokens, set(rare_tokens)) for entry_tokens in doc_tokens]
    gu = GloVeUtility(args.path_to_glove)
    vectorizer = CountVectorizer(ngram_range=(args.ngram_lower, args.ngram_upper), vocabulary=selected_tokens)
    count_vector = vectorizer.fit_transform([" ".join(entry_tokens) for entry_tokens in doc_tokens])
    csr_mat = count_vector.T * count_vector
    csr_mat.setdiag(0)
    cooccur_ar = csr_mat.toarray()
    mittens_model = Mittens(n=gu.d, max_iter=args.iter)
    embeddings = mittens_model.fit(cooccur_ar, vocab=selected_tokens, initial_embedding_dict=gu.vector_dict)
    filename = args.path_to_glove.split(os.path.sep)[-1]
    os.makedirs(args.output, exist_ok=True)
    embeddings_dict = dict(zip(selected_tokens, embeddings))
    progress_bar.std_print("\nTrained on {} tokens.".format(len(embeddings_dict)))
    if args.save_new_only:
        savepath = os.path.join(args.output, "new_" + filename)
        embeddings_list = [
            " ".join([key] + [str(val) for val in embeddings_dict[key]])
            for key in embeddings_dict
        ]
        write_txt(savepath, embeddings_list)
    else:
        savepath = os.path.join(args.output, filename)
        gu.add_replace_vectors(embeddings_dict)
        gu.save_vectors(savepath)
def __init__(self, root_dir, txt_COVID, txt_NonCOVID, transform=None):
    """
    Args:
        txt_path (string): Path to the txt file with annotations.
        root_dir (string): Directory with all the images.
        transform (callable, optional): Optional transform to be applied on a sample.
    File structure:
        - root_dir
            - CT_COVID
                - img1.png
                - img2.png
                - ......
            - CT_NonCOVID
                - img1.png
                - img2.png
                - ......
    """
    self.root_dir = root_dir
    self.txt_path = [txt_COVID, txt_NonCOVID]
    self.classes = ['CT_COVID', 'CT_NonCOVID']
    self.num_cls = len(self.classes)
    self.img_list = []
    for c in range(self.num_cls):
        cls_list = [[os.path.join(self.root_dir, self.classes[c], item), c]
                    for item in read_txt(self.txt_path[c])]
        self.img_list += cls_list
    self.transform = transform
    print('samples = ', len(self.img_list))
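# The dataset constructors above assume a read_txt helper that yields one
# stripped image name per line. A minimal sketch of such a helper follows; it is
# an assumption for illustration (the real implementation is not shown here) and
# is therefore named read_txt_sketch rather than read_txt.
from typing import List

def read_txt_sketch(txt_path: str) -> List[str]:
    # Return the non-empty, whitespace-stripped lines of a text file.
    with open(txt_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]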
def __init__(self, logging_groups=['Default', 'Cortex']):
    Cortex.__init__(self, 'SOUL', logging_groups)
    # The following is parsing the defines.txt to get the app specific data
    defines = utils.parse_defines(utils.read_txt("../database/cortex/soul.txt"))
    self.delay = eval(defines['delay'])
def get_index(filename):
    if os.path.exists(filename):
        movie_id, total = map(int, utils.read_txt(filename).split('\n')[0].split(','))
    else:
        movie_id, total = 129406, 0
    print(movie_id, total)
    return [movie_id, total]
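# Hedged counterpart sketch (assumption, not part of the original code): persist
# the crawl position in the same single-line "movie_id,total" format that
# get_index() parses above.
def save_index(filename, movie_id, total):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('{},{}\n'.format(movie_id, total))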
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--feats", default=None, required=True,
                        help="name of the list of hdf5 files")
    parser.add_argument("--stats", default=None, required=True,
                        help="filename of hdf5 format")
    args = parser.parse_args()

    # read list and define scaler
    filenames = read_txt(args.feats)
    scaler = StandardScaler()
    print("number of training utterances =", len(filenames))

    # process over all of data
    for filename in filenames:
        feat = read_hdf5(filename, "/feat_org")
        scaler.partial_fit(feat[:, 1:])

    # add uv term
    mean = np.zeros((feat.shape[1]))
    scale = np.ones((feat.shape[1]))
    mean[1:] = scaler.mean_
    scale[1:] = scaler.scale_

    # write to hdf5
    write_hdf5(args.stats, "/mean", mean)
    write_hdf5(args.stats, "/scale", scale)
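# Hedged sketch of how the saved statistics would typically be applied downstream
# (assumption: features are standardized as (feat - mean) / scale, mirroring
# sklearn's StandardScaler; the consumer code is not shown in this file). The
# first (uv) column keeps its raw value because mean=0 and scale=1 were written for it.
import numpy as np

def apply_stats(feat: np.ndarray, mean: np.ndarray, scale: np.ndarray) -> np.ndarray:
    return (feat - mean) / scale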
def Nonlinear_Test(models):
    print("Load the validation data...")
    start_time = time.time()
    val_imgs, val_idxs = load_val_data(data_dir)
    print("{:.4f} seconds".format(time.time() - start_time))
    del val_imgs

    print("Extract the image features...")
    val_features = np.load('./val_bow.npy')

    print('Test the classifiers...')
    accuracy = 0
    for class_name in category:
        target_idxs = np.array([read_txt(os.path.join(data_dir, '{}_val.txt'.format(class_name)))])
        target_labels = get_labels(val_idxs, target_idxs)
        val_accuracy = models[class_name].score(val_features, target_labels)
        print('{} Classifier validation accuracy: {:.4f}'.format(class_name, val_accuracy))
        accuracy += val_accuracy
    del val_features, target_idxs, target_labels
    print('Average validation accuracy: {:.4f}'.format(accuracy / len(category)))
def Nonlinear_Trainer():
    print("Load the training data...")
    start_time = time.time()
    train_imgs, train_idxs = load_train_data(data_dir)
    del train_imgs
    print("{:.4f} seconds".format(time.time() - start_time))

    print("Extract the image features...")
    train_features = np.load('./train_bow.npy')

    print('Train the classifiers...')
    accuracy = 0
    models = {}
    for class_name in category:
        target_idxs = np.array([read_txt(os.path.join(data_dir, '{}_train.txt'.format(class_name)))])
        target_labels = get_labels(train_idxs, target_idxs)
        models[class_name] = nonlinear_classifier(train_features, target_labels)
        train_accuracy = models[class_name].score(train_features, target_labels)
        print('{} Classifier train accuracy: {:.4f}'.format(class_name, train_accuracy))
        accuracy += train_accuracy
    print('Average train accuracy: {:.4f}'.format(accuracy / len(category)))
    del train_features, target_labels, target_idxs
    return models
def __init__(self, config, mode):
    """
    Args:
        txt_path (string): Path to the txt file with annotations.
        root_dir (string): Directory with all the images.
    File structure:
        - root_dir
            - CT_COVID
                - img1.png
                - img2.png
                - ......
            - CT_NonCOVID
                - img1.png
                - img2.png
                - ......
    """
    self.config = config
    self.root = self.config.dataset.input_data
    if mode == 'train':
        self.txt_path = [train_txt_COVID, train_txt_NonCOVID]
    elif mode == 'val':
        self.txt_path = [val_txt_COVID, val_txt_NonCOVID]
    elif mode == 'test':
        self.txt_path = [test_txt_COVID, test_txt_NonCOVID]
    self.class_dict = {'CT_COVID': 0, 'CT_NonCOVID': 1}
    self.classes = list(self.class_dict.keys())
    self.num_cls = len(self.class_dict)
    self.img_list = []
    for c in range(self.num_cls):
        # build the path from the class name, not the integer label
        cls_list = [[os.path.join(self.root, self.classes[c], item), c]
                    for item in read_txt(self.txt_path[c])]
        self.img_list += cls_list
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transformer = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop((224), scale=(0.8, 1.2)),
        transforms.RandomRotation(15),
        transforms.RandomHorizontalFlip(p=0.1),
        transforms.RandomVerticalFlip(p=0.1),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
        transforms.ToTensor(),
        normalize
    ])
    val_transformer = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize
    ])
    if mode == 'train':
        self.transform = train_transformer
    else:
        self.transform = val_transformer
    print('samples = ', len(self.img_list))
def load_documents():
    txt_list = []
    for document in sys.argv[2:]:
        txt = read_txt(document)
        txt_list.append(txt)
    dic = {'filename': sys.argv[2:], 'raw_text': txt_list}
    return pd.DataFrame.from_dict(dic)
def copy_covid_dataset():
    output_path = Paths.DATASET_BASE_PATH + 'data/augmented-orig/'
    dataset_path = Paths.DATASET_BASE_PATH + 'data/train/'
    for c in utils.read_txt(Paths.DATASET_BASE_PATH + 'COVID-Net/train_COVIDx.txt'):
        c = c.split(' ')
        if 'COVID-19' in c[-1]:
            img = cv2.imread(dataset_path + c[-2], cv2.IMREAD_COLOR)
            cv2.imwrite(output_path + c[-2], img)  # write png image
def check_if_all_files_exist(data_type):
    csv_content = utils.read_txt(Paths.DATASET_BASE_PATH + 'COVID-Net/{}_COVIDx.txt'.format(data_type))
    imgs_path = Paths.DATASET_BASE_PATH + 'data/{}/'.format(data_type)
    for c in csv_content:
        img_path = c.split(' ')[-2]
        if not os.path.exists(imgs_path + img_path):
            print('The following image was not found [{}].'.format(img_path))
            shutil.move(imgs_path.replace('/test/', '/train/') + img_path, imgs_path + img_path)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--feats", default=None, required=True, type=str,
                        help="name of the list of hdf5 files")
    parser.add_argument("--stats", default=None, required=True, type=str,
                        help="filename of hdf5 format")
    parser.add_argument("--feature_type", default="world",
                        choices=["world", "melspc", "mcep"], type=str,
                        help="feature type")
    parser.add_argument("--verbose", default=1, type=int,
                        help="log message level")
    args = parser.parse_args()

    # set log level
    if args.verbose == 1:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(
            level=logging.WARN,
            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warn("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # read file list
    file_list = read_txt(args.feats)
    logging.info("number of utterances = %d" % len(file_list))

    # calculate statistics
    calc_stats(file_list, args)
def __init__(self, screen, logging_groups=['Default', 'Cortex']):
    Cortex.__init__(self, 'Collector', logging_groups)
    # Current collection platform
    self.platform = GroupMe_Web(screen, logging_groups)
    # The following is parsing the defines.txt to get the app specific data
    defines = utils.parse_defines(utils.read_txt("../database/cortex/collector.txt"))
    self.alias_dict = eval(defines['member_alias'])
def __init__(self, mode, root_dir, txt_COVID, txt_NonCOVID, transform=None):
    """
    Args:
        txt_path (string): Path to the txt file with annotations.
        root_dir (string): Directory with all the images.
        transform (callable, optional): Optional transform to be applied on a sample.
    File structure:
        - root_dir
            - CT_COVID
                - img1.png
                - img2.png
                - ......
            - CT_NonCOVID
                - img1.png
                - img2.png
                - ......
    """
    self.root_dir = root_dir
    self.txt_path = [txt_COVID, txt_NonCOVID]
    self.classes = ['CT_COVID', 'CT_NonCOVID']
    self.num_cls = len(self.classes)
    self.img_list = []
    for c in range(self.num_cls):
        cls_list = [[os.path.join(self.root_dir, self.classes[c], item), c]
                    for item in read_txt(self.txt_path[c])]
        self.img_list += cls_list
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transformer = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomResizedCrop((224), scale=(0.5, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ])
    val_transformer = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize
    ])
    if mode == 'train':
        self.transform = train_transformer
    else:
        self.transform = val_transformer
    print('samples = ', len(self.img_list))
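# Hedged usage sketch (assumptions: this __init__ belongs to a
# torch.utils.data.Dataset subclass with matching __len__/__getitem__; the class
# name CovidCTDataset and the file paths below are hypothetical).
# from torch.utils.data import DataLoader
# train_set = CovidCTDataset('train', './Images', 'trainCT_COVID.txt', 'trainCT_NonCOVID.txt')
# train_loader = DataLoader(train_set, batch_size=16, shuffle=True)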
def main():
    # parse arguments
    args = _get_arguments()

    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARN,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warn("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # check directory existence
    if not os.path.exists(os.path.dirname(args.writedir)):
        os.makedirs(os.path.dirname(args.writedir))

    # get file list
    if os.path.isdir(args.feats):
        feat_list = sorted(find_files(args.feats, "*.%s" % args.feature_format))
    elif os.path.isfile(args.feats):
        feat_list = read_txt(args.feats)
    else:
        logging.error("--feats should be directory or list.")
        sys.exit(1)
    feat_ids = [os.path.basename(f).replace(".%s" % args.feature_format, "") for f in feat_list]
    logging.info("number of utterances = %d" % len(feat_ids))

    # divide list
    feat_ids = np.array_split(feat_ids, args.n_jobs)
    feat_ids = [f_ids.tolist() for f_ids in feat_ids]

    # multi processing
    processes = []
    for f in feat_ids:
        p = mp.Process(target=noise_shaping, args=(f, args,))
        p.start()
        processes.append(p)

    # wait for all processes
    for p in processes:
        p.join()
def import_call(self, e):
    if setting_fftool.has_query:
        utils.showinfo("有任务正在执行,请稍后")  # "A task is already running, please wait"
        return
    tup = tuple([])
    ft = self.file_types
    ft_tup = self.file_types_tup
    if e.widget == self.import_btn:
        tup = filedialog.askopenfilenames(filetypes=ft,
                                          title='导入文件',  # "import files"
                                          initialdir=setting_fftool.last_folder)
    elif e.widget == self.import_list_btn:
        if os.path.exists(setting_fftool.list_file):
            arr = utils.read_txt(setting_fftool.list_file)
            new_arr = []
            for f in arr:
                if os.path.exists(f):
                    new_arr.append(f)
            if not len(new_arr):
                # "none of the paths in the txt file are valid"
                utils.showinfo('txt中的地址都不正确' + setting_fftool.list_file)
                return
            tup = tuple(new_arr)
    elif e.widget == self.import_dir_btn:
        folder = filedialog.askdirectory(title='选择目录',  # "choose a directory"
                                         initialdir=setting_fftool.last_folder)
        if folder:
            folder = utils.pathlib_path(folder)
            setting_fftool.last_folder = folder
            arr = []
            new_arr = []
            # list all files under the directory
            utils.list_dir(folder, arr)
            # keep only files with the expected extensions
            for f in arr:
                suffix = str(Path(f).suffix)
                for f_type in ft_tup:
                    if suffix == f_type:
                        new_arr.append(f)
                        break
            tup = tuple(new_arr)
    if len(tup):
        tup = utils.pathlib_path_tup(tup, True)
        self.tree.set_list(list(tup))
        # self.start.set_state(True)
        # self.clear_query()
        setting_fftool.last_folder = utils.pathlib_parent(tup[0])
def __getitem__(self, idx):
    if self.mode == 'train':
        filenames = self.train_filenames[idx]
    else:
        filenames = self.test_filenames[idx]
    wave_path = self.cfg.dataset + filenames[0]
    txt_path = self.cfg.dataset + filenames[1]
    try:
        text_tmp = utils.read_txt(txt_path)    # list
        wave_tmp = utils.read_wave(wave_path)  # numpy
    except OSError:
        print(txt_path)
        print(wave_path)
        return self.__getitem__(0)
    wave_tmp = torch.from_numpy(wave_tmp)
    wave = torch.zeros([40, self.max_wave])  # 512 may be too short; fix it if errors occur
    length_wave = wave_tmp.shape[1]
    # print(length_wave)
    wave[:, :length_wave] = wave_tmp
    # print(txt_path)
    while 27 in text_tmp:
        text_tmp.remove(27)
    length_text = len(text_tmp)
    text_tmp = torch.tensor(text_tmp)
    text = torch.zeros([self.max_text])  # 256 may be too short; fix it if errors occur
    text[:length_text] = text_tmp
    name = filenames[0].split('/')[-1]
    if length_text >= length_wave:
        sample = {
            'name': name,
            'wave': torch.zeros([40, self.max_wave], dtype=torch.float),
            'text': torch.zeros([self.max_text], dtype=torch.float),
            'length_wave': self.max_wave,
            'length_text': self.max_text
        }
    else:
        sample = {
            'name': name,
            'wave': wave,
            'text': text,
            'length_wave': length_wave,
            'length_text': length_text
        }
    return sample
def prepare_adjoint(config):
    # mkdir outputbase
    outputbase = config["outputdir"]
    clean_outputdir(outputbase)

    eventlist_file = config["eventlist"]
    eventlist = read_txt(eventlist_file)

    # split the eventlist and dump into separate files
    nevents_per_job = config["nevents_per_job"]
    cmtlist_per_job = split_job(eventlist, nevents_per_job)
    print("-" * 10 + "\nJob list:\n%s" % cmtlist_per_job)

    print("=" * 20 + "\nPreparing jobs...")
    for job_id, cmtlist in cmtlist_per_job.items():
        prepare_one_job(job_id, cmtlist, config)
def find_end_curly_bracket(file_path, start_line, end_line):
    content = utils.read_txt(file_path)
    current_line = end_line - 1
    start_line = start_line - 1
    tag = False
    while current_line >= start_line:
        if len(content[current_line]) > 0 and content[current_line][0] == '}':
            tag = True
            break
        current_line = current_line - 1
    if tag:
        return current_line + 1
    raise Exception("ERROR: Cannot find the last curly bracket within [%d, %d]" % (start_line, end_line))
def load_url_by_file(_fileNum) -> List[Tuple[str, str]]:
    """
    Load news URLs from a file.
    :param _fileNum: index of the news outlet
    :return: zip(naver url, origin url)
    """
    urldatas = read_txt(DATA_DIR + "{}.txt".format(_fileNum))
    # each line is "naver @@@ origin" -> (naver, origin)
    urlList = list(zip(
        [url.split('@@@')[NAVER_IDX] for url in urldatas],
        [url.split('@@@')[ORIGIN_IDX] for url in urldatas]
    ))
    log('s', "Loaded {} URLs from {}.txt ... Ex) {}".format(len(urlList), _fileNum, urlList[0][ORIGIN_IDX]))
    return urlList
def main():
    # parse arguments
    args = _get_arguments()

    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARN,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warn("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # read list
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)
    logging.info("number of utterances = %d" % len(file_list))
    wav_set = 'wav_%s_%s' % (args.feature_format, args.wavtype)

    # create file folders
    filepath_create(file_list, wav_set)

    # divide list
    file_lists = np.array_split(file_list, args.n_jobs)
    file_lists = [f_list.tolist() for f_list in file_lists]

    # multi processing
    processes = []
    for f in file_lists:
        p = mp.Process(target=noise_shaping, args=(f, wav_set, args,))
        p.start()
        processes.append(p)

    # wait for all processes
    for p in processes:
        p.join()
def main():
    if len(sys.argv) != 3:
        print('Usage: python3 crawl_cnki.py start end')
        return
    else:
        start, end = int(sys.argv[1]), int(sys.argv[2])
    start_time = time.time()
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    # Convert the journal names in the Excel file to a txt file,
    # then read the journal names in the range [start, end]
    ori_src, new_src = './待爬取数据.xlsx', './journals.txt'
    utils.excel2txt(ori_src, new_src)
    journals = utils.read_txt(new_src, start, end)

    # Create the directory used to store the output results
    output_dir = './publish_numbers'
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    start_year, end_year = 2012, 2020
    cnt, total = 0, end - start + 1
    succeed = failed = skipped = 0
    for i in range(end - start + 1):
        cnt += 1
        output_file = output_dir + '/' + journals[i] + '.xlsx'
        if not Path(output_file).is_file():
            if start_crawl(journals[i], start_year, end_year, output_file):
                succeed += 1
            else:
                failed += 1
            print('Progress: {}/{}, succeed: {}, failed: {}, skipped: {}, used time: {}'
                  .format(cnt, total, succeed, failed, skipped, time.time() - start_time))
        else:
            skipped += 1
    print('Finished crawl. Total succeed: {}, total failed: {}, total skipped: {}, total used time: {}'
          .format(succeed, failed, skipped, time.time() - start_time))
def prepare_ami_mdm(ami_mdm_location, audio_path, text_path, lists_path, processes):
    for f in ['dev', 'test', 'train']:
        dst_list = os.path.join(lists_path, f"ami-mdm-{f}.lst")
        dst_text = os.path.join(text_path, f"ami-mdm-{f}.txt")
        if not os.path.exists(dst_list):
            with Pool(processes) as p:
                to_list = partial(ami_mdm_to_list, audio_path, ami_mdm_location)
                rows = read_txt(os.path.join(ami_mdm_location, f))
                samples = list(tqdm(
                    p.imap(to_list, rows),
                    total=len(rows),
                ))
            with open(dst_list, "w") as list_f:
                list_f.writelines(samples)
            with open(dst_list, "r") as list_f, open(dst_text, "w") as text_f:
                for line in list_f:
                    text_f.write(" ".join(line.strip().split(" ")[3:]) + "\n")
        else:
            print(f"{dst_list} exists, doing verify")
            new_list = []
            with open(dst_list, "r") as list_f:
                for line in list_f:
                    filename = line.split(" ")[1]
                    text = " ".join(line.strip().split(" ")[3:])
                    params = " ".join(line.strip().split(" ")[:3])
                    text = remove_punct(text)
                    line = f"{params} {text}\n"
                    if not os.path.exists(filename) or len(text) < 2 or not alpha.match(text):
                        print(f"{filename} does not exist or text is empty, text: {text}")
                    else:
                        new_list.append(line)
            with open(dst_list, "w") as list_f:
                list_f.writelines(new_list)
    print("Prepared AMI MDM8", flush=True)
def read_category(category):
    filename_list = []
    txt_list = []
    txt_len_list = []
    file_list = os.listdir(category)
    for file in file_list:
        filename = os.path.join(category, file)
        txt = read_txt(filename)
        filename_list.append(filename.replace('/', '_'))
        txt_list.append(txt)
        txt_len_list.append(len(txt))
    dic = {
        'category': [category] * len(filename_list),
        'filename': filename_list,
        'raw_text': txt_list,
        'raw_text_lenght': txt_len_list
    }
    return pd.DataFrame.from_dict(dic)
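# Hedged usage sketch (assumptions: the category folder names below are
# hypothetical and read_category() is the function defined above): concatenate
# the per-category frames into a single corpus DataFrame.
import pandas as pd

def read_categories(categories):
    return pd.concat([read_category(c) for c in categories], ignore_index=True)

# e.g. corpus = read_categories(['business', 'sport'])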
def copy_missing_files(data_type):
    output_path = Paths.DATASET_BASE_PATH + 'data/{}/'.format(data_type)
    train_dataset_path = [
        Paths.DATASET_BASE_PATH + 'covid-chestxray-dataset/images/',
        Paths.DATASET_BASE_PATH + 'rsna-pneumonia-detection-challenge/stage_2_{}_images/'.format(data_type)
    ]
    csv_content = utils.read_txt(Paths.DATASET_BASE_PATH + 'COVID-Net/{}_COVIDx.txt'.format(data_type))
    _x_train_paths = []
    for c in csv_content:
        full_path = None
        img_path = c.split(' ')[-2]
        if not img_path.endswith('.dcm'):
            if os.path.exists(train_dataset_path[0] + img_path):
                full_path = train_dataset_path[0] + img_path
            elif os.path.exists(train_dataset_path[1] + img_path):
                full_path = train_dataset_path[1] + img_path
        if full_path is not None:
            img = cv2.imread(full_path)
            cv2.imwrite(output_path + img_path, img)  # write png image
def prepare_proc_obsd(config):
    # mkdir outputbase
    outputbase = config["outputdir"]
    clean_outputdir(outputbase)

    # copy param files
    taglist = config["taglist"]
    copy_param_files(taglist, config["paramdir"], outputbase)

    # copy path files
    eventlist_file = config["eventlist"]
    eventlist = read_txt(eventlist_file)
    if len(eventlist) == 0:
        raise ValueError("No events found in file: %s" % eventlist_file)
    copy_path_files(eventlist, taglist, config["pathdir"], outputbase)

    # split the eventlist and dump into separate files
    nevents_per_job = config["nevents_per_job"]
    cmtlist_per_job = split_job(eventlist, nevents_per_job)
    dump_cmtlist(cmtlist_per_job, outputbase)

    # prepare job scripts
    prepare_job_scripts(cmtlist_per_job, config)
FILE_VIDEO = "/home/jcruz/Desktop/BIG_DATA/fish_videos/ana_faustino/Tu15_A__Cond1_side_2.avi"
START_T = 7 * 60 * 1000
(X_SCALE, X_TRANS) = (21.88392008, 271.97335871)
(Y_SCALE, Y_TRANS) = (-21.9054242, 407.59388039)


def on_mouse(event, x, y, flags, k):
    if event == 1:
        print "MOUSE:", x, y, flags

# # #

data = utils.read_txt(FILE_DATA)
cap = cv2.VideoCapture(FILE_VIDEO)
(x, y) = (10.5128689995, -7.9942489268)
cv2.namedWindow("x")
cv2.setMouseCallback("x", on_mouse)
i = 0
fpre = None
while True:
    tf = int(cap.get(cv2.cv.CV_CAP_PROP_POS_MSEC))
    td = int(data[i][0] * 1000.0)
files = utils.list_excel(DATA_DIR)
for name_xls in files:
    print name_xls
    # get the names of the result files
    name_txt = name_xls.replace(".xlsx", ".txt")
    name_sub = name_xls.replace(".xlsx", ".sub")
    name_rep = name_xls.replace(".xlsx", ".report")
    # convert from Excel to text
    if FORCE_XLS_TO_TXT or (not os.path.isfile(name_txt)):
        utils.excel_to_txt(name_xls, name_txt)
    # get the data from the text file
    data = np.array(utils.read_txt(name_txt))
    # round the data to clean up some noise
    data = np.round(np.array(utils.read_txt(name_txt)), 2)
    # trim the initial and final NaN values
    data = compute.trim(data, 1)
    data = compute.trim(data, 2)
    # interpolate some NaN values
    compute.interpolate(data, 1)
    compute.interpolate(data, 2)
    # smooth the position data
    data_smooth = data
    data_smooth[:, 1] = compute.smooth(data[:, 1], win_size=25)
for (key, files) in fpairs.items():
    if len(files) != 2:
        print "Missing file ('top' or 'side') in pair for assay '%s'" % key
        continue
    print "*****\n%s\n*****" % key
    name_sub = "%s/%s.sub" % (DATA_DIR, key)
    name_rep = "%s/%s.report" % (DATA_DIR, key)
    name_fig1 = "%s/%s_fig1.%s" % (DATA_DIR, key, FIG_FORMAT)
    name_fig2 = "%s/%s_fig2.%s" % (DATA_DIR, key, FIG_FORMAT)
    (trim_start, trim_end) = (None, None)
    for ftxt in files:
        # round the data to clean up some noise
        data = np.round(np.array(utils.read_txt(ftxt)), 2)
        # trim the initial and final NaN values
        (start, end) = compute.trim3(data)
        trim_start = start if (trim_start < start or trim_start is None) else trim_start
        trim_end = end if (trim_end > end or trim_end is None) else trim_end
        # interpolate some NaN values
        compute.interpolate3(data, start, end)
        if "_side" in ftxt:
            data_s = data
        elif "_top" in ftxt:
            data_t = data