def get_accounts_from_file():
    if os.path.isfile(CONFIG_ACCOUNTS_FILE):
        file_content = read_file(CONFIG_ACCOUNTS_FILE)
    else:
        file_content = read_file(CONFIG_DEFAULT_ACCOUNTS_FILE)
        write_file(CONFIG_ACCOUNTS_FILE, file_content)
    return ast.literal_eval(file_content)
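# Hypothetical sketch (not part of the snippets above/below) of the read_file /
# write_file helpers these functions assume: read_file returns the whole file
# as one string and write_file overwrites the target file. The real helpers may
# differ (encoding, error handling), and several snippets instead use a file_io
# module whose read_file returns a list of lines.
def read_file(path):
    # return the entire file content as a single string
    with open(path) as handle:
        return handle.read()


def write_file(path, content):
    # write a string to disk, replacing any existing file
    with open(path, 'w') as handle:
        handle.write(content)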
def get_wllr_ranking(domain):
    ranking = {}
    file_path = 'WLLR/{}.positive.wllr'.format(domain)
    lines = read_file(file_path).split('\n')
    lines.remove('')
    ranking['positive'] = [line.split('\t')[0] for line in lines]
    file_path = 'WLLR/{}.negative.wllr'.format(domain)
    lines = read_file(file_path).split('\n')
    lines.remove('')
    ranking['negative'] = [line.split('\t')[0] for line in lines]
    return ranking
def main():
    # read file
    observations, states = parse(train_filename)
    train_sentences, train_labels = read_file(train_filename)
    val_sentences, val_labels = read_file(val_filename)
    test_sentences, original_sentences, test_labels = read_file(test_filename, test=True)

    # preprocess
    token_mapping = get_token_mapping(observations)
    X_train = prepare_inputs(token_mapping, w2v_W, w2v_U, train_sentences)
    state_mapping = {state: i for i, state in enumerate(states)}
    y_train = prepare_labels(state_mapping, train_labels)
    X_val = prepare_inputs(token_mapping, w2v_W, w2v_U, val_sentences)
    y_val = prepare_labels(state_mapping, val_labels)
    X_test = prepare_inputs(token_mapping, w2v_W, w2v_U, test_sentences)

    # train model
    model = init_vars(input_size=300, output_size=len(state_mapping), n_hidden=n_hidden)
    X_flat, y_flat = [], []
    for sentence in X_train:
        X_flat.extend(sentence)
    X_flat = np.asarray(X_flat)
    for sentence in y_train:
        y_flat.extend(sentence)
    y_flat = np.asarray(y_flat)
    checkpoints, losses, scores = train(model, X_flat, y_flat, X_val, X_test,
                                        state_mapping, val_sentences,
                                        original_sentences, n_epochs, batch_size)
    f_entity = [tup[0][-1] for tup in scores]
    f_type = [tup[1][-1] for tup in scores]
    print('Entity:', (np.argmax(f_entity), np.max(f_entity)))
    print(f_entity)
    print('Entity type:', (np.argmax(f_type), np.max(f_type)))
    print(f_type)
    averaged = (np.array(f_type) + np.array(f_entity)) / 2
    print("Average:", (np.argmax(averaged), np.max(averaged)))
    print(averaged)
def parse_text(text_file, delimiter, columns, header, no_array, identifiers):
    """Parse text file into Category and/or Variable fields."""
    try:
        text_file, field_name = text_file.split('=')
    except ValueError:
        field_name = False
    data = file_io.read_file(text_file)
    lines = data.split('\n')
    if delimiter == 'whitespace':
        delimit = re.compile(r'\s+')
    else:
        delimit = re.compile(r"%s" % delimiter)
    columns = columns.split(',')
    if header:
        header_row = lines.pop(0).replace('"', '')
        columns = parse_header_row(delimiter, header_row, columns)
    cols, headers, types, width = map_fields(delimit, lines[0].replace('"', ''), columns)
    rows, id_rows, array = parse_rows(delimit, lines, width, no_array, cols, types, headers)[:3]
    # if not identifiers.validate_list(list(ids)):
    #     exit('ERROR: contig names in the text file did not match dataset identifiers.')
    results = rows_to_results(rows, id_rows, types, array, field_name)
    fields = results_to_fields(results, types, cols, headers, text_file, delimiter, identifiers)
    # meta = {'file': text_file}
    return fields
def reset_valid_accounts(self):
    default_accounts = ast.literal_eval(
        read_file(CONFIG_DEFAULT_ACCOUNTS_FILE))
    self.accounts_list = default_accounts
    self.update_calculation()
    self.save_accounts_list()
def test_psf_estimation(psf_data, true_psf_file, kernel=None, metric='mean'):
    """Test PSF Estimation

    This method tests the quality of the estimated PSFs

    Parameters
    ----------
    psf_data : np.ndarray
        Estimated PSFs, 3D array
    true_psf_file : str
        True PSFs file name
    kernel : int, optional
        Standard deviation of Gaussian kernel
    metric : str {mean, median}, optional
        Metric for averaging results (default is 'mean')

    Returns
    -------
    np.ndarray pixel errors, np.ndarray ellipticity errors

    Raises
    ------
    ValueError
        If the number of true PSF images does not match the number of
        estimated PSF images

    """
    true_psf = read_file(true_psf_file)

    if true_psf.shape != psf_data.shape:
        raise ValueError('The number of true PSF images must match the number '
                         'of estimated PSF images.')

    return test_images(psf_data, true_psf, kernel, metric)
def apply_filter(ids, text_file, **kwargs):
    """Filter Text file."""
    suffix = kwargs['--suffix']
    path = pathlib.Path(text_file)
    outfile = str(path.parent / (path.stem + '.' + suffix + path.suffix))
    data = file_io.read_file(text_file)
    lines = data.split('\n')
    delimiter = kwargs['--text-delimiter']
    if delimiter == 'whitespace':
        delimit = re.compile(r'\s+')
    else:
        delimit = re.compile(r"%s" % delimiter)
    id_col = int(kwargs['--text-id-column']) - 1
    output = []
    if kwargs['--text-header']:
        header_row = lines.pop(0).rstrip()
        output.append(header_row)
    for line in lines:
        row = re.split(delimit, line.replace('"', ''))
        try:
            if row[id_col] in ids:
                output.append(line)
        except IndexError:
            output.append(line)
    file_io.write_file(outfile, output, plain=True)
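# Usage sketch for apply_filter (hypothetical identifiers, file name, and
# docopt-style kwargs): rows whose ID column is not in `ids` are dropped, and
# the filtered copy is written next to the input, e.g. 'values.filtered.txt'.
#
#   apply_filter({'contig_1', 'contig_2'}, 'values.txt',
#                **{'--suffix': 'filtered', '--text-delimiter': 'whitespace',
#                   '--text-id-column': '1', '--text-header': True})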
def parse_trnascan(trnascan_file, identifiers):
    """Parse tRNAscan results into a MultiArray."""
    data = file_io.read_file(trnascan_file)
    lines = data.split('\n')
    header = True
    meta = {'file': trnascan_file}
    results = defaultdict(list)
    for line in lines:
        if header:
            row = re.split(' +', line)
            if len(row) > 1:
                if row[1].startswith('v.'):
                    meta.update({'version': row[1]})
                elif row[1] == 'Mode:':
                    meta.update({'mode': row[2]})
                    meta.update({'field_id': "trnascan_%s" % row[2].lower()})
                elif row[1].startswith('------'):
                    header = False
        else:
            row = re.split(r' +|\t', line)
            if len(row) == 9:
                results[row[0]].append([row[4], row[5]])
    if not identifiers.validate_list(list(results.keys())):
        raise UserWarning('Contig names in the tRNAScan file did not match dataset identifiers.')
    values = [results[id] if id in results else [] for id in identifiers.values]
    trnascan_field = MultiArray(meta['field_id'],
                                values=values,
                                meta=meta,
                                headers=('tRNA_type', 'Anticodon'),
                                parents=['children'])
    return trnascan_field
def parse_text(text_file, delimiter, columns, header, no_array, identifiers):
    """Parse text file into Category and/or Variable fields."""
    try:
        text_file, field_name = text_file.split("=")
    except ValueError:
        field_name = False
    data = file_io.read_file(text_file)
    lines = data.split("\n")
    delimit = set_delimiter(delimiter, sample=lines[0])
    if columns:
        columns = columns.split(",")
    else:
        columns = []
    if header:
        header_row = lines.pop(0).replace('"', "")
        columns = parse_header_row(delimit, header_row, columns)
    cols, headers, types, width = map_fields(delimit, lines[0].replace('"', ""), columns)
    rows, id_rows, array = parse_rows(delimit, lines, width, no_array, cols, types, headers)[:3]
    # if not identifiers.validate_list(list(ids)):
    #     exit('ERROR: contig names in the text file did not match dataset identifiers.')
    results = rows_to_results(rows, id_rows, types, array, field_name)
    fields = results_to_fields(results, types, cols, headers, text_file, delimiter, identifiers)
    # meta = {'file': text_file}
    return fields
def init_stop_words():
    stop_words = set(stopwords.words('english'))
    extra_stop_words = read_file('dictionaries/stopwords.txt').split('\n')
    extra_stop_words = [
        unicode(word, 'utf-8') for word in extra_stop_words if word
    ]
    stop_words.update(extra_stop_words)
    return set([normalize_word(word) for word in stop_words])
def get_wllr_in_domain_and_polarity(domain, polarity):
    wllr = {}
    file_path = 'WLLR/{}.{}.wllr'.format(domain, polarity)
    lines = read_file(file_path).split('\n')
    lines.remove('')
    for line in lines:
        word, wllr_index = line.split('\t')
        wllr[word] = float(wllr_index)
    return wllr
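# Usage sketch for get_wllr_in_domain_and_polarity, assuming each line of
# 'WLLR/<domain>.<polarity>.wllr' is '<word>\t<score>' (domain name and score
# below are hypothetical):
#
#   wllr = get_wllr_in_domain_and_polarity('books', 'positive')
#   wllr.get('excellent')  # -> e.g. 2.31, the float parsed from the second column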
def get_image_list(file_name):
    f_list = file_io.read_file(file_name)
    return_list = list()
    for f in f_list:
        f_l = f.split(" ")
        b_name = os.path.basename(f_l[0])
        st_name = data_dir + b_name
        img_name = data_dir + b_name.replace("_st.data", "_1.jpg")
        return_list.append(st_name + " " + img_name)
    return return_list
def test_deconvolution(deconv_data, clean_data_file, random_seed=None,
                       kernel=None, metric='mean'):
    """Test deconvolution

    This method tests the quality of the deconvolved images

    Parameters
    ----------
    deconv_data : np.ndarray
        Deconvolved data, 3D array
    clean_data_file : str
        Clean data file name
    random_seed : int, optional
        Random seed
    kernel : int, optional
        Standard deviation of Gaussian kernel
    metric : str {mean, median}, optional
        Metric for averaging results (default is 'mean')

    Returns
    -------
    np.ndarray pixel errors, np.ndarray ellipticity errors

    Raises
    ------
    ValueError
        If the number of clean images does not match the number of
        deconvolved images

    """
    if not isinstance(random_seed, type(None)):
        np.random.seed(random_seed)
        clean_data = read_file(clean_data_file)
        clean_data = np.random.permutation(clean_data)[:deconv_data.shape[0]]
    else:
        clean_data = read_file(clean_data_file)[:deconv_data.shape[0]]

    if clean_data.shape != deconv_data.shape:
        raise ValueError('The number of clean images must match the number '
                         'of deconvolved images.')

    return test_images(deconv_data, clean_data, kernel, metric)
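# Usage sketch for test_deconvolution (hypothetical file name and inputs):
# compare a stack of deconvolved images against clean images loaded from disk,
# shuffling the clean stack with a fixed seed before truncating it to match.
#
#   errors = test_deconvolution(deconv_stack, 'clean_images.npy',
#                               random_seed=42, metric='median')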
def data_cleaning():
    """Data cleaning function.

    It converts the raw data to a suitable form and saves it to a file.

    Args:
        No arguments.

    Returns:
        No returns.
    """
    reading_file_pointer = file_io.read_file('./pollution_new.csv')
    writing_file_pointer = file_io.create_file('./dataset.csv')
    index = []
    days = []
    months = []
    years = []
    hours = []
    minutes = []
    humidities = []
    temperatures = []
    ppm = []
    line_no = 1
    for line in reading_file_pointer:
        # skip the header row
        if line_no == 1:
            line_no += 1
            continue
        words = line.split(',')
        index.append(words[0].strip())
        ppm.append(words[-1].strip())
        temperatures.append(words[-2].strip())
        humidities.append(words[-3].strip())
        date_time = words[1].split(' ')
        dates = date_time[0].split('/')
        months.append(dates[0].strip())
        days.append(dates[1].strip())
        years.append(dates[2].strip())
        time = date_time[1].split(':')
        hours.append(time[0].strip())
        minutes.append(time[1].strip())
    file_io.write_line(writing_file_pointer,
                       'index,year,month,day,hour,hum,temp,ppm\n')
    for i in range(len(index)):
        line = (index[i] + ',' + years[i] + ',' + months[i] + ',' + days[i] +
                ',' + hours[i] + ',' + humidities[i] + ',' +
                temperatures[i] + ',' + ppm[i] + '\n')
        file_io.write_line(writing_file_pointer, line)
def read_data(self, data_dir, file_name, data_num):
    file_list = file_io.read_file(file_name)
    data_len = min(data_num, len(file_list))
    images = np.empty((data_len, 32, 32, 3), np.float32)
    labels = np.empty((data_len), np.uint8)
    for i in range(data_len):
        image_name, label = file_list[i].split(" ")
        images[i, :, :, :] = cv2.imread(data_dir + image_name) / 255.0
        labels[i] = int(label)
    return images, labels
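# Usage sketch for read_data (hypothetical instance, directory, and list file):
# each line of the list file is assumed to be '<image_name> <label>', and the
# images are loaded as 32x32 arrays scaled to [0, 1].
#
#   images, labels = loader.read_data('cifar10/train/', 'train_list.txt', 10000)
#   images.shape  # -> (10000, 32, 32, 3)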
def parse_busco(busco_file, identifiers):  # pylint: disable=too-many-locals
    """Parse BUSCO results into a MultiArray."""
    data = file_io.read_file(busco_file)
    lines = data.split("\n")
    version = lines[0].split(":")[1].strip()
    desc = re.split(r":\s*|\(|\)\s*|,\s*", lines[1])
    meta = {
        "version": version,
        "set": desc[1].strip(),
        "count": max(int(desc[5].strip()), int(desc[7].strip())),
        "file": busco_file,
    }
    version = int(version.split(".")[0])
    if version < 4:
        rows = [re.split("\t", line) for line in lines[5:]]
        meta["set"] = re.search(
            r"-l\s.*?\/*(\w+_odb\d+)\/", lines[2].split(":")[1].strip()
        )[1]
        columns = re.split(r"# |\t", lines[4])[1:]
        try:
            contig_index = columns.index("Contig")
        except ValueError:
            contig_index = columns.index("Sequence")
    else:
        rows = [re.split("\t", line) for line in lines[3:]]
        columns = re.split(r"# |\t", lines[2])[1:]
        contig_index = columns.index("Sequence")
    meta["field_id"] = "%s_busco" % meta["set"]
    busco_index = columns.index("Busco id")
    status_index = columns.index("Status")
    results = defaultdict(list)
    for row in rows:
        if len(row) > contig_index:
            if version < 4:
                contig = row[contig_index]
            else:
                contig = row[contig_index].split(":")[0]
            results[contig].append([row[busco_index], row[status_index]])
    if not identifiers.validate_list(list(results.keys())):
        raise UserWarning(
            "Contig names in the Busco file did not match dataset identifiers."
        )
    values = [results[id] if id in results else [] for id in identifiers.values]
    busco_field = MultiArray(
        meta["field_id"],
        values=values,
        meta=meta,
        headers=("Busco id", "Status"),
        parents=["children"],
        category_slot=1,
    )
    return busco_field
def train_word2vec(filename, n_epochs=1):
    # read input file
    observations, states = parse(filename)
    sentences, labels = read_file(filename)

    # preprocess
    token_mapping = get_token_mapping(observations)
    X = prepare_inputs(token_mapping, sentences)

    # init model
    vocab_size = len(token_mapping)
    W, U = init_vars(vocab_size, latent_size=300)

    # train
    for i in trange(n_epochs, desc='Training word2vec'):
        W, U = train_epoch(i, W, U, X, observations, token_mapping)

    return W, U
def init_data(d_type, domain, d_name):
    NUMBER_OF_FILE = 500
    positive = [
        read_file('{}/{}/{}/{}/{:03d}.txt'.format(d_type, domain, 'positive', d_name, i))
        for i in range(NUMBER_OF_FILE)
    ]
    negative = [
        read_file('{}/{}/{}/{}/{:03d}.txt'.format(d_type, domain, 'negative', d_name, i))
        for i in range(NUMBER_OF_FILE)
    ]
    positive = positive[:500]
    negative = negative[:500]
    positive = [p for p in positive if p]
    negative = [n for n in negative if n]
    print 'Number of positive: ', len(positive)
    print 'Number of negative: ', len(negative)
    data = positive + negative
    return data, len(positive), len(negative)
def load_proto(file_name):
    model_param = dict()
    param = file_io.read_file(file_name)
    for li in param:
        li = li.replace(" ", "")
        if len(li) == 0 or li[0] == "#":
            continue
        name, val = li.split(":")
        val = check_digit(val)
        val = check_bool(val)
        val = check_list(val)
        val = check_none(val)
        model_param[name] = val
    return model_param
def load_proto(file_name):
    model_param = dict()
    param = file_io.read_file(file_name)
    for li in param:
        li = li.replace(" ", "")
        name, val = li.split(":")
        if val.replace(".", "").isdigit():
            val = float(val)
            if val.is_integer():
                val = int(val)
        if val == 'true' or val == "True":
            val = True
        if val == 'false' or val == "False":
            val = False
        model_param[name] = val
    return model_param
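# Illustrative input for the two load_proto variants above (hypothetical file
# 'model.proto'); each 'name: value' line is coerced to int/float/bool where
# possible, and the first variant also skips blank lines and '#' comments:
#
#   learning_rate: 0.001
#   batch_size: 32
#   use_dropout: True
#
# load_proto('model.proto') would then return something like
# {'learning_rate': 0.001, 'batch_size': 32, 'use_dropout': True}.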
def parse_busco(busco_file, identifiers):
    """Parse BUSCO results into a MultiArray."""
    data = file_io.read_file(busco_file)
    lines = data.split('\n')
    rows = [re.split('\t', line) for line in lines[5:]]
    meta = {
        'version': lines[0].split(':')[1].strip(),
        'set': re.split(r':|\(|\)', lines[1])[1].strip(),
        'count': int(re.split(r':|\(|\)', lines[1])[5].strip()),
        'command': lines[2].split(':')[1].strip(),
        'file': busco_file
    }
    meta['set'] = re.search(r'-l\s.*?\/*(\w+_odb\d+)\/', meta['command'])[1]
    meta['field_id'] = "%s_busco" % meta['set']
    columns = re.split(r'# |\t', lines[4])[1:]
    busco_index = columns.index('Busco id')
    status_index = columns.index('Status')
    contig_index = columns.index('Contig')
    results = defaultdict(list)
    for row in rows:
        if len(row) > contig_index:
            results[row[contig_index]].append(
                [row[busco_index], row[status_index]])
    if not identifiers.validate_list(list(results.keys())):
        raise UserWarning(
            'Contig names in the Busco file did not match dataset identifiers.'
        )
    values = [
        results[id] if id in results else [] for id in identifiers.values
    ]
    busco_field = MultiArray(meta['field_id'],
                             values=values,
                             meta=meta,
                             headers=('Busco id', 'Status'),
                             parents=['children'],
                             category_slot=1)
    return busco_field
def apply_filter(ids, text_file, **kwargs):
    """Filter Text file."""
    suffix = kwargs["--suffix"]
    path = pathlib.Path(text_file)
    outfile = str(path.parent / (path.stem + "." + suffix + path.suffix))
    data = file_io.read_file(text_file)
    lines = data.split("\n")
    delimiter = kwargs["--text-delimiter"]
    delimit = set_delimiter(delimiter, sample=lines[0])
    id_col = int(kwargs["--text-id-column"]) - 1
    output = []
    if kwargs["--text-header"]:
        header_row = lines.pop(0).rstrip()
        output.append(header_row)
    for line in lines:
        row = re.split(delimit, line.replace('"', ""))
        try:
            if row[id_col] in ids:
                output.append(line)
        except IndexError:
            output.append(line)
    file_io.write_file(outfile, output, plain=True)
def test_read_file_content(self):
    """ test for reading file content """
    for line in read_file(self.temp_file_path):
        self.assertEqual(line, self.file_content)
    for i in xrange(input_shape[2]):
        new_array[:, :, i] = cv2.resize(np_array[:, :, i], tuple(image_size))
    return new_array


def process_image(image_name):
    # forward the image through the network, then read the intermediate
    # activations and resize them to a common spatial size
    image = caffe.io.load_image(image_name)
    transformed_image = transformer.preprocess('data', image)
    net.blobs['data'].data[...] = transformed_image
    net.forward()
    res2a = net.blobs['res2a'].data.squeeze().transpose((1, 2, 0))
    res3a = net.blobs['res3a'].data.squeeze().transpose((1, 2, 0))
    res4a = net.blobs['res4a'].data.squeeze().transpose((1, 2, 0))
    res2a = resize_image(res2a, output_shape)
    res3a = resize_image(res3a, output_shape)
    res4a = resize_image(res4a, output_shape)
    # shape of hypercolumn is (56, 56, 1792)
    hypercolumn = np.concatenate((res2a, res3a, res4a), 2)
    return hypercolumn


image_name_list = file_io.read_file(file_list_name)
count = 0
for image_name in image_name_list:
    hypercolumn = process_image(image_name)
    feature_name = image_name.replace(".jpg", ".resnet_hypercolumn")
    hypercolumn.tofile(feature_name)
    count = count + 1
    print("count: %d / %d" % (count, len(image_name_list)))
    res4a = resize_image(res4a, output_shape)
    # res5a = resize_image(res5a, output_shape)
    # res2a = imresize(res2a, output_shape)
    # res3a = imresize(res3a, output_shape)
    # res4a = imresize(res4a, output_shape)
    # res5a = imresize(res5a, output_shape + (output_shape[2], 1))
    # res2a = cv2.resize(res2a, output_shape)
    # res3a = cv2.resize(res3a, output_shape)
    # res4a = cv2.resize(res4a, output_shape)
    # res5a = cv2.resize(res5a, output_shape)
    # hypercolumn = np.concatenate((conv1, res2a, res3a, res4a, res5a), 2)
    # shape of hypercolumn is (56, 56, 1792)
    hypercolumn = np.concatenate((res2a, res3a, res4a), 2)
    return hypercolumn


image_name_list = file_io.read_file("../file_list/image_name_list.txt")
count = 0
for image_name in image_name_list:
    hypercolumn = process_image(image_name)
    feature_name = image_name.replace(".jpg", ".resnet_hypercolumn")
    hypercolumn.tofile(feature_name)
    count = count + 1
    print("count: %d / %d" % (count, len(image_name_list)))
# print(res2a.shape)
# print(res3a.shape)
# print(res4a.shape)
# print(res5a.shape)
import file_io
import numpy as np
import matplotlib.pyplot as plt

result_list = file_io.read_file("resdeconv_results/results.txt")
result_list.sort()
# result_list = file_io.read_file("results/results.txt")

diff_list = list()
true_count = list()
estimate_count = list()
for result in result_list:
    img_name, label, infer = result.split(" ")
    diff = abs(float(label) - float(infer))
    diff_list.append(diff)
    true_count.append(float(label))
    estimate_count.append(float(infer))

diff_list = np.array(diff_list)
print(np.mean(diff_list))
print(np.mean(np.square(diff_list)))
plt.plot(true_count, 'g')
plt.plot(estimate_count, 'r')
plt.show()
def test_deconvolution(deconv_data, clean_data_file, random_seed=None,
                       kernel=None, metric='mean'):
    """Test deconvolution

    This method tests the quality of the deconvolved images

    Parameters
    ----------
    deconv_data : np.ndarray
        Deconvolved data, 3D array
    clean_data_file : str
        Clean data file name
    random_seed : int, optional
        Random seed
    kernel : int, optional
        Standard deviation of Gaussian kernel
    metric : str {mean, median}, optional
        Metric for averaging results (default is 'mean')

    Returns
    -------
    np.ndarray pixel errors, np.ndarray ellipticity errors

    Raises
    ------
    ValueError
        If the number of clean images does not match the number of
        deconvolved images

    """
    if not isinstance(random_seed, type(None)):
        np.random.seed(random_seed)
        clean_data = read_file(clean_data_file)
        clean_data = np.random.permutation(clean_data)[:deconv_data.shape[0]]
    else:
        clean_data = read_file(clean_data_file)[:deconv_data.shape[0]]

    if clean_data.shape != deconv_data.shape:
        raise ValueError('The number of clean images must match the number '
                         'of deconvolved images.')

    if not isinstance(kernel, type(None)):

        def add_weights(data, weight):
            return np.array([x * weight for x in data])

        gk = gaussian_kernel(clean_data[0].shape, kernel)
        deconv_data = add_weights(deconv_data, gk)
        clean_data = add_weights(clean_data, gk)

    if metric == 'median':
        metric = np.median
    else:
        metric = np.mean

    px_err = nmse(clean_data, deconv_data, metric)
    ellip_err = e_error(clean_data, deconv_data, metric)
    psnr = psnr_stack(clean_data, deconv_data, metric)

    return (px_err, ellip_err, psnr)
def init_set_in_file(file_path):
    words = read_file(file_path).split('\n')
    return set([word for word in words if word])
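# Usage sketch for init_set_in_file (hypothetical path): with a
# newline-separated word list, blank lines are dropped and duplicates collapse
# into the returned set.
#
#   negation_words = init_set_in_file('dictionaries/negation.txt')
#   'not' in negation_words  # -> True if the file lists it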
def __init__(self):
    self.html = markdown.markdown(read_file('README.md'))
    accounts = get_accounts_from_file()
    self.accounts_list = [
        account for account in accounts if account.strip() != ''
    ]