Example #1
def get_accounts_from_file():
    if os.path.isfile(CONFIG_ACCOUNTS_FILE):
        file_content = read_file(CONFIG_ACCOUNTS_FILE)
    else:
        file_content = read_file(CONFIG_DEFAULT_ACCOUNTS_FILE)
        write_file(CONFIG_ACCOUNTS_FILE, file_content)

    return ast.literal_eval(file_content)
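
This pattern only works if the accounts file stores a Python literal that `ast.literal_eval` can parse back. A minimal, self-contained sketch of that round trip, using a temporary file and an illustrative account list in place of the CONFIG_* paths and the project's read_file/write_file helpers:

import ast
import tempfile

# Illustrative account list; the real CONFIG_ACCOUNTS_FILE holds whatever literal was written to it.
accounts = ['alice@example.com', 'bob@example.com']

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as handle:
    handle.write(repr(accounts))   # write_file equivalent: store the list as a Python literal
    path = handle.name

with open(path) as handle:
    restored = ast.literal_eval(handle.read())   # safe parse, no eval()

assert restored == accounts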
Example #2
def get_wllr_ranking(domain):
    ranking = {}

    file_path = 'WLLR/{}.positive.wllr'.format(domain)
    lines = read_file(file_path).split('\n')
    lines.remove('')
    ranking['positive'] = [line.split('\t')[0] for line in lines]

    file_path = 'WLLR/{}.negative.wllr'.format(domain)
    lines = read_file(file_path).split('\n')
    lines.remove('')
    ranking['negative'] = [line.split('\t')[0] for line in lines]

    return ranking
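
This assumes each WLLR file is a newline-separated list of tab-delimited word/score pairs with a single trailing empty line. A small sketch of that parsing step on an in-memory string (the sample words and scores are made up):

sample = "great\t2.31\nterrible\t1.97\n"   # assumed file layout: word<TAB>score per line

lines = sample.split('\n')
lines.remove('')                            # drop the trailing empty entry
ranking = [line.split('\t')[0] for line in lines]
print(ranking)                              # ['great', 'terrible']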
Example #3
def main():
    # read file
    observations, states = parse(train_filename)
    train_sentences, train_labels = read_file(train_filename)

    val_sentences, val_labels = read_file(val_filename)
    test_sentences, original_sentences, test_labels = read_file(test_filename,
                                                                test=True)

    # preprocess
    token_mapping = get_token_mapping(observations)
    X_train = prepare_inputs(token_mapping, w2v_W, w2v_U, train_sentences)

    state_mapping = {state: i for i, state in enumerate(states)}
    y_train = prepare_labels(state_mapping, train_labels)

    X_val = prepare_inputs(token_mapping, w2v_W, w2v_U, val_sentences)
    y_val = prepare_labels(state_mapping, val_labels)

    X_test = prepare_inputs(token_mapping, w2v_W, w2v_U, test_sentences)

    # train model
    model = init_vars(input_size=300,
                      output_size=len(state_mapping),
                      n_hidden=n_hidden)

    X_flat, y_flat = [], []
    for sentence in X_train:
        X_flat.extend(sentence)
    X_flat = np.asarray(X_flat)
    for sentence in y_train:
        y_flat.extend(sentence)
    y_flat = np.asarray(y_flat)

    checkpoints, losses, scores = train(model, X_flat, y_flat, X_val, X_test,
                                        state_mapping, val_sentences,
                                        original_sentences, n_epochs,
                                        batch_size)

    f_entity = [tup[0][-1] for tup in scores]
    f_type = [tup[1][-1] for tup in scores]
    print('Entity:', (np.argmax(f_entity), np.max(f_entity)))
    print(f_entity)
    print('Entity type:', (np.argmax(f_type), np.max(f_type)))
    print(f_type)
    averaged = (np.array(f_type) + np.array(f_entity)) / 2
    print("Average:", (np.argmax(averaged), np.max(averaged)))
    print(averaged)
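
The flattening step above turns a list of sentences (each a list of per-token vectors or labels) into one flat array so the model trains on individual tokens. A stand-alone sketch of that reshaping with toy data:

import numpy as np

# Toy stand-in for X_train: two sentences with 2 and 1 token vectors.
X_train = [[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6]]]

X_flat = []
for sentence in X_train:
    X_flat.extend(sentence)
X_flat = np.asarray(X_flat)

print(X_flat.shape)   # (3, 2): all tokens stacked, sentence boundaries discarded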
Example #4
def parse_text(text_file, delimiter, columns, header, no_array, identifiers):
    """Parse text file into Category and/or Variable fields."""
    try:
        text_file, field_name = text_file.split('=')
    except ValueError:
        field_name = False
    data = file_io.read_file(text_file)
    lines = data.split('\n')
    if delimiter == 'whitespace':
        delimit = re.compile(r'\s+')
    else:
        delimit = re.compile(r"%s" % delimiter)
    columns = columns.split(',')
    if header:
        header_row = lines.pop(0).replace('"', '')
        columns = parse_header_row(delimiter, header_row, columns)
    cols, headers, types, width = map_fields(delimit,
                                             lines[0].replace('"',
                                                              ''), columns)
    rows, id_rows, array = parse_rows(delimit, lines, width, no_array, cols,
                                      types, headers)[:3]
    # if not identifiers.validate_list(list(ids)):
    #     exit('ERROR: contig names in the text file did not match dataset identifiers.')
    results = rows_to_results(rows, id_rows, types, array, field_name)
    fields = results_to_fields(results, types, cols, headers, text_file,
                               delimiter, identifiers)
    # meta = {'file': text_file}
    return fields
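
The delimiter handling treats the literal string 'whitespace' as a run of blanks and any other value as a verbatim regex pattern. A short sketch of that choice, wrapped in a hypothetical build_delimiter helper:

import re

def build_delimiter(delimiter):
    # 'whitespace' collapses runs of spaces/tabs; any other value is used as the pattern itself.
    if delimiter == 'whitespace':
        return re.compile(r'\s+')
    return re.compile(r"%s" % delimiter)

print(build_delimiter('whitespace').split('a  b\tc'))   # ['a', 'b', 'c']
print(build_delimiter(',').split('a,b,c'))              # ['a', 'b', 'c']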
Example #5
    def reset_valid_accounts(self):
        default_accounts = ast.literal_eval(
            read_file(CONFIG_DEFAULT_ACCOUNTS_FILE))
        self.accounts_list = default_accounts

        self.update_calculation()
        self.save_accounts_list()
Example #6
def test_psf_estimation(psf_data, true_psf_file, kernel=None, metric='mean'):
    """Test PSF Estimation

    This method tests the quality of the estimated PSFs

    Parameters
    ----------
    psf_data : np.ndarray
        Estimated PSFs, 3D array
    true_psf_file : str
        True PSFs file name
    kernel : int, optional
        Standard deviation of Gaussian kernel
    metric : str {mean, median}, optional
        Metric for averaging results (default is 'mean')

    Returns
    -------
    np.ndarray pixel errors, np.ndarray ellipticity errors

    Raises
    ------
    ValueError
        If the number of clean images does not match the number of deconvolved
        images

    """

    true_psf = read_file(true_psf_file)

    if true_psf.shape != psf_data.shape:
        raise ValueError('The number of true PSF images must match the number '
                         'of estimated PSF images.')

    return test_images(psf_data, true_psf, kernel, metric)
Example #7
def apply_filter(ids, text_file, **kwargs):
    """Filter Text file."""
    suffix = kwargs['--suffix']
    path = pathlib.Path(text_file)
    outfile = str(path.parent / (path.stem + '.' + suffix + path.suffix))
    data = file_io.read_file(text_file)
    lines = data.split('\n')
    delimiter = kwargs['--text-delimiter']
    if delimiter == 'whitespace':
        delimit = re.compile(r'\s+')
    else:
        delimit = re.compile(r"%s" % delimiter)
    id_col = int(kwargs['--text-id-column']) - 1
    output = []
    if kwargs['--text-header']:
        header_row = lines.pop(0).rstrip()
        output.append(header_row)
    for line in lines:
        row = re.split(delimit, line.replace('"', ''))
        try:
            if row[id_col] in ids:
                output.append(line)
        except IndexError:
            output.append(line)
    file_io.write_file(outfile, output, plain=True)
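
The filter keeps a line whenever the value in the configured ID column is in `ids`; lines too short to have that column are kept as well. A self-contained sketch of the same membership test on a few made-up rows:

import re

ids = {'contig_1', 'contig_3'}
delimit = re.compile(r'\s+')
lines = ['contig_1 100 0.5', 'contig_2 200 0.7', 'contig_3 50 0.1']
id_col = 0

kept = [line for line in lines if re.split(delimit, line)[id_col] in ids]
print(kept)   # ['contig_1 100 0.5', 'contig_3 50 0.1']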
Example #8
def parse_trnascan(trnascan_file, identifiers):
    """Parse tRNAscan results into a MultiArray."""
    data = file_io.read_file(trnascan_file)
    lines = data.split('\n')
    header = True
    meta = {'file': trnascan_file}
    results = defaultdict(list)
    for line in lines:
        if header:
            row = re.split(' +', line)
            if len(row) > 1:
                if row[1].startswith('v.'):
                    meta.update({'version': row[1]})
                elif row[1] == 'Mode:':
                    meta.update({'mode': row[2]})
                    meta.update({'field_id': "trnascan_%s" % row[2].lower()})
                elif row[1].startswith('------'):
                    header = False
        else:
            row = re.split(r' +|\t', line)
            if len(row) == 9:
                results[row[0]].append([row[4], row[5]])
    if not identifiers.validate_list(list(results.keys())):
        raise UserWarning('Contig names in the tRNAScan file did not match dataset identifiers.')
    values = [results[id] if id in results else [] for id in identifiers.values]
    trnascan_field = MultiArray(meta['field_id'],
                                values=values,
                                meta=meta,
                                headers=('tRNA_type', 'Anticodon'),
                                parents=['children']
                                )
    return trnascan_field
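
The final reordering step emits one entry per dataset identifier, in identifier order, with an empty list where a contig had no tRNAscan hit. A small stand-alone sketch of that step with made-up contig names and hits:

from collections import defaultdict

results = defaultdict(list)
results['contig_1'].append(['Met', 'CAT'])    # illustrative tRNA_type / Anticodon pairs
results['contig_3'].append(['Leu', 'TAA'])

identifier_values = ['contig_1', 'contig_2', 'contig_3']   # stand-in for identifiers.values
values = [results[name] if name in results else [] for name in identifier_values]
print(values)   # [[['Met', 'CAT']], [], [['Leu', 'TAA']]]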
Example #9
def parse_text(text_file, delimiter, columns, header, no_array, identifiers):
    """Parse text file into Category and/or Variable fields."""
    try:
        text_file, field_name = text_file.split("=")
    except ValueError:
        field_name = False
    data = file_io.read_file(text_file)
    lines = data.split("\n")
    delimit = set_delimiter(delimiter, sample=lines[0])
    if columns:
        columns = columns.split(",")
    else:
        columns = []
    if header:
        header_row = lines.pop(0).replace('"', "")
        columns = parse_header_row(delimit, header_row, columns)
    cols, headers, types, width = map_fields(delimit,
                                             lines[0].replace('"',
                                                              ""), columns)
    rows, id_rows, array = parse_rows(delimit, lines, width, no_array, cols,
                                      types, headers)[:3]
    # if not identifiers.validate_list(list(ids)):
    #     exit('ERROR: contig names in the text file did not match dataset identifiers.')
    results = rows_to_results(rows, id_rows, types, array, field_name)
    fields = results_to_fields(results, types, cols, headers, text_file,
                               delimiter, identifiers)
    # meta = {'file': text_file}
    return fields
Example #10
def init_stop_words():
    stop_words = set(stopwords.words('english'))
    extra_stop_words = read_file('dictionaries/stopwords.txt').split('\n')
    extra_stop_words = [
        unicode(word, 'utf-8') for word in extra_stop_words if word
    ]
    stop_words.update(extra_stop_words)
    return set([normalize_word(word) for word in stop_words])
Example #11
def get_wllr_in_domain_and_polarity(domain, polarity):
    wllr = {}
    file_path = 'WLLR/{}.{}.wllr'.format(domain, polarity)
    lines = read_file(file_path).split('\n')
    lines.remove('')
    for line in lines:
        word, wllr_index = line.split('\t')
        wllr[word] = float(wllr_index)
    return wllr
Example #12
def get_image_list(file_name):
    f_list = file_io.read_file(file_name)
    return_list = list()
    for f in f_list:
        f_l = f.split(" ")
        b_name = os.path.basename(f_l[0])
        st_name = data_dir + b_name
        img_name = data_dir + b_name.replace("_st.data", "_1.jpg")
        return_list.append(st_name + " " + img_name)
    return return_list
Example #13
def test_deconvolution(deconv_data, clean_data_file,
                       random_seed=None, kernel=None, metric='mean'):
    """Test deconvolution

    This method tests the quality of the deconvolved images

    Parameters
    ----------
    deconv_data : np.ndarray
        Deconvolved data, 3D array
    clean_data_file : str
        Clean data file name
    random_seed : int, optional
        Random seed
    kernel : int, optional
        Standard deviation of Gaussian kernel
    metric : str {mean, median}, optional
        Metric for averaging results (default is 'mean')

    Returns
    -------
    np.ndarray pixel errors, np.ndarray ellipticity errors

    Raises
    ------
    ValueError
        If the number of clean images does not match the number of deconvolved
        images

    """

    if not isinstance(random_seed, type(None)):
        np.random.seed(random_seed)
        clean_data = read_file(clean_data_file)
        clean_data = np.random.permutation(clean_data)[:deconv_data.shape[0]]
    else:
        clean_data = read_file(clean_data_file)[:deconv_data.shape[0]]

    if clean_data.shape != deconv_data.shape:
        raise ValueError('The number of clean images must match the number '
                         'of deconvolved images.')

    return test_images(deconv_data, clean_data, kernel, metric)
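
When a random seed is given, the clean stack is shuffled along its first axis and truncated so its shape matches the deconvolved stack. A minimal sketch of that selection with stand-in arrays:

import numpy as np

np.random.seed(0)                         # reproducible selection
clean_data = np.zeros((10, 4, 4))         # stand-in for the clean image stack
deconv_data = np.zeros((3, 4, 4))         # stand-in for the deconvolved stack

subset = np.random.permutation(clean_data)[:deconv_data.shape[0]]
assert subset.shape == deconv_data.shape  # shapes now agree, so the error check passes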
Example #14
def data_cleaning():
    """
	Data cleaning function. It converts raw data to suitable form and save it in files.
	
	Args:
		No arguments

	Returns:
		No returns
	"""
    reading_file_pointer = file_io.read_file('./pollution_new.csv')
    writing_file_pointer = file_io.create_file('./dataset.csv')

    index = []
    days = []
    months = []
    years = []
    hours = []
    minutes = []
    humidities = []
    temperatures = []
    ppm = []

    line_no = 1
    for line in reading_file_pointer:
        if line_no == 1:
            line_no += 1
            continue

        words = line.split(',')
        index.append(words[0].strip())
        ppm.append(words[-1].strip())
        temperatures.append(words[-2].strip())
        humidities.append(words[-3].strip())

        date_time = words[1].split(' ')
        dates = date_time[0].split('/')

        months.append(dates[0].strip())
        days.append(dates[1].strip())
        years.append(dates[2].strip())

        time = date_time[1].split(':')
        hours.append(time[0].strip())
        minutes.append(time[1].strip())

    file_io.write_line(writing_file_pointer,
                       'index,year,month,day,hour,hum,temp,ppm\n')

    for i in range(len(index)):
        line = index[i] + ',' + years[i] + ',' + months[i] + ',' + days[
            i] + ',' + hours[i] + ',' + humidities[i] + ',' + temperatures[
                i] + ',' + ppm[i] + '\n'
        file_io.write_line(writing_file_pointer, line)
Example #15
    def read_data(self, data_dir, file_name, data_num):
        file_list = file_io.read_file(file_name)
        data_len = min(data_num, len(file_list))
        images = np.empty((data_len, 32, 32, 3), np.float32)
        labels = np.empty((data_len), np.uint8)
        for i in range(data_len):
            image_name, label = file_list[i].split(" ")
            images[i, :, :, :] = cv2.imread(data_dir + image_name) / 255.0
            labels[i] = int(label)

        return images, labels
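
The loader pre-allocates fixed-shape buffers and fills them one record at a time. A sketch of that pattern without the OpenCV and file dependencies, using random pixels as a stand-in for cv2.imread:

import numpy as np

data_len = 4
images = np.empty((data_len, 32, 32, 3), np.float32)
labels = np.empty((data_len,), np.uint8)

for i in range(data_len):
    images[i] = np.random.randint(0, 256, (32, 32, 3)) / 255.0   # stand-in for cv2.imread(...) / 255.0
    labels[i] = i % 10

print(images.shape, images.dtype, labels)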
Example #16
def parse_busco(busco_file, identifiers):  # pylint: disable=too-many-locals
    """Parse BUSCO results into a MultiArray."""
    data = file_io.read_file(busco_file)
    lines = data.split("\n")
    version = lines[0].split(":")[1].strip()
    desc = re.split(r":\s*|\(|\)\s*|,\s*", lines[1])
    meta = {
        "version": version,
        "set": desc[1].strip(),
        "count": max(int(desc[5].strip()), int(desc[7].strip())),
        "file": busco_file,
    }
    version = int(version.split(".")[0])
    if version < 4:
        rows = [re.split("\t", line) for line in lines[5:]]
        meta["set"] = re.search(
            r"-l\s.*?\/*(\w+_odb\d+)\/", lines[2].split(":")[1].strip()
        )[1]
        columns = re.split(r"# |\t", lines[4])[1:]
        try:
            contig_index = columns.index("Contig")
        except ValueError:
            contig_index = columns.index("Sequence")
    else:
        rows = [re.split("\t", line) for line in lines[3:]]
        columns = re.split(r"# |\t", lines[2])[1:]
        contig_index = columns.index("Sequence")
    meta["field_id"] = "%s_busco" % meta["set"]
    busco_index = columns.index("Busco id")
    status_index = columns.index("Status")
    results = defaultdict(list)
    for row in rows:
        if len(row) > contig_index:
            if version < 4:
                contig = row[contig_index]
            else:
                contig = row[contig_index].split(":")[0]
            results[contig].append([row[busco_index], row[status_index]])
    if not identifiers.validate_list(list(results.keys())):
        raise UserWarning(
            "Contig names in the Busco file did not match dataset identifiers."
        )
    values = [results[id] if id in results else [] for id in identifiers.values]
    busco_field = MultiArray(
        meta["field_id"],
        values=values,
        meta=meta,
        headers=("Busco id", "Status"),
        parents=["children"],
        category_slot=1,
    )
    return busco_field
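
For older BUSCO versions, the lineage set name is pulled out of the recorded command line with a regular expression that captures the *_odbN directory following -l. A stand-alone sketch with an illustrative command string:

import re

# Illustrative command line as it might appear in the BUSCO output header.
command = "busco -i assembly.fasta -l /opt/busco/lineages/bacteria_odb10/ -o run1"

lineage = re.search(r"-l\s.*?\/*(\w+_odb\d+)\/", command)[1]
print(lineage)               # bacteria_odb10
print("%s_busco" % lineage)  # field_id used for the MultiArray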
Example #17
def train_word2vec(filename, n_epochs=1):
    # read input file
    observations, states = parse(filename)
    sentences, labels = read_file(filename)
    # preprocess
    token_mapping = get_token_mapping(observations)
    X = prepare_inputs(token_mapping, sentences)
    # init model
    vocab_size = len(token_mapping)
    W, U = init_vars(vocab_size, latent_size=300)
    # train
    for i in trange(n_epochs, desc='Training word2vec'):
        W, U = train_epoch(i, W, U, X, observations, token_mapping)
    return W, U
Example #18
def init_data(d_type, domain, d_name):
    NUMBER_OF_FILE = 500
    positive = [
        read_file('{}/{}/{}/{}/{:03d}.txt'.format(d_type, domain, 'positive',
                                                  d_name, i))
        for i in range(NUMBER_OF_FILE)
    ]
    negative = [
        read_file('{}/{}/{}/{}/{:03d}.txt'.format(d_type, domain, 'negative',
                                                  d_name, i))
        for i in range(NUMBER_OF_FILE)
    ]

    positive = positive[:500]
    negative = negative[:500]

    positive = [p for p in positive if p]
    negative = [n for n in negative if n]

    print 'Number of positive: ', len(positive)
    print 'Number of negative: ', len(negative)

    data = positive + negative
    return data, len(positive), len(negative)
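
The review files are looked up at zero-padded paths built with str.format; the directory names below are illustrative, only the {:03d} padding comes from the code above:

path = '{}/{}/{}/{}/{:03d}.txt'.format('processed', 'books', 'positive', 'train', 7)
print(path)   # processed/books/positive/train/007.txt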
Example #19
def load_proto(file_name):
    model_param = dict()
    param = file_io.read_file(file_name)
    for li in param:
        li = li.replace(" ", "")
        if len(li) == 0 or li[0] == "#":
            continue
        name, val = li.split(":")

        val = check_digit(val)
        val = check_bool(val)
        val = check_list(val)
        val = check_none(val)

        model_param[name] = val

    return model_param
Example #20
def load_proto(file_name):
    model_param = dict()
    param = file_io.read_file(file_name)
    for li in param:
        li = li.replace(" ", "")
        name, val = li.split(":")
        if val.replace(".","").isdigit():
            val = float(val)
            if val.is_integer():
                val = int(val)
        if val == 'true' or val == "True":
            val = True
        if val == 'false' or val == "False":
            val = False

        model_param[name] = val
    return model_param
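
Both load_proto variants expect a plain "name: value" file, one setting per line; the first variant also skips blank and '#' comment lines and coerces numeric and boolean strings. A self-contained sketch of that coercion on an in-memory config (the setting names are made up):

text = """
# model settings
learning_rate: 0.01
n_hidden: 128
use_dropout: True
"""

model_param = {}
for li in text.split('\n'):
    li = li.replace(' ', '')
    if len(li) == 0 or li[0] == '#':
        continue
    name, val = li.split(':')
    if val.replace('.', '').isdigit():
        val = float(val)
        if val.is_integer():
            val = int(val)
    if val in ('true', 'True'):
        val = True
    if val in ('false', 'False'):
        val = False
    model_param[name] = val

print(model_param)   # {'learning_rate': 0.01, 'n_hidden': 128, 'use_dropout': True}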
Example #21
def parse_busco(busco_file, identifiers):
    """Parse BUSCO results into a MultiArray."""
    data = file_io.read_file(busco_file)
    lines = data.split('\n')
    rows = [re.split('\t', line) for line in lines[5:]]
    meta = {
        'version': lines[0].split(':')[1].strip(),
        'set': re.split(r':|\(|\)', lines[1])[1].strip(),
        'count': int(re.split(r':|\(|\)', lines[1])[5].strip()),
        'command': lines[2].split(':')[1].strip(),
        'file': busco_file
    }
    meta['set'] = re.search(r'-l\s.*?\/*(\w+_odb\d+)\/', meta['command'])[1]
    meta['field_id'] = "%s_busco" % meta['set']
    columns = re.split(r'# |\t', lines[4])[1:]
    busco_index = columns.index('Busco id')
    status_index = columns.index('Status')
    contig_index = columns.index('Contig')
    results = defaultdict(list)
    for row in rows:
        if len(row) > contig_index:
            results[row[contig_index]].append(
                [row[busco_index], row[status_index]])
    if not identifiers.validate_list(list(results.keys())):
        raise UserWarning(
            'Contig names in the Busco file did not match dataset identifiers.'
        )
    values = [
        results[id] if id in results else [] for id in identifiers.values
    ]
    busco_field = MultiArray(meta['field_id'],
                             values=values,
                             meta=meta,
                             headers=('Busco id', 'Status'),
                             parents=['children'],
                             category_slot=1)
    return busco_field
Example #22
def apply_filter(ids, text_file, **kwargs):
    """Filter Text file."""
    suffix = kwargs["--suffix"]
    path = pathlib.Path(text_file)
    outfile = str(path.parent / (path.stem + "." + suffix + path.suffix))
    data = file_io.read_file(text_file)
    lines = data.split("\n")
    delimiter = kwargs["--text-delimiter"]
    delimit = set_delimiter(delimiter, sample=lines[0])
    id_col = int(kwargs["--text-id-column"]) - 1
    output = []
    if kwargs["--text-header"]:
        header_row = lines.pop(0).rstrip()
        output.append(header_row)
    for line in lines:
        row = re.split(delimit, line.replace('"', ""))
        try:
            if row[id_col] in ids:
                output.append(line)
        except IndexError:
            output.append(line)
    file_io.write_file(outfile, output, plain=True)
Example #23
    def test_read_file_content(self):
        """ test for reading file content """

        for line in read_file(self.temp_file_path):
            self.assertEqual(line, self.file_content)
Example #24
    for i in xrange(input_shape[2]):
        new_array[:, :, i] = cv2.resize(np_array[:, :, i], tuple(image_size))
    return new_array


def process_image(image_name):
    res2a = net.blobs['res2a'].data.squeeze().transpose((1, 2, 0))
    res3a = net.blobs['res3a'].data.squeeze().transpose((1, 2, 0))
    res4a = net.blobs['res4a'].data.squeeze().transpose((1, 2, 0))

    image = caffe.io.load_image(image_name)
    transformed_image = transformer.preprocess('data', image)
    net.blobs['data'].data[...] = transformed_image
    net.forward()
    res2a = resize_image(res2a, output_shape)
    res3a = resize_image(res3a, output_shape)
    res4a = resize_image(res4a, output_shape)
    """ shape of hypercolumn is (56, 56, 1792) """
    hypercolumn = np.concatenate((res2a, res3a, res4a), 2)
    return hypercolumn


image_name_list = file_io.read_file(file_list_name)
count = 0
for image_name in image_name_list:
    hypercolumn = process_image(image_name)
    feature_name = image_name.replace(".jpg", ".resnet_hypercolumn")
    hypercolumn.tofile(feature_name)
    count = count + 1
    print("count: %d / %d" % (count, len(image_name_list)))
Example #25
    res4a = resize_image(res4a, output_shape)
    #res5a = resize_image(res5a, output_shape)

    #res2a = imresize(res2a, output_shape)
    #res3a = imresize(res3a, output_shape)
    #res4a = imresize(res4a, output_shape)
    #res5a = imresize(res5a, output_shape + (output_shape[2], 1))
    #res2a = cv2.resize(res2a, output_shape)
    #res3a = cv2.resize(res3a, output_shape)
    #res4a = cv2.resize(res4a, output_shape)
    #res5a = cv2.resize(res5a, output_shape)

    #hypercolumn = np.concatenate((conv1, res2a, res3a, res4a, res5a), 2)
    """ shape of hypercolumn is (56, 56, 1792) """
    hypercolumn = np.concatenate((res2a, res3a, res4a), 2)
    return hypercolumn


image_name_list = file_io.read_file("../file_list/image_name_list.txt")
count = 0
for image_name in image_name_list:
    hypercolumn = process_image(image_name)
    feature_name = image_name.replace(".jpg", ".resnet_hypercolumn")
    hypercolumn.tofile(feature_name)
    count = count + 1
    print("count: %d / %d" % (count, len(image_name_list)))
#print(res2a.shape)
#print(res3a.shape)
#print(res4a.shape)
#print(res5a.shape)
Example #26
import file_io
import numpy as np
import matplotlib.pyplot as plt

result_list = file_io.read_file("resdeconv_results/results.txt")
result_list.sort()
#result_list = file_io.read_file("results/results.txt")
diff_list = list()
true_count = list()
estimate_count = list()
for result in result_list:
    img_name, label, infer = result.split(" ")
    diff = abs(float(label) - float(infer))
    diff_list.append(diff)
    true_count.append(float(label))
    estimate_count.append(float(infer))

diff_list = np.array(diff_list)
print(np.mean(diff_list))
print(np.mean(np.square(diff_list)))

plt.plot(true_count, 'g')
plt.plot(estimate_count, 'r')
plt.show()
Example #27
def test_deconvolution(deconv_data,
                       clean_data_file,
                       random_seed=None,
                       kernel=None,
                       metric='mean'):
    """Test deconvolution

    This method tests the quality of the deconvolved images

    Parameters
    ----------
    deconv_data : np.ndarray
        Deconvolved data, 3D array
    clean_data_file : str
        Clean data file name
    random_seed : int, optional
        Random seed
    kernel : int, optional
        Standard deviation of Gaussian kernel
    metric : str {mean, median}, optional
        Metric for averaging results (default is 'mean')

    Returns
    -------
    np.ndarray pixel errors, np.ndarray ellipticity errors

    Raises
    ------
    ValueError
        If the number of clean images does not match the number of deconvolved
        images

    """

    if not isinstance(random_seed, type(None)):
        np.random.seed(random_seed)
        clean_data = read_file(clean_data_file)
        clean_data = np.random.permutation(clean_data)[:deconv_data.shape[0]]
    else:
        clean_data = read_file(clean_data_file)[:deconv_data.shape[0]]

    if clean_data.shape != deconv_data.shape:
        raise ValueError('The number of clean images must match the number '
                         'of deconvolved images.')

    if not isinstance(kernel, type(None)):

        def add_weights(data, weight):

            return np.array([x * weight for x in data])

        gk = gaussian_kernel(clean_data[0].shape, kernel)

        deconv_data = add_weights(deconv_data, gk)
        clean_data = add_weights(clean_data, gk)

    if metric == 'median':
        metric = np.median
    else:
        metric = np.mean

    px_err = nmse(clean_data, deconv_data, metric)
    ellip_err = e_error(clean_data, deconv_data, metric)
    psnr = psnr_stack(clean_data, deconv_data, metric)

    return (px_err, ellip_err, psnr)
Example #28
def init_set_in_file(file_path):
    words = read_file(file_path).split('\n')
    return set([word for word in words if word])
Example #29
    def __init__(self):
        self.html = markdown.markdown(read_file('README.md'))
        accounts = get_accounts_from_file()
        self.accounts_list = [
            account for account in accounts if account.strip() != ''
        ]