Example #1
    def tau_filt(self, tau_d = None):
        if tau_d is None:
            tau_d = self._taus
        t_low = 0.001
        t_hi = 0.01
        median_taus = []
        num_taus = []
        proto_entry = next(iter(tau_d.values()))[0]  # first entry, used to count detectors
        det_taus = {det:[] for det in range(len(proto_entry))}
        for k in tau_d:
            for taus in tau_d[k]:
                for det in range(len(taus)):
                    det_taus[det] += [taus[det]]
        medians = []
        means = []
        var = []
        iqr = []
        for k in det_taus:
            medians += [np.median(det_taus[k])]
            means += [np.mean(det_taus[k])]
            var += [np.var(det_taus[k])]
            iqr += [stats.iqr(det_taus[k])]

        #t_low = np.array(means) - np.array(iqr)
        #t_hi = np.array(means) + np.array(iqr)
        t_low = np.array(means) - 4 * np.sqrt(np.array(var))
        t_hi = np.array(means) + 4 * np.sqrt(np.array(var))
        for k in tau_d:
            count = 0
            for taus in tau_d[k]:
                b = (taus > t_low) * (taus < t_hi)
                self._cuts[k][count] *= b
                count += 1
        return medians, means, var, iqr
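The cut above flags, per detector, the taus that fall within mean ± 4·std of that detector's distribution. A compact standalone sketch of the same bounds computation on a 2-D array (rows = measurements, columns = detectors; the data and names are illustrative):

import numpy as np

taus = np.random.default_rng(0).normal(1e-3, 1e-4, size=(100, 4))   # toy per-detector taus
t_low = taus.mean(axis=0) - 4 * taus.std(axis=0)                    # per-detector lower bound
t_hi = taus.mean(axis=0) + 4 * taus.std(axis=0)                     # per-detector upper bound
cuts = (taus > t_low) & (taus < t_hi)                               # boolean cut, shape (100, 4)
print(cuts.mean(axis=0))                                            # pass fraction per detector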
Example #2
    def prepare_y_data(self, chunk_len):
        # update scaling stats
        self.stats_update_counter += chunk_len
        if self.stats_update_counter > self.n_samples//3:
            self.mean = np.nanmean(self.y_raw_buffer, 0)
            # note: rng=(0, 100) makes this the full min-max range rather than the 25-75% IQR
            self.iqr = stats.iqr(self.y_raw_buffer, 0, rng=(0, 100), nan_policy='omit')
            self.iqr[self.iqr <= 0] = 1
            self.stats_update_counter = 0

        # return scaled signals
        return ((self.y_raw_buffer - self.mean) / self.iqr)[:, self.c_slice]
Example #3
def parse_kallisto_stats(abundance):
    import numpy as np
    import pandas as pd

    stats = dict()
    df = pd.read_table(abundance, sep="\t")
    stats['transcripts'] = df.shape[0]
    stats['zero-count_transcripts'] = (df['est_counts'] == 0).sum()
    stats['non-zero-count_transcripts'] = (df['est_counts'] > 0).sum()
    log_tpm = np.log2(1 + df['tpm'])
    stats['log2tpm_mean'] = log_tpm.mean()
    stats['log2tpm_median'] = log_tpm.median()
    p_log_tpm = np.log2(1 + df['tpm'].where(lambda x: x > 0)).dropna()
    stats['non-zero_log2tpm_mean'] = p_log_tpm.mean()
    stats['non-zero_log2tpm_median'] = p_log_tpm.median()
    try:
        from scipy.stats import iqr
        stats['log2tpm_iqr'] = iqr(log_tpm)
        stats['non-zero_log2tpm_iqr'] = iqr(p_log_tpm)
    except ImportError:
        stats['log2tpm_iqr'] = np.nan
        stats['non-zero_log2tpm_iqr'] = np.nan

    return stats
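A minimal usage sketch, writing a tiny fake abundance table with the columns the parser expects (the file name and values are illustrative):

import pandas as pd

fake = pd.DataFrame({'target_id': ['t1', 't2', 't3'],
                     'est_counts': [0.0, 10.0, 250.0],
                     'tpm': [0.0, 5.0, 120.0]})
fake.to_csv('abundance.tsv', sep='\t', index=False)
print(parse_kallisto_stats('abundance.tsv'))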
Example #4
    def get_outliers(self, data, thr):
        """Detects outlier points based on chosen method and theshold."""

        # get outlier threshold
        iqr = stats.iqr(data, nan_policy='omit') # inter-quartile range
        bound = thr * iqr # multiple of IQR
        high_bound = np.percentile(data[~np.isnan(data)], 75) + bound
        low_bound = np.percentile(data[~np.isnan(data)], 25) - bound
        # compare data to bounds
        old_settings = np.seterr(invalid='ignore')
        outliers = np.where((data < low_bound) | (data > high_bound))[0]
        np.seterr(**old_settings)

        return outliers
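The same IQR-fence detection written as a standalone sketch (function name and test data are illustrative, not part of the original class):

import numpy as np
from scipy import stats

def iqr_outliers(data, thr=1.5):
    # Indices of points outside [Q1 - thr*IQR, Q3 + thr*IQR], ignoring NaNs.
    data = np.asarray(data, dtype=float)
    finite = data[~np.isnan(data)]
    spread = stats.iqr(finite)
    low = np.percentile(finite, 25) - thr * spread
    high = np.percentile(finite, 75) + thr * spread
    return np.where((data < low) | (data > high))[0]

print(iqr_outliers([1.0, 2.0, 2.5, 3.0, 50.0]))  # -> [4]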
Example #5
def interquartile_range(full_list_X, full_list_Y):
    rowx = list()
    rowy = list()
    iqrX = list()
    iqrY = list()
    for i in range(len(full_list_X)):
        x = full_list_X[i]
        rowx = (stats.iqr(x, axis=0)).tolist()
        temp = list(zip(*(full_list_Y[i])))  # materialize so it can be indexed in Python 3
        y1 = list(temp[0])
        iqrY.append(y1[0])
        iqrX.append(rowx)
        rowx = list()
        rowy = list()

    return iqrX, iqrY
Example #6
def add_features_in_group(features, gr_, feature_name, aggs, prefix):
    for agg in aggs:
        if agg == 'sum':
            features['{}{}_sum'.format(prefix, feature_name)] = gr_[feature_name].sum()
        elif agg == 'mean':
            features['{}{}_mean'.format(prefix, feature_name)] = gr_[feature_name].mean()
        elif agg == 'max':
            features['{}{}_max'.format(prefix, feature_name)] = gr_[feature_name].max()
        elif agg == 'min':
            features['{}{}_min'.format(prefix, feature_name)] = gr_[feature_name].min()
        elif agg == 'std':
            features['{}{}_std'.format(prefix, feature_name)] = gr_[feature_name].std()
        elif agg == 'count':
            features['{}{}_count'.format(prefix, feature_name)] = gr_[feature_name].count()
        elif agg == 'skew':
            features['{}{}_skew'.format(prefix, feature_name)] = skew(gr_[feature_name])
        elif agg == 'kurt':
            features['{}{}_kurt'.format(prefix, feature_name)] = kurtosis(gr_[feature_name])
        elif agg == 'iqr':
            features['{}{}_iqr'.format(prefix, feature_name)] = iqr(gr_[feature_name])
        elif agg == 'median':
            features['{}{}_median'.format(prefix, feature_name)] = gr_[feature_name].median()

    return features
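A usage sketch with a toy DataFrame, assuming the function above is in scope and that iqr, skew and kurtosis come from scipy.stats (column and prefix names are illustrative):

import pandas as pd
from scipy.stats import iqr, skew, kurtosis

df = pd.DataFrame({'id': [1, 1, 1, 2, 2],
                   'amount': [10.0, 12.0, 30.0, 5.0, 7.0]})
features = {}
for group_id, gr_ in df.groupby('id'):
    features = add_features_in_group(features, gr_, 'amount',
                                     ['mean', 'iqr', 'median'],
                                     prefix='grp{}_'.format(group_id))
print(features)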
Example #7
def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
    """
    Compute the Epps-Singleton (ES) test statistic.

    Test the null hypothesis that two samples have the same underlying
    probability distribution.

    Parameters
    ----------
    x, y : array-like
        The two samples of observations to be tested. Input must not have more
        than one dimension. Samples can have different lengths.
    t : array-like, optional
        The points (t1, ..., tn) where the empirical characteristic function is
        to be evaluated. These should be positive, distinct numbers. The default
        value (0.4, 0.8) is proposed in [1]_. Input must not have more than
        one dimension.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value based on the asymptotic chi2-distribution.

    See Also
    --------
    ks_2samp, anderson_ksamp

    Notes
    -----
    Testing whether two samples are generated by the same underlying
    distribution is a classical question in statistics. A widely used test is
    the Kolmogorov-Smirnov (KS) test which relies on the empirical
    distribution function. Epps and Singleton introduce a test based on the
    empirical characteristic function in [1]_.

    One advantage of the ES test compared to the KS test is that it does
    not assume a continuous distribution. In [1]_, the authors conclude
    that the test also has a higher power than the KS test in many
    examples. They recommend the use of the ES test for discrete samples as
    well as continuous samples with at least 25 observations each, whereas
    `anderson_ksamp` is recommended for smaller sample sizes in the
    continuous case.

    The p-value is computed from the asymptotic distribution of the test
    statistic which follows a `chi2` distribution. If the sample size of both
    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
    applied to the test statistic.

    The default values of `t` are determined in [1]_ by considering
    various distributions and finding good values that lead to a high power
    of the test in general. Table III in [1]_ gives the optimal values for
    the distributions tested in that study. The values of `t` are scaled by
    the semi-interquartile range in the implementation, see [1]_.

    References
    ----------
    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
       problem using the empirical characteristic function", Journal of
       Statistical Computation and Simulation 26, p. 177--203, 1986.

    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
       - the Epps-Singleton two-sample test using the empirical characteristic
       function", The Stata Journal 9(3), p. 454--465, 2009.

    """

    x, y, t = np.asarray(x), np.asarray(y), np.asarray(t)
    # check if x and y are valid inputs
    if x.ndim > 1:
        raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim))
    if y.ndim > 1:
        raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim))
    nx, ny = len(x), len(y)
    if (nx < 5) or (ny < 5):
        raise ValueError('x and y should have at least 5 elements, but len(x) '
                         '= {} and len(y) = {}.'.format(nx, ny))
    if not np.isfinite(x).all():
        raise ValueError('x must not contain nonfinite values.')
    if not np.isfinite(y).all():
        raise ValueError('y must not contain nonfinite values.')
    n = nx + ny

    # check if t is valid
    if t.ndim > 1:
        raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim))
    if np.less_equal(t, 0).any():
        raise ValueError('t must contain positive elements only.')

    # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
    # circular import
    from scipy.stats import iqr
    sigma = iqr(np.hstack((x, y))) / 2
    ts = np.reshape(t, (-1, 1)) / sigma

    # covariance estimation of ES test
    gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T  # shape = (nx, 2*len(t))
    gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
    cov_x = np.cov(gx.T, bias=True)  # the test uses biased cov-estimate
    cov_y = np.cov(gy.T, bias=True)
    est_cov = (n/nx)*cov_x + (n/ny)*cov_y
    est_cov_inv = np.linalg.pinv(est_cov)
    r = np.linalg.matrix_rank(est_cov_inv)
    if r < 2*len(t):
        warnings.warn('Estimated covariance matrix does not have full rank. '
                      'This indicates a bad choice of the input t and the '
                      'test might not be consistent.')  # see p. 183 in [1]_

    # compute test statistic w distributed asympt. as chisquare with df=r
    g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
    w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))

    # apply small-sample correction
    if (max(nx, ny) < 25):
        corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
        w = corr * w

    p = chi2.sf(w, r)

    return Epps_Singleton_2sampResult(w, p)
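A short usage sketch through the public SciPy API (assuming SciPy >= 1.3, where this function is exposed as scipy.stats.epps_singleton_2samp; the data are synthetic):

import numpy as np
from scipy.stats import epps_singleton_2samp

rng = np.random.default_rng(42)
x = rng.normal(0.0, 1.0, size=100)
y = rng.normal(0.5, 1.0, size=100)
statistic, pvalue = epps_singleton_2samp(x, y)
print(statistic, pvalue)  # a small p-value argues against equal distributions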
Example #8
    sample[2] = np.random.laplace(0, scale_laplace, sample_size)
    sample[3] = np.random.standard_cauchy(size=sample_size)
    sample[4] = np.concatenate([
        np.random.standard_normal(size=mixed_size_a),
        np.random.normal(0, scale_mixed, mixed_size_b)
    ])

    for i in range(5):
        sample[i] = np.sort(sample[i])

    # Calculate all sums of characteristics of samples
    for i in range(5):
        mean_square_dev[i] += mean_square(sample[i])
        average_absolute_dev[i] += average_absolute(sample[i])
        average_range[i] += av_range(sample[i])
        inter_quartile_range[i] += stats.iqr(sample[i])
        median_absolute_dev[i] += median_absolute_deviation(sample[i])

        mean_square_dev_square[i] += pow(mean_square(sample[i]), 2)
        average_absolute_dev_square[i] += pow(average_absolute(sample[i]), 2)
        average_range_square[i] += pow(av_range(sample[i]), 2)
        inter_quartile_range_square[i] += pow(stats.iqr(sample[i]), 2)
        median_absolute_dev_square[i] += pow(
            median_absolute_deviation(sample[i]), 2)

print_mean_result('s', mean_square_dev, mean_square_dev_square)
print_mean_result('d', average_absolute_dev, average_absolute_dev_square)
print_mean_result('R', average_range, average_range_square)
print_mean_result('IQR', inter_quartile_range, inter_quartile_range_square)
print_mean_result('MAD', median_absolute_dev, median_absolute_dev_square)
Example #9
    def rmse(self, ground_truth, simulation, join='inner', fill_value=0,
        relative=False, cumulative=False, normed=False):
        """
        Metric: rmse

        Description: Root mean squared error

        Inputs:
        ground_truth - ground truth measurement (data frame) with measurement in
            the "value" column
        simulation - simulation measurement (data frame) with measurement in the
            "value" column
        join - type of join to perform between ground truth and simulation
        fill_value - fill value for non-overlapping joins
        relative - if True, normalize the RMSE by the IQR (or, if that is zero,
            the mean) of the ground truth
        cumulative - if True, compare cumulative sums of the two series
        normed - if True, rescale both series by their (epsilon-shifted) maxima
            before comparison
        """

        if type(ground_truth) is np.ndarray:
            result = ground_truth - simulation
            result = (result ** 2).mean()
            result = np.sqrt(result)
            return result

        if type(ground_truth) is list:

            ground_truth = np.nan_to_num(ground_truth)
            simulation   = np.nan_to_num(simulation)

            result = np.asarray(ground_truth) - np.asarray(simulation)
            result = (result ** 2).mean()
            result = np.sqrt(result)

            return result

        df = self.join_dfs(ground_truth, simulation, join=join,
            fill_value=fill_value)


        if len(df.index) > 0:

            if cumulative:
                df['value_sim'] = df['value_sim'].cumsum()
                df['value_gt'] = df['value_gt'].cumsum()

            if normed:
                epsilon = 0.001*df[df['value_gt'] != 0.0]['value_gt'].min()
                df['value_sim'] = (df['value_sim'] + epsilon)/(df['value_sim'].max() + epsilon)
                df['value_gt'] = (df['value_gt'] + epsilon)/(df['value_gt'].max() + epsilon)

            if not relative:
                return np.sqrt(((df["value_sim"]-df["value_gt"])**2).mean())
            else:
                iq_range = float(iqr(df['value_gt'].values))

                result = df["value_sim"]-df["value_gt"]
                result = (result ** 2).mean()
                result = np.sqrt(result)

                if iq_range > 0:
                    result = result / iq_range
                else:
                    mean_value = df['value_gt'].mean()
                    if mean_value > 0:
                        result = result / mean_value
                    else:
                        return None

                return result
        else:
            return None
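With relative=True the metric reduces to the RMSE divided by the IQR of the ground truth, falling back to the mean when the IQR is zero. A standalone sketch of that core computation (names illustrative):

import numpy as np
from scipy.stats import iqr

def relative_rmse(gt, sim):
    gt, sim = np.asarray(gt, dtype=float), np.asarray(sim, dtype=float)
    rmse = np.sqrt(((sim - gt) ** 2).mean())
    scale = iqr(gt)
    if scale == 0:
        scale = gt.mean()  # same fallback as the method above
    return rmse / scale if scale > 0 else None

print(relative_rmse([1, 2, 3, 4], [1.1, 2.2, 2.9, 4.3]))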
Example #10
A_po = evok_po.data
A_pr = evok_pr.data

# compute cohen d
M1 = np.mean(A_po, axis=1)
M2 = np.mean(A_pr, axis=1)
std1 = np.std(A_po, axis=1)
std2 = np.std(A_pr, axis=1)
n1 = A_po.shape[1]
n2 = A_pr.shape[1]
std = np.sqrt(np.divide((n1-1)*std1**2+(n2-1)*std2**2,(n1+n2-2)))
cohen = np.divide(M1-M2, std)

# Compute number of bins

iqr = spstats.iqr(M1)
n = M1.size 
maximum = np.max(M1)
minimum = np.min(M1)
h = 2*iqr/(n**(1/3))
nbin = (maximum - minimum)/h
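# The two lines above implement the Freedman-Diaconis rule (bin width h = 2*IQR*n**(-1/3)).
# NumPy exposes the same rule directly via np.histogram_bin_edges(data, bins='fd')
# (illustrative alternative, not used in the original snippet).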

from numpy import inf
M1[M1==-inf]=0
M2[M2 == -inf] =0
plt.hist(M2, bins = 80)
plt.xlabel('Mean normalised Amplitude (dB)')
plt.ylabel('Number of electrodes')
plt.title('Prestimulus mean stimulus related HFB amplitude (-400 to -100 ms) ')
plt.hist(M1, bins = 80)
plt.xlabel('Mean normalised Amplitude (dB)')
Example #11
#plot scatter graph
plt.figure()
plt.scatter(x, norm_y, color='black')
plt.title("Values over Time to Identify Outliers")
plt.xlabel("Data Reading (Time)")
plt.ylabel("0-1 Normalised Value")
plt.grid(True)

#compute averages
len = len(y)
mean = np.mean(y)
median = np.median(y)
mode = stats.mode(y)[0][0]
mode_count = stats.mode(y)[1][0]
range = np.max(y) - np.min(y)
iqrange = stats.iqr(y)
std_dev = np.std(y)
z_score = stats.zscore(y)
std_err = stats.sem(y)
con_inter = stats.bayes_mvs(y, alpha=0.95)
#95% confidence interval for mean, var, and std reported as (center, (lower, upper))

#plot z scores scatter graph
plt.figure()
plt.scatter(x, z_score, color='black')
plt.title("Z-Scores of Data Points")
plt.xlabel("Data Reading (Time)")
plt.ylabel("Z-Score")
plt.grid(True)

#print descriptive statistics
Example #12
def get_textual_metadata(annotated_content, size_kb, wsdir, master, idno_file):

    root_document = etree.parse(wsdir + master + idno_file).getroot()
    specific_namespaces = {
        'tei': 'http://www.tei-c.org/ns/1.0',
        'xi': 'http://www.w3.org/2001/XInclude',
        'cligs': 'https://cligs.hypotheses.org/ns/cligs'
    }
    chapters = root_document.xpath("//tei:body//tei:div[@type='chapter']",
                                   namespaces=specific_namespaces)
    len_chapters = []
    for chapter in chapters:
        len_chapters.append(
            len(" ".join(
                chapter.xpath(".//text()", namespaces=specific_namespaces))))
    len_chapters = np.array(len_chapters)

    text_measures = ""

    text_measures = text_measures + '\n\t\t\t\t<measure unit="chapters.len.mean">' + str(
        "%.2f" % round(len_chapters.mean(), 2)) + r'</measure>'
    text_measures = text_measures + '\n\t\t\t\t<measure unit="chapters.len.std">' + str(
        "%.2f" % round(len_chapters.std(), 2)) + r'</measure>'
    text_measures = text_measures + '\n\t\t\t\t<measure unit="chapters.len.median">' + str(
        "%.2f" % round(np.percentile(len_chapters, q=50), 2)) + r'</measure>'
    text_measures = text_measures + '\n\t\t\t\t<measure unit="chapters.len.iqr">' + str(
        "%.2f" % round(stats.iqr(len_chapters), 2)) + r'</measure>'

    content_abstract = re.findall(r'<abstract.*?>(.*?)</abstract>',
                                  annotated_content,
                                  flags=re.DOTALL)[0]
    content_abstract = re.sub(r'</?.*?>',
                              r'',
                              content_abstract,
                              flags=re.DOTALL)
    content_abstract = re.sub(r'\s\s+', r' ', content_abstract)
    len_abstract = str(len(content_abstract))

    annotated_content = re.sub(r'<teiHeader>.*?</teiHeader>',
                               r'',
                               annotated_content,
                               flags=re.DOTALL)

    # Divs and groups of lines are counted
    divs = str(annotated_content.count("<div"))
    lines = str(len(re.findall(r'\n+', annotated_content)))

    # Different TEI elements are counted
    chapters = str(
        len(re.findall(r'<div[^>]*?type="chapter"', annotated_content)))
    short_stories = str(
        len(re.findall(r'<div[^>]*?type="shortStories"', annotated_content)))
    parts = str(len(re.findall(r'<div[^>]*?type="part"', annotated_content)))
    sections = str(
        len(re.findall(r'<div[^>]*?type="section"', annotated_content)))
    divisions = str(
        len(re.findall(r'<div[^>]*?type="division"', annotated_content)))
    blocks = str(
        len(
            re.findall(r'<(l|ab|head|stage|sp|p|ab)( .+?|)>',
                       annotated_content)))
    line_verses = str(len(re.findall(r'<(l)( .+?|)>', annotated_content)))
    heads = str(len(re.findall(r'<(head)( .+?|)>', annotated_content)))
    stages = str(len(re.findall(r'<(stage)( .+?|)>', annotated_content)))
    sps = str(len(re.findall(r'<(sp)( .+?|)>', annotated_content)))
    ps = str(len(re.findall(r'<(p)( .+?|)>', annotated_content)))
    abs_ = str(len(re.findall(r'<(ab)( .+?|)>', annotated_content)))
    lg_poems = str(len(re.findall(r'<lg type="poem">', annotated_content)))
    lg_stanzas = str(len(re.findall(r'<lg type="stanza">', annotated_content)))
    ft = str(len(re.findall(r'<(floatingText)( .+?|)>', annotated_content)))
    punctual_ss = str(
        len(re.findall(r'<milestone unit="s"/>', annotated_content)))

    # Paragraphs marked as containing direct speech are counted
    saids = str(len(re.findall(r'<said>', annotated_content)))
    speech_ps = str(
        len(re.findall(r'<p rend="direct-speech">', annotated_content)))
    narrative_ps = str(len(re.findall(r'<p>', annotated_content)))

    # Then the text is converted into plaintext and the white space cleaned
    plain_body = annotated_content
    plain_body = re.sub(r'</?.*?>', r'', plain_body, flags=re.DOTALL)
    plain_body = re.sub(r'[\t ]+', r' ', plain_body)
    plain_body = re.sub(r'\n[\n]+', r'\n', plain_body)

    # Characters and words are counted
    chars = str(len(plain_body))
    tokens = str(len(re.findall(r'[\wáéíóúñü\d]+', plain_body)))

    # If we want some more info, the numbers of numerals and punctuation marks are counted
    numerals = str(len(re.findall(r'\d+', plain_body)))
    puncts = str(
        len(
            re.findall(r'[!"\#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~¿¡…—–~»«]',
                       plain_body)))

    textual_metadata = r'\n\t\t\t\t<measure unit="lines">' + re.escape(
        lines) + r'</measure>\n\t\t\t\t<measure unit="divs">' + re.escape(
            divs
        ) + r'</measure>\n\t\t\t\t<measure unit="tokens">' + re.escape(
            tokens
        ) + r'</measure>\n\t\t\t\t<measure unit="chars">' + re.escape(
            chars
        ) + r'</measure>\n\t\t\t\t<measure unit="size_kb">' + re.escape(
            size_kb
        ) + r'</measure>\n\t\t\t\t<measure unit="chapters">' + re.escape(
            chapters
        ) + r'</measure>\n\t\t\t\t<measure unit="shortStories">' + re.escape(
            short_stories
        ) + r'</measure>\n\t\t\t\t<measure unit="parts">' + re.escape(
            parts
        ) + r'</measure>\n\t\t\t\t<measure unit="sections">' + re.escape(
            sections
        ) + r'</measure>\n\t\t\t\t<measure unit="divisions">' + re.escape(
            divisions
        ) + r'</measure> \n\t\t\t\t<measure unit="blocks">' + re.escape(
            blocks
        ) + r'</measure> \n\t\t\t\t<measure unit="lg.poems">' + re.escape(
            lg_poems
        ) + r'</measure> \n\t\t\t\t<measure unit="lg.stanzas">' + re.escape(
            lg_stanzas
        ) + r'</measure> \n\t\t\t\t<measure unit="line.verses">' + re.escape(
            line_verses
        ) + r'</measure> \n\t\t\t\t<measure unit="heads">' + re.escape(
            heads
        ) + r'</measure> \n\t\t\t\t<measure unit="stages">' + re.escape(
            stages
        ) + r'</measure> \n\t\t\t\t<measure unit="sps">' + re.escape(
            sps
        ) + r'</measure> \n\t\t\t\t<measure unit="paragraphs">' + re.escape(
            ps
        ) + r'</measure> \n\t\t\t\t<measure unit="abs">' + re.escape(
            abs_
        ) + r'</measure> \n\t\t\t\t<measure unit="fts">' + re.escape(
            ft
        ) + r'</measure>\n\t\t\t\t<measure unit="paragraphs.ds">' + re.escape(
            speech_ps
        ) + r'</measure>\n\t\t\t\t<measure unit="saids">' + re.escape(
            saids
        ) + r'</measure>\n\t\t\t\t<measure unit="narrative.ps">' + re.escape(
            narrative_ps
        ) + r'</measure>\n\t\t\t\t<measure unit="punctual_ss">' + re.escape(
            punctual_ss
        ) + r'</measure> \n\t\t\t\t<measure unit="numerals">' + re.escape(
            numerals
        ) + r'</measure> \n\t\t\t\t<measure unit="puncts">' + re.escape(
            puncts
        ) + r'</measure> \n\t\t\t\t<measure unit="len.abstract">' + re.escape(
            len_abstract) + r'</measure>' + text_measures

    return textual_metadata
Example #13
def run(input, mask, outputfile, verbose, dimensions, svdradius, haralickwindow, binsize,label, extendstats):
  """CoLlAGe captures subtle anisotropic differences in disease pathologies by measuring entropy of co-occurrences of voxel-level gradient orientations on imaging computed within a local neighborhood."""
  
  if input.endswith('.csv'):
      header = ['ID', 'Image', 'Mask', 'svdradius', 'haralickwindow', 'binsize', 'label']
      features_list = []
      list_failed_cases = [['ID', 'Image', 'Mask', 'Error']]
      if dimensions == 2:
        suffix = ''
        for feature in collageradiomics.HaralickFeature:
          if extendstats:
            features_list.extend(['Collage'+feature.name+'Median'+suffix, 'Collage'+feature.name+'IQR'+suffix, 'Collage'+feature.name+'Skewness'+suffix, 'Collage'+feature.name+'Kurtosis'+suffix, 'Collage'+feature.name+'Mean'+suffix, 'Collage'+feature.name+'Variance'+suffix])
          else:
            features_list.extend(['Collage'+feature.name+'Median'+suffix,  'Collage'+feature.name+'Skewness'+suffix, 'Collage'+feature.name+'Kurtosis'+suffix, 'Collage'+feature.name+'Variance'+suffix])
        header.extend(features_list)
        output_list = [header]
      else:
        for suffix in ['Theta', 'Phi']:
          for feature in collageradiomics.HaralickFeature:
            if extendstats:
              features_list.extend(['Collage'+feature.name+'Median'+suffix, 'Collage'+feature.name+'IQR'+suffix, 'Collage'+feature.name+'Skewness'+suffix, 'Collage'+feature.name+'Kurtosis'+suffix, 'Collage'+feature.name+'Mean'+suffix, 'Collage'+feature.name+'Variance'+suffix])
            else:
              features_list.extend(['Collage'+feature.name+'Median'+suffix, 'Collage'+feature.name+'Skewness'+suffix, 'Collage'+feature.name+'Kurtosis'+suffix, 'Collage'+feature.name+'Variance'+suffix])
        header.extend(features_list)
        output_list = [header]
      
      with open(input, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
          output_case = []
          try:
            case_id = row['ID']
            image_filepath = row['Image']
            mask_filepath = row['Mask']
            image = sitk.ReadImage(image_filepath)
            mask = sitk.ReadImage(mask_filepath)
            
            output_case.extend([case_id, image_filepath, mask_filepath, svdradius, haralickwindow, binsize, label])
            
            # Check if user wants to select single label from the mask
            if label != -1:
              mask = sitk.BinaryThreshold(mask, lowerThreshold = label, upperThreshold = label, insideValue = 1, outsideValue = 0)
            
            image_array = sitk.GetArrayFromImage(image)
            mask_array = sitk.GetArrayFromImage(mask)
            
            # Collage expects an x,y,z-ordered array but sitk.GetArrayFromImage returns z,y,x, so x should be swapped with z
            if dimensions != 2:
              image_array = np.swapaxes(image_array,0,2)
              mask_array = np.swapaxes(mask_array,0,2)
            
            # Remove any extra array dimensions if the user explicitly asks for 2D.
            if dimensions == 2:
              image_array = image_array[:,:,0]
              mask_array  = mask_array [:,:,0]
            
            collage = collageradiomics.Collage(
              image_array,
              mask_array,
              svd_radius=svdradius,
              verbose_logging=verbose,
              num_unique_angles=binsize)

            collage.execute()
            
            for feature in collageradiomics.HaralickFeature:
              feature_output = collage.get_single_feature_output(feature)
              if image_array.ndim == 2:
                feature_output = feature_output[~np.isnan(feature_output)]

                # NumPy supports median natively, we'll use that.
                median = np.nanmedian(feature_output, axis=None)

                # Use SciPy for kurtosis, variance, and skewness.
                feature_stats = stats.describe(feature_output, axis=None)
                
                if extendstats:
                  mean = feature_stats.mean #np.nanmean(feature_output, axis=None)
                  iqr = stats.iqr(feature_output)
                  
                  output_case.extend([median, iqr, feature_stats.skewness, feature_stats.kurtosis, feature_stats.mean, feature_stats.variance])
                else:
                  output_case.extend([median, feature_stats.skewness, feature_stats.kurtosis, feature_stats.variance])
                
              else:
                # Extract phi and theta angles.
                feature_output_theta = feature_output[:,:,:,0]
                feature_output_phi = feature_output[:,:,:,1]

                # Remove NaN for stat calculations.
                feature_output_theta = feature_output_theta[~np.isnan(feature_output_theta)]
                feature_output_phi = feature_output_phi[~np.isnan(feature_output_phi)]

                # NumPy supports median natively, we'll use that.
                median_theta = np.nanmedian(feature_output_theta, axis=None)
                median_phi = np.nanmedian(feature_output_phi, axis=None)

                # Use SciPy for kurtosis, variance, and skewness.
                feature_stats_theta = stats.describe(feature_output_theta.flatten(), axis=None)
                feature_stats_phi = stats.describe(feature_output_phi.flatten(), axis=None)
                
                if extendstats:
                  mean_theta = feature_stats_theta.mean
                  mean_phi = feature_stats_phi.mean
                  iqr_theta = stats.iqr(feature_output_theta)
                  iqr_phi = stats.iqr(feature_output_phi)
                  
                  output_case.extend([median_theta, iqr_theta, feature_stats_theta.skewness, feature_stats_theta.kurtosis, feature_stats_theta.mean, feature_stats_theta.variance, median_phi, iqr_phi, feature_stats_phi.skewness, feature_stats_phi.kurtosis, feature_stats_phi.mean, feature_stats_phi.variance])
                else:
                  output_case.extend([median_theta, feature_stats_theta.skewness, feature_stats_theta.kurtosis, feature_stats_theta.variance, median_phi, feature_stats_phi.skewness, feature_stats_phi.kurtosis, feature_stats_phi.variance])
            output_list.append(output_case)
          except RuntimeError as err:
            list_failed_cases.append([case_id, image_filepath, mask_filepath, err])
          except ValueError as err:
            list_failed_cases.append([case_id, image_filepath, mask_filepath, err])
      
      # Create collage radiomic features output csv file
      with open(outputfile, 'w') as file:
        writer = csv.writer(file)
        writer.writerows(output_list)
      
      # Create errors output csv file
      with open(os.path.join(os.path.dirname(outputfile), 'errors_' + os.path.basename(outputfile)), 'w') as file:
        writer = csv.writer(file)
        writer.writerows(list_failed_cases)
  else:
      image = sitk.ReadImage(input)
      mask = sitk.ReadImage(mask)
            
      # Check if user wants to select single label from the mask
      if label != -1:
        mask = sitk.BinaryThreshold(mask, lowerThreshold = label, upperThreshold = label, insideValue = 1, outsideValue = 0)

      image_array = sitk.GetArrayFromImage(image)
      mask_array = sitk.GetArrayFromImage(mask)
            
      # Collage expects an x,y,z-ordered array but sitk.GetArrayFromImage returns z,y,x, so x should be swapped with z
      if dimensions != 2:
        image_array = np.swapaxes(image_array,0,2)
        mask_array = np.swapaxes(mask_array,0,2)

      # Remove any extra array dimensions if the user explicitly asks for 2D.
      if dimensions == 2:
        image_array = image_array[:,:,0]
        mask_array  = mask_array [:,:,0]

      collage = collageradiomics.Collage(
        image_array,
        mask_array,
        svd_radius=svdradius,
        verbose_logging=verbose,
        num_unique_angles=binsize)

      collage.execute()

      # Create a csv file at the passed in output file location.
      with open(outputfile, 'w', newline='') as csv_output_file:
        writer = csv.writer(csv_output_file)

        # Write the columns.
        writer.writerow(['FeatureName', 'Value'])
        for feature in collageradiomics.HaralickFeature:
          feature_output = collage.get_single_feature_output(feature)
          if image_array.ndim == 2:
            feature_output = feature_output[~np.isnan(feature_output)]

            # NumPy supports median natively, we'll use that.
            median = np.nanmedian(feature_output, axis=None)

            # Use SciPy for kurtosis, variance, and skewness.
            feature_stats = stats.describe(feature_output, axis=None)

            # Write CSV row for current feature.
            _write_csv_stats_row(writer, feature, median, feature_stats.skewness, feature_stats.kurtosis, feature_stats.variance)
          else:
            # Extract phi and theta angles.
            feature_output_theta = feature_output[:,:,:,0]
            feature_output_phi = feature_output[:,:,:,1]

            # Remove NaN for stat calculations.
            feature_output_theta = feature_output_theta[~np.isnan(feature_output_theta)]
            feature_output_phi = feature_output_phi[~np.isnan(feature_output_phi)]

            # NumPy supports median natively, we'll use that.
            median_theta = np.nanmedian(feature_output_theta, axis=None)
            median_phi = np.nanmedian(feature_output_phi, axis=None)

            # Use SciPy for kurtosis, variance, and skewness.
            feature_stats_theta = stats.describe(feature_output_theta.flatten(), axis=None)
            feature_stats_phi = stats.describe(feature_output_phi.flatten(), axis=None)
            
            if extendstats:
                mean_phi = feature_stats_phi.mean
                iqr_phi = stats.iqr(feature_output_phi.flatten())
                
                mean_theta = feature_stats_theta.mean
                iqr_theta = stats.iqr(feature_output_theta.flatten())
                
                _write_csv_extented_stats_row(writer, feature, median_theta, iqr_theta, feature_stats_theta.skewness, feature_stats_theta.kurtosis, mean_theta, feature_stats_theta.variance, 'Theta')
                _write_csv_extented_stats_row(writer, feature, median_phi, iqr_phi, feature_stats_phi.skewness, feature_stats_phi.kurtosis, mean_phi, feature_stats_phi.variance, 'Phi')
            else:
                # Write CSV rows for each angle.
                _write_csv_stats_row(writer, feature, median_theta, feature_stats_theta.skewness, feature_stats_theta.kurtosis, feature_stats_theta.variance, 'Theta')
                _write_csv_stats_row(writer, feature, median_phi, feature_stats_phi.skewness, feature_stats_phi.kurtosis, feature_stats_phi.variance, 'Phi')
Example #14
    recall = float(recall)

    # calculate F1 score
    if (precision + recall) != 0:
        f1 = (2 * precision * recall) / (precision + recall)
        f1_scores.append(f1)

# Calculate cross-validation average
print('\n-----------------------------------')
print('sklearn.tree.DecisionTreeClassifier Model 1')
print('\tFeatures: speed, X-accel, Y-accel, Z-accel, Z-jolt')
print('\tLabels: speedbump (1 = yes, 0 = no)')
print('\tAverage F1 score:', np.mean(f1_scores))
print('\tStdDev F1 score:', np.std(f1_scores))
print('\tMedian F1 score:', np.median(f1_scores))
print('\tIQR F1 score:', stats.iqr(f1_scores))
print('\tSkewness F1 score:', stats.skew(f1_scores))
print('\tZero F1 score:', f1_scores.count(0.00))

# # Decision Tree Model 2
# # Separate Y and X variables
# df_label = df.loc[:, 'speedbump']
# df_feature = df.loc[:, ('Speed', 'X', 'Y', 'Z')]
# Y = df_label.as_matrix()
# X = df_feature.as_matrix()
#
#
# # Prepare for cross-validation
# clf = DecisionTreeClassifier(random_state=0)  # create a DecisionTreeClassifier
# f1_scores = []  # sum of F1 scores
# cv = 100  # number of cross-validations
Example #15
def _demo_validate_data():
    dim_x = 75
    num_x_p = 500
    num_x_n = 500

    num_ch = 20

    x_p_train = np.asarray(
        [np.random.randn(num_x_p, dim_x) for i in range(num_ch)])
    x_n_train = np.array(
        [np.random.randn(num_x_p, dim_x) for i in range(num_ch)])
    y_p_train = [1] * num_x_p
    y_n_train = [0] * num_x_n

    x_train = np.concatenate((x_n_train, x_p_train), axis=1)
    y_train = np.concatenate((y_n_train, y_p_train), axis=0)

    permutation = np.random.permutation(x_train.shape[1])
    x_train = x_train[:, permutation, :]
    y_train = y_train[permutation]

    model = train_pca_rda_kde_model(x_train, y_train, k_folds=10)

    fig = plt.figure()
    ax = fig.add_subplot(211)
    x_plot = np.linspace(np.min(model.line_el[-1]), np.max(model.line_el[-1]),
                         1000)[:, np.newaxis]
    ax.plot(model.line_el[2][y_train == 0],
            -0.005 -
            0.01 * np.random.random(model.line_el[2][y_train == 0].shape[0]),
            'ro',
            label='class(-)')
    ax.plot(model.line_el[2][y_train == 1],
            -0.005 -
            0.01 * np.random.random(model.line_el[2][y_train == 1].shape[0]),
            'go',
            label='class(+)')
    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = model.pipeline[2].list_den_est[idx].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                'r-' * (idx == 0) + 'g-' * (idx == 1),
                linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')

    # Test
    x_p_test = np.asarray(
        [np.random.randn(num_x_p, dim_x) for i in range(num_ch)])
    x_n_test = np.array(
        [np.random.randn(num_x_p, dim_x) for i in range(num_ch)])
    y_p_test = [1] * num_x_p
    y_n_test = [0] * num_x_n

    x_test = np.concatenate((x_n_test, x_p_test), axis=1)
    y_test = np.concatenate((y_n_test, y_p_test), axis=0)

    permutation = np.random.permutation(x_test.shape[1])
    x_test = x_test[:, permutation, :]
    y_test = y_test[permutation]

    model.transform(x_test)

    ax.plot(model.line_el[2][y_test == 0],
            -0.01 -
            0.01 * np.random.random(model.line_el[2][y_test == 0].shape[0]),
            'bo',
            label='t_class(-)')
    ax.plot(model.line_el[2][y_test == 1],
            -0.01 -
            0.01 * np.random.random(model.line_el[2][y_test == 1].shape[0]),
            'ko',
            label='t_class(+)')

    # Rule-of-thumb KDE bandwidth (Silverman): 1.06 * min(std, IQR/1.34) * n**(-1/5)
    bandwidth = 1.06 * min(np.std(model.line_el[2]),
                           iqr(model.line_el[2]) / 1.34) * np.power(
                               model.line_el[2].shape[0], -0.2)
    test_kde = KernelDensityEstimate(bandwidth=bandwidth)
    test_kde.fit(model.line_el[2], y_test)

    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = test_kde.list_den_est[idx].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                'b--' * (idx == 0) + 'k--' * (idx == 1),
                linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')
    plt.show()
Example #16
def Slice_Profile(IMG, results, options):
    """Extract a very basic SB profile along a line.

    A line of pixels can be identified by the user in image
    coordinates to extract an SB profile. Primarily intended for
    diagnostic purposes, this allows users to see very specific
    pixels. While this tool can be used for examining the disk
    structure (such as for edge on galaxies), users will likely prefer
    the more powerful
    :func:`~pipeline_steps.Axial_Profiles.Axial_Profiles` and
    :func:`~pipeline_steps.Radial_Profiles.Radial_Profiles` methods
    for such analysis.

    Parameters
    -----------------
    ap_slice_anchor : dict, default None
      Coordinates for the starting point of the slice as a dictionary
      formatted "{'x': x-coord, 'y': y-coord}" in pixel units.

    ap_slice_pa : float, default None
      Position angle of the slice in degrees, counter-clockwise
      relative to the x-axis.

    ap_slice_length : float, default None
      Length of the slice from anchor point in pixel units. By
      default, use init ellipse semi-major axis length

    ap_slice_width : float, default 10
      Width of the slice in pixel units.

    ap_slice_step : float, default None
      Distance between samples for the profile along the
      slice. By default use the PSF.

    ap_isoaverage_method : string, default 'median'
      Select the method used to compute the average flux along an
      isophote. Choose from 'mean', 'median', and 'mode'.  In general,
      median is fast and robust to a few outliers. Mode is slow but
      robust to more outliers. Mean is fast and accurate in low S/N
      regimes where fluxes take on near integer values, but not robust
      to outliers. The mean should be used along with a mask to remove
      spurious objects such as foreground stars or galaxies, and
      should always be used with caution.

    ap_saveto : string, default None
      Directory in which to save profile

    ap_name : string, default None
      Name of the current galaxy, used for making filenames.

    ap_zeropoint : float, default 22.5
      Photometric zero point. For converting flux to mag units.

    Notes
    ----------
    :References:
    - 'background' (optional)
    - 'background noise' (optional)
    - 'center' (optional)
    - 'init R' (optional)
    - 'init pa' (optional)

    Returns
    -------
    IMG : ndarray
      Unaltered galaxy image

    results : dict
      .. code-block:: python

        {}

    """

    dat = IMG - (results["background"] if "background" in results else np.median(IMG))
    zeropoint = options["ap_zeropoint"] if "ap_zeropoint" in options else 22.5

    use_anchor = (
        results["center"]
        if "center" in results
        else {"x": IMG.shape[1] / 2, "y": IMG.shape[0] / 2}
    )
    if "ap_slice_anchor" in options:
        use_anchor = options["ap_slice_anchor"]
    else:
        logging.warning(
            "%s: ap_slice_anchor not specified by user, using: %s"
            % (options["ap_name"], str(use_anchor))
        )

    use_pa = results["init pa"] if "init pa" in results else 0.0
    if "ap_slice_pa" in options:
        use_pa = options["ap_slice_pa"] * np.pi / 180
    else:
        logging.warning(
            "%s: ap_slice_pa not specified by user, using: %.2f"
            % (options["ap_name"], use_pa)
        )

    use_length = results["init R"] if "init R" in results else min(IMG.shape)
    if "ap_slice_length" in options:
        use_length = options["ap_slice_length"]
    else:
        logging.warning(
            "%s: ap_slice_length not specified by user, using: %.2f"
            % (options["ap_name"], use_length)
        )

    use_width = 10.0
    if "ap_slice_width" in options:
        use_width = options["ap_slice_width"]
    else:
        logging.warning(
            "%s: ap_slice_width not specified by user, using: %.2f"
            % (options["ap_name"], use_width)
        )

    use_step = (
        results["psf fwhm"] if "psf fwhm" in results else max(2.0, use_length / 100)
    )
    if "ap_slice_step" in options:
        use_step = options["ap_slice_step"]
    else:
        logging.warning(
            "%s: ap_slice_step not specified by user, using: %.2f"
            % (options["ap_name"], use_step)
        )

    F, X = _iso_line(dat, use_length, use_width, use_pa, use_anchor, more=False)

    windows = np.arange(0, use_length, use_step)

    R = (windows[1:] + windows[:-1]) / 2
    sb = []
    sb_e = []
    sb_sclip = []
    sb_sclip_e = []
    for i in range(len(windows) - 1):
        isovals = F[np.logical_and(X >= windows[i], X < windows[i + 1])]
        isovals_sclip = Sigma_Clip_Upper(isovals, iterations=10, nsigma=5)

        medflux = _average(
            isovals,
            options["ap_isoaverage_method"]
            if "ap_isoaverage_method" in options
            else "median",
        )
        scatflux = _scatter(
            isovals,
            options["ap_isoaverage_method"]
            if "ap_isoaverage_method" in options
            else "median",
        )
        medflux_sclip = _average(
            isovals_sclip,
            options["ap_isoaverage_method"]
            if "ap_isoaverage_method" in options
            else "median",
        )
        scatflux_sclip = _scatter(
            isovals_sclip,
            options["ap_isoaverage_method"]
            if "ap_isoaverage_method" in options
            else "median",
        )

        sb.append(
            flux_to_sb(medflux, options["ap_pixscale"], zeropoint)
            if medflux > 0
            else 99.999
        )
        sb_e.append(
            (2.5 * scatflux / (np.sqrt(len(isovals)) * medflux * np.log(10)))
            if medflux > 0
            else 99.999
        )
        sb_sclip.append(
            flux_to_sb(medflux_sclip, options["ap_pixscale"], zeropoint)
            if medflux_sclip > 0
            else 99.999
        )
        sb_sclip_e.append(
            (
                2.5
                * scatflux_sclip
                / (np.sqrt(len(isovals)) * medflux_sclip * np.log(10))
            )
            if medflux_sclip > 0
            else 99.999
        )

    with open(
        "%s%s_slice_profile.prof"
        % (
            (options["ap_saveto"] if "ap_saveto" in options else ""),
            options["ap_name"],
        ),
        "w",
    ) as f:
        f.write(
            "# flux sum: %f\n" % (np.sum(F[np.logical_and(X >= 0, X <= use_length)]))
        )
        f.write(
            "# flux mean: %f\n"
            % (_average(F[np.logical_and(X >= 0, X <= use_length)], "mean"))
        )
        f.write(
            "# flux median: %f\n"
            % (_average(F[np.logical_and(X >= 0, X <= use_length)], "median"))
        )
        f.write(
            "# flux mode: %f\n"
            % (_average(F[np.logical_and(X >= 0, X <= use_length)], "mode"))
        )
        f.write(
            "# flux std: %f\n" % (np.std(F[np.logical_and(X >= 0, X <= use_length)]))
        )
        f.write(
            "# flux 16-84%% range: %f\n"
            % (iqr(F[np.logical_and(X >= 0, X <= use_length)], rng=[16, 84]))
        )
        f.write("R,sb,sb_e,sb_sclip,sb_sclip_e\n")
        f.write("arcsec,mag*arcsec^-2,mag*arcsec^-2,mag*arcsec^-2,mag*arcsec^-2\n")
        for i in range(len(R)):
            f.write(
                "%.4f,%.4f,%.4f,%.4f,%.4f\n"
                % (
                    R[i] * options["ap_pixscale"],
                    sb[i],
                    sb_e[i],
                    sb_sclip[i],
                    sb_sclip_e[i],
                )
            )

    if "ap_doplot" in options and options["ap_doplot"]:
        CHOOSE = np.array(sb_e) < 0.5
        plt.errorbar(
            np.array(R)[CHOOSE] * options["ap_pixscale"],
            np.array(sb)[CHOOSE],
            yerr=np.array(sb_e)[CHOOSE],
            elinewidth=1,
            linewidth=0,
            marker=".",
            markersize=3,
            color="r",
        )
        plt.xlabel("Position on line [arcsec]", fontsize=16)
        plt.ylabel("Surface Brightness [mag arcsec$^{-2}$]", fontsize=16)
        if "background noise" in results:
            bkgrdnoise = (
                -2.5 * np.log10(results["background noise"])
                + zeropoint
                + 2.5 * np.log10(options["ap_pixscale"] ** 2)
            )
            plt.axhline(
                bkgrdnoise,
                color="purple",
                linewidth=0.5,
                linestyle="--",
                label="1$\\sigma$ noise/pixel: %.1f mag arcsec$^{-2}$" % bkgrdnoise,
            )
        plt.gca().invert_yaxis()
        plt.legend(fontsize=15)
        plt.tick_params(labelsize=14)
        plt.tight_layout()
        if not ("ap_nologo" in options and options["ap_nologo"]):
            AddLogo(plt.gcf())
        plt.savefig(
            "%sslice_profile_%s.jpg"
            % (
                options["ap_plotpath"] if "ap_plotpath" in options else "",
                options["ap_name"],
            ),
            dpi=options["ap_plotdpi"] if "ap_plotdpi" in options else 300,
        )
        plt.close()

        ranges = [
            [
                max(
                    0,
                    int(
                        use_anchor["x"]
                        + 0.5 * use_length * np.cos(use_pa)
                        - use_length * 0.7
                    ),
                ),
                min(
                    IMG.shape[1],
                    int(
                        use_anchor["x"]
                        + 0.5 * use_length * np.cos(use_pa)
                        + use_length * 0.7
                    ),
                ),
            ],
            [
                max(
                    0,
                    int(
                        use_anchor["y"]
                        + 0.5 * use_length * np.sin(use_pa)
                        - use_length * 0.7
                    ),
                ),
                min(
                    IMG.shape[0],
                    int(
                        use_anchor["y"]
                        + 0.5 * use_length * np.sin(use_pa)
                        + use_length * 0.7
                    ),
                ),
            ],
        ]
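        # Fall back to a robust noise estimate: half the central 68.27% spread of the
        # pixel values, which approximates 1 sigma for Gaussian noise.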
        LSBImage(
            dat[ranges[1][0] : ranges[1][1], ranges[0][0] : ranges[0][1]],
            results["background noise"]
            if "background noise" in results
            else iqr(dat, rng=(31.731 / 2, 100 - 31.731 / 2)) / 2,
        )

        XX, YY = np.meshgrid(
            np.arange(ranges[0][1] - ranges[0][0], dtype=float),
            np.arange(ranges[1][1] - ranges[1][0], dtype=float),
        )
        XX -= use_anchor["x"] - float(ranges[0][0])
        YY -= use_anchor["y"] - float(ranges[1][0])
        XX, YY = (
            XX * np.cos(-use_pa) - YY * np.sin(-use_pa),
            XX * np.sin(-use_pa) + YY * np.cos(-use_pa),
        )
        ZZ = np.ones(XX.shape)
        ZZ[
            np.logical_not(
                np.logical_and(
                    np.logical_and(YY <= use_width / 2, YY >= -use_width / 2),
                    np.logical_and(XX >= 0, XX <= use_length),
                )
            )
        ] = np.nan
        plt.imshow(ZZ, origin="lower", cmap="Reds_r", alpha=0.6)
        plt.tight_layout()
        if not ("ap_nologo" in options and options["ap_nologo"]):
            AddLogo(plt.gcf())
        plt.savefig(
            "%sslice_profile_window_%s.jpg"
            % (
                options["ap_plotpath"] if "ap_plotpath" in options else "",
                options["ap_name"],
            ),
            dpi=options["ap_plotdpi"] if "ap_plotdpi" in options else 300,
        )
        plt.close()

    return IMG, {}
Example #17
def _demo_validate_real_data():
    ds_rate = 2
    channel_map = [1] * 16 + [0, 0, 1, 1, 0, 1, 1, 1, 0]
    data_train_folder = load_experimental_data()

    mode = 'calibration'

    raw_dat, stamp_time, channels, type_amp, fs = read_data_csv(
        data_train_folder + '/rawdata.csv')

    dat = sig_pro(raw_dat, fs=fs, k=ds_rate)

    # Get data and labels
    s_i, t_t_i, t_i = trigger_decoder(mode=mode,
                                      trigger_loc=data_train_folder +
                                      '/triggers.txt')
    x_train, y_train, num_seq, _ = trial_reshaper(t_t_i,
                                                  t_i,
                                                  dat,
                                                  mode=mode,
                                                  fs=fs,
                                                  k=ds_rate,
                                                  channel_map=channel_map)

    model = train_pca_rda_kde_model(x_train, y_train, k_folds=10)

    fig = plt.figure()
    ax = fig.add_subplot(211)
    x_plot = np.linspace(np.min(model.line_el[-1]), np.max(model.line_el[-1]),
                         1000)[:, np.newaxis]
    ax.plot(model.line_el[2][y_train == 0],
            -0.005 -
            0.01 * np.random.random(model.line_el[2][y_train == 0].shape[0]),
            'ro',
            label='class(-)')
    ax.plot(model.line_el[2][y_train == 1],
            -0.005 -
            0.01 * np.random.random(model.line_el[2][y_train == 1].shape[0]),
            'go',
            label='class(+)')
    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = model.pipeline[2].list_den_est[idx].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                'r-' * (idx == 0) + 'g-' * (idx == 1),
                linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')

    # Test
    data_test_folder = load_experimental_data()

    mode = 'calibration'

    raw_dat, stamp_time, channels, type_amp, fs = read_data_csv(
        data_test_folder + '/rawdata.csv')
    dat = sig_pro(raw_dat, fs=fs, k=ds_rate)

    # Get data and labels
    s_i, t_t_i, t_i = trigger_decoder(mode=mode,
                                      trigger_loc=data_test_folder +
                                      '/triggers.txt')
    x_test, y_test, num_seq, _ = trial_reshaper(t_t_i,
                                                t_i,
                                                dat,
                                                mode=mode,
                                                fs=fs,
                                                k=ds_rate,
                                                channel_map=channel_map)

    model.transform(x_test)

    ax.plot(model.line_el[2][y_test == 0],
            -0.01 -
            0.01 * np.random.random(model.line_el[2][y_test == 0].shape[0]),
            'bo',
            label='t_class(-)')
    ax.plot(model.line_el[2][y_test == 1],
            -0.01 -
            0.01 * np.random.random(model.line_el[2][y_test == 1].shape[0]),
            'ko',
            label='t_class(+)')

    # Rule-of-thumb KDE bandwidth (Silverman): 1.06 * min(std, IQR/1.34) * n**(-1/5)
    bandwidth = 1.06 * min(np.std(model.line_el[2]),
                           iqr(model.line_el[2]) / 1.34) * np.power(
                               model.line_el[2].shape[0], -0.2)
    test_kde = KernelDensityEstimate(bandwidth=bandwidth)
    test_kde.fit(model.line_el[2], y_test)

    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = test_kde.list_den_est[idx].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                'b--' * (idx == 0) + 'k--' * (idx == 1),
                linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')

    plt.show()
Example #18
fSports = []
fMusic = []
fGames = []

fBSD = []
fRSD = []
fTSD = []
fThSD = []

fRN = []
fBN = []
fTN = []
fThN = []

for i in range(0, len(RA)):
    if BA[i] < np.percentile(BA, 75) + 1.5 * iqr(BA) and BA[i] > np.percentile(
            BA, 25) - 1.5 * iqr(BA) and NSD(0)[i] < np.percentile(
                NSD(0), 75) + 1.5 * iqr(NSD(0)) and NSD(0)[i] > np.percentile(
                    NSD(0), 25) - 1.5 * iqr(NSD(0)) and RA[i] < np.percentile(
                        RA, 75) + 1.5 * iqr(RA) and RA[i] > np.percentile(
                            RA, 25
                        ) - 1.5 * iqr(RA) and NSD(1)[i] < np.percentile(
                            NSD(1), 75
                        ) + 1.5 * iqr(NSD(1)) and NSD(1)[i] > np.percentile(
                            NSD(1), 25
                        ) - 1.5 * iqr(NSD(1)) and TA[i] < np.percentile(
                            TA, 75) + 1.5 * iqr(TA) and TA[i] > np.percentile(
                                TA, 25
                            ) - 1.5 * iqr(TA) and NSD(2)[i] < np.percentile(
                                NSD(2), 75) + 1.5 * iqr(NSD(2)) and NSD(
                                    2
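The (truncated) chained condition above keeps row i only when each series stays inside its Tukey fences, i.e. between Q1 - 1.5*IQR and Q3 + 1.5*IQR. A compact sketch of that per-series check (function and variable names are illustrative):

import numpy as np
from scipy.stats import iqr

def within_fences(series, value, k=1.5):
    # True when value lies inside [Q1 - k*IQR, Q3 + k*IQR] of series.
    q1, q3 = np.percentile(series, [25, 75])
    spread = iqr(series)
    return (q1 - k * spread) < value < (q3 + k * spread)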
Example #19
import numpy as np
from scipy.stats import iqr
from sklearn.preprocessing import RobustScaler

# 1.1 'd' has an outlier
d = np.array([2, 4, 6, 8, 10, 12, 14, 16, 40])
d = d.reshape(-1, 1)
d

# 1.2 Transform using RobustScaler
rs = RobustScaler()
print("\n1.0 RobustScaler result:\n\n", rs.fit_transform(d))
print()

# 1.3 Calculate manually:
MEDIAN = np.median(d)
IQR = iqr(d)
print("2.0 Manual calculations result\n\n",
      (d - MEDIAN) / IQR)  # Same result as with RobustScaler
print("Both the above results are same.")
print("===============================")

############
print("\nNext, remove outlier and see results")
# 2.0 Remove outlier and see results
d1 = np.array([2, 4, 6, 8, 10, 12, 14, 16])
d1 = d1.reshape(-1, 1)
d1

# 2.1 Transform using RobustScaler
rs = RobustScaler()
print("\nRobustScaler result with outlier removed:\n", rs.fit_transform(d1))
Example #20
        if v.mean().values[0] > best:
            best = v.mean().values[0]
            abest = (k, v['Harmonic mean'])
    for k, v in __.groupby('Algorithms'):
        # print(abest[0])
        if k != abest[0]:
            from scipy import stats

            s, p = stats.ttest_ind(abest[1], v['Harmonic mean'].values)
            if (p <= 0.05):
                print("*T-Test:", abest[0], k, s, p)
            else:
                print("T-Test:", abest[0], k, s, p)
from scipy.stats import ranksums as kruskal  # NB: this binds the Wilcoxon rank-sum test, not Kruskal-Wallis
from scipy.stats import iqr
plot(dfh, 'Harmonic mean', xcl)
for _, __ in dfh.groupby(['DataSet']):
    print(_)
    for k1, v1 in __.groupby('Algorithms'):
        print(k1, 'mdn:%0.2f' % v1['Harmonic mean'].median(),
              'iqr:%0.2f' % iqr(v1['Harmonic mean']))
        for k, v in __.groupby('Algorithms'):
            s, p = kruskal(v1['Harmonic mean'], v['Harmonic mean'])
            print('mdn:%0.2f' % v['Harmonic mean'].median(),
                  'iqr:%0.2f' % iqr(v['Harmonic mean']), 'p:%0.5f' % p,
                  's:%0.5f' % s, _, k, k1)
for _, __ in dfh.groupby(['DataSet']):
    print(_)
    for k1, v1 in __.groupby('Algorithms'):
        print(k1, 'mdn:%0.2f' % v1['Harmonic mean'].median(),
              'iqr:%0.2f' % iqr(v1['Harmonic mean']))
    def __getitem__(self, idx):
        idx = idx % self.img_fn_len
        
        #img_fn = self.img_fns[idx]
        #img = Image.open(img_fn)
        bad_img_counts = 0
        while True:
            try:
                img_fn = self.img_fns[idx]
                img = Image.open(img_fn)
                img_np = np.asarray(img)
                if img_np is None or np.prod(list(img_np.shape)) == 0:
                    idx += 1
                    idx = idx % self.img_fn_len
                    bad_img_counts += 1
                    print("Bad data1: {}\n".format(img_fn))
                    continue
                if bad_img_counts > 100:
                    break

                break
            except Exception:
                idx += 1
                bad_img_counts += 1
                idx = idx % self.img_fn_len
                print("Bad data2: {}\n".format(img_fn))
                if bad_img_counts > 100:
                    break

        img = img.resize((1024,512))
        width, height = img.size
        gray = np.asarray(img.convert('L'))
        img = transforms.functional.to_tensor(img)
        bg_image = img.detach().clone()

        
        p_neg = np.random.rand() > 0.05  # True in ~95% of samples: paste a random overlay (foreground) onto the image
        og_overlay_image = None
        if p_neg:
            if self.debug_train:
                overlay_fn = self.overlay_fns[self.rn]
            else:
                overlay_fn = self.overlay_fns[randint(0, self.n_overlays-1)]
            overlay_img = Image.open(overlay_fn).convert('L')
            og_overlay_image = transforms.functional.to_tensor(overlay_img)
            spatial_augs = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomAffine(degrees=60, translate=(.3, .1), scale=(.25, .5), fillcolor=(255)),
                transforms.Resize((height, width)),
            ] if not self.debug_train else [transforms.Resize((height, width))])

            overlay_img = spatial_augs(overlay_img)
            overlay_img = 1.0 - transforms.functional.to_tensor(overlay_img)

            # create mask according to iqr and add color jitter
            iqr_val = iqr(overlay_img[overlay_img > 0])
            mask = (overlay_img >= iqr_val).squeeze().float()

            overlay_img = transforms.functional.to_pil_image(overlay_img)

            # cj = transforms.ColorJitter(brightness=(0.8, 1.2), saturation=(0.8, 1.0))
            # overlay_img = cj(overlay_img)
            
            mean_gray = norm(gray)
            
            spatial_augs2 = transforms.Compose([
                transforms.Grayscale(),
                transforms.ToTensor()
            ])

            ### DEBUG
            overlay_img = spatial_augs2(overlay_img).squeeze().numpy()
            mult = np.power(mean_gray, 0.2) #np.ones_like(mean_gray) 
            final_img2 = transforms.functional.to_tensor(np.multiply(mult, overlay_img)).float()
            ### DEBUG
            
            img[:, mask==1] = final_img2[0, mask==1]
        else:
            mask = torch.zeros_like(img[0, :, :])
            
        img_spatial_augs = transforms.Compose([
            transforms.ToPILImage(),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomAffine(degrees=0, translate=(0.01, 0.01)),
            transforms.CenterCrop((height-40, width-60)),
            transforms.Resize((height, width)),
        ] if not self.debug_train else [transforms.CenterCrop((height-40, width-60)),
                                        transforms.Resize((height, width))])

        # use same random seed for both augmentations
        seed = np.random.randint(2147483647)

        random.seed(seed)
        torch.manual_seed(seed=seed)
        img = img_spatial_augs(img)

        random.seed(seed)
        torch.manual_seed(seed=seed)
        mask = img_spatial_augs(mask)

        img_color_augs = transforms.Compose([
            transforms.ColorJitter(brightness=(0.7, 1.3), saturation=(0.7, 1.2), contrast=(0.8, 1.2)),
            transforms.ToTensor()
        ] if not self.debug_train else [transforms.ToTensor()])

        img = img_color_augs(img)
        mask = transforms.functional.to_tensor(mask)
        #if img is None or mask is None or og_overlay_image is None or idx is None:
            #print("image is None or mask is none")
        output_dict = {
            "idx": idx,
            "input_img": img,
            "target_mask": mask,
            "bg": bg_image,
            "fg": og_overlay_image
        } 
        return output_dict
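# The re-seeding of `random` and `torch` before each call above is what keeps the
# image and its mask spatially aligned: both passes through the transform draw the
# same random decision. A minimal sketch of that pattern in isolation (tensor
# shapes and the transform used here are illustrative assumptions):
import random
import numpy as np
import torch
from torchvision import transforms

spatial = transforms.RandomHorizontalFlip(p=0.5)
img = torch.rand(3, 64, 64)                       # stand-in image
mask = (torch.rand(1, 64, 64) > 0.5).float()      # stand-in binary mask

seed = np.random.randint(2147483647)
random.seed(seed)
torch.manual_seed(seed)
img_aug = spatial(img)

random.seed(seed)
torch.manual_seed(seed)                           # same seed -> same flip decision
mask_aug = spatial(mask)

print("image flipped:", not torch.equal(img, img_aug),
      "| mask flipped:", not torch.equal(mask, mask_aug))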
Example #22
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

#%% data import
with open('population_countries.csv', 'r', encoding="utf-8",
          errors="ignore") as fin:
    datarray = listmaker(fin)

#%% Calculating Descriptive Stats

pop_arith_mean = np.mean(datarray)
pop_geo_mean = stats.gmean(datarray)
pop_median = np.median(datarray)
pop_mode = stats.mode(datarray)
pop_std = np.std(datarray)
pop_iqr = stats.iqr(datarray)
pop_skew = stats.skew(datarray)
pop_num_miss = sum(np.isnan(datarray))

#%% Plotting Other Stats

binlist = np.logspace(np.log10(1000), np.log10(20000000000), 20)

plt.xkcd()
_ = plt.hist(datarray, bins=binlist)
_ = plt.xscale("log")
_ = plt.xlabel("Number of Inhabitants")
plt.show()

_ = plt.boxplot(datarray)
_ = plt.yscale("log")
    x = slider1.slide()
    fft_x = abs(np.fft.rfft(x))
    n = fft_x.size

    #sample_rate = nbrOfSample/recordingTime
    fft_x_freq = np.fft.rfftfreq(x.size, d=1. / sample_rate)

    #Calculate values to cvs-file
    meanx = st.mean(fft_x)
    mad1x = pd.Series(fft_x)
    madx = mad1x.mad()
    maxx = max(fft_x)
    #print("max", maxx)
    minx = min(fft_x)
    stdx = st.stdev(fft_x)
    iqx = iqr(fft_x)
    #Calculate signal entropy
    sx = pd.Series(fft_x)
    vectorx = (sx.groupby(sx).transform('count') / len(sx)).values
    entrox = entropy(vectorx)
    #Calculate signal energy
    resx = sum(map(lambda i: i * i, fft_x))
    energyx = resx / (fft_x.size)

    #Calculate SMA
    integralx = np.trapz(fft_x, dx=timediff)
    t = timediff * (fft_x.size)
    SMA = (1 / t) * (integralx)

    #skewness & kurtosis
    sk = skew(fft_x)
    def create_feature_array(self, current_epoch):
        xData = current_epoch['X'].to_numpy(
        )  #self.plot_signal(xData, 'xData')
        yData = current_epoch['Y'].to_numpy()
        zData = current_epoch['Z'].to_numpy()

        # Filter the data
        fc = 5  # Cut-off frequency of the filter
        w = fc / (20 / 2)  # Normalize the frequency
        b, a = signal.butter(5, w, 'low')
        xDataFilt = signal.filtfilt(b, a, xData)
        yDataFilt = signal.filtfilt(b, a, yData)
        zDataFilt = signal.filtfilt(b, a, zData)

        # Calculate vector magnitude
        vmData = np.sqrt(xDataFilt**2 + yDataFilt**2 + zDataFilt**2)

        feature_array = []
        # Average value in signal buffer for all acceleration components
        feature_array = np.append(feature_array, np.mean(xDataFilt))
        feature_array = np.append(feature_array, np.mean(yDataFilt))
        feature_array = np.append(feature_array, np.mean(zDataFilt))
        feature_array = np.append(feature_array, np.mean(vmData))
        # Standard deviation
        feature_array = np.append(feature_array, np.std(xDataFilt))
        feature_array = np.append(feature_array, np.std(yDataFilt))
        feature_array = np.append(feature_array, np.std(zDataFilt))
        feature_array = np.append(feature_array, np.std(vmData))
        # Median absolute deviation
        feature_array = np.append(feature_array,
                                  stats.median_absolute_deviation(xDataFilt))
        feature_array = np.append(feature_array,
                                  stats.median_absolute_deviation(yDataFilt))
        feature_array = np.append(feature_array,
                                  stats.median_absolute_deviation(zDataFilt))
        feature_array = np.append(feature_array,
                                  stats.median_absolute_deviation(vmData))
        # Maximum sample
        feature_array = np.append(feature_array, np.max(xDataFilt))
        feature_array = np.append(feature_array, np.max(yDataFilt))
        feature_array = np.append(feature_array, np.max(zDataFilt))
        feature_array = np.append(feature_array, np.max(vmData))
        # Minimum sample
        feature_array = np.append(feature_array, np.min(xDataFilt))
        feature_array = np.append(feature_array, np.min(yDataFilt))
        feature_array = np.append(feature_array, np.min(zDataFilt))
        feature_array = np.append(feature_array, np.min(vmData))
        # Signal magnitude area
        feature_array = np.append(feature_array, np.trapz(xDataFilt))
        feature_array = np.append(feature_array, np.trapz(yDataFilt))
        feature_array = np.append(feature_array, np.trapz(zDataFilt))
        feature_array = np.append(feature_array, np.trapz(vmData))
        # Energy measure
        energy = np.sum(xDataFilt**2) / len(xDataFilt)
        feature_array = np.append(feature_array, energy)
        energy = np.sum(yDataFilt**2) / len(yDataFilt)
        feature_array = np.append(feature_array, energy)
        energy = np.sum(zDataFilt**2) / len(zDataFilt)
        feature_array = np.append(feature_array, energy)
        energy = np.sum(vmData**2) / len(vmData)
        feature_array = np.append(feature_array, energy)
        # Inter-quartile range
        feature_array = np.append(feature_array, stats.iqr(xDataFilt, axis=0))
        feature_array = np.append(feature_array, stats.iqr(yDataFilt, axis=0))
        feature_array = np.append(feature_array, stats.iqr(zDataFilt, axis=0))
        feature_array = np.append(feature_array, stats.iqr(vmData, axis=0))
        # Autocorrelation features for all three acceleration components (3 each): height of main peak; height and position of second peak - Not sure this is right?
        autocorrelation = np.correlate(xDataFilt, xDataFilt, mode='full')
        autocorrelation = autocorrelation[len(xDataFilt) - 1:][0]
        feature_array = np.append(feature_array, autocorrelation)

        autocorrelation = np.correlate(yDataFilt, yDataFilt, mode='full')
        autocorrelation = autocorrelation[len(yDataFilt) - 1:][0]
        feature_array = np.append(feature_array, autocorrelation)

        autocorrelation = np.correlate(zDataFilt, zDataFilt, mode='full')
        autocorrelation = autocorrelation[len(zDataFilt) - 1:][0]
        feature_array = np.append(feature_array, autocorrelation)

        autocorrelation = np.correlate(vmData, vmData, mode='full')
        autocorrelation = autocorrelation[len(vmData) - 1:][0]
        feature_array = np.append(feature_array, autocorrelation)
        # Spectral peak features (12 each): height and position of first 6 peaks
        f, p = signal.periodogram(xDataFilt, 20e0)
        sort_index = np.argsort(p)
        p_sorted = p[sort_index]
        f_sorted = f[sort_index]
        speak_feats = p_sorted[-6:]
        speak_feats2 = f_sorted[-6:]
        feature_array = np.append(feature_array, speak_feats)
        feature_array = np.append(feature_array, speak_feats2)

        f, p = signal.periodogram(yDataFilt, 20e0)
        sort_index = np.argsort(p)
        p_sorted = p[sort_index]
        f_sorted = f[sort_index]
        speak_feats = p_sorted[-6:]
        speak_feats2 = f_sorted[-6:]
        feature_array = np.append(feature_array, speak_feats)
        feature_array = np.append(feature_array, speak_feats2)

        f, p = signal.periodogram(zDataFilt, 20e0)
        sort_index = np.argsort(p)
        p_sorted = p[sort_index]
        f_sorted = f[sort_index]
        speak_feats = p_sorted[-6:]
        speak_feats2 = f_sorted[-6:]
        feature_array = np.append(feature_array, speak_feats)
        feature_array = np.append(feature_array, speak_feats2)

        f, p = signal.periodogram(vmData, 20e0)
        sort_index = np.argsort(p)
        p_sorted = p[sort_index]
        f_sorted = f[sort_index]
        speak_feats = p_sorted[-6:]
        speak_feats2 = f_sorted[-6:]
        feature_array = np.append(feature_array, speak_feats)
        feature_array = np.append(feature_array, speak_feats2)
        # Spectral power features (4 each): total power in 4 adjacent and pre-defined frequency bands
        edges = [0.5, 1.5, 5, 7.5, 10]
        n_feats = len(edges) - 1

        spower_feats = []
        f, p = signal.periodogram(xDataFilt, 20e0)
        for i in range(n_feats):
            mask = (f >= edges[i]) & (f <= edges[i + 1])
            spower_feats = np.append(spower_feats, sum(p[mask]))
        feature_array = np.append(feature_array, spower_feats)

        spower_feats = []
        f, p = signal.periodogram(yDataFilt, 20e0)
        for i in range(n_feats):
            mask = (f >= edges[i]) & (f <= edges[i + 1])
            spower_feats = np.append(spower_feats, sum(p[mask]))
        feature_array = np.append(feature_array, spower_feats)

        spower_feats = []
        f, p = signal.periodogram(zDataFilt, 20e0)
        for i in range(n_feats):
            mask = (f >= edges[i]) & (f <= edges[i + 1])
            spower_feats = np.append(spower_feats, sum(p[mask]))
        feature_array = np.append(feature_array, spower_feats)

        spower_feats = []
        f, p = signal.periodogram(vmData, 20e0)
        for i in range(n_feats):
            mask = (f >= edges[i]) & (f <= edges[i + 1])
            spower_feats = np.append(spower_feats, sum(p[mask]))
        feature_array = np.append(feature_array, spower_feats)
        return feature_array
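# The four spectral-power sections above repeat the same periodogram-plus-band-mask
# computation. A hedged sketch of that pattern factored into a helper (the band
# edges and 20 Hz sampling rate are taken from the code above; the test signal is
# synthetic):
import numpy as np
from scipy import signal

def band_powers(sig, fs=20.0, edges=(0.5, 1.5, 5, 7.5, 10)):
    # Total periodogram power inside each adjacent frequency band.
    f, p = signal.periodogram(sig, fs)
    return [p[(f >= lo) & (f <= hi)].sum() for lo, hi in zip(edges[:-1], edges[1:])]

t = np.arange(0, 10, 1 / 20.0)
tone = np.sin(2 * np.pi * 6.0 * t)   # 6 Hz tone
print(band_powers(tone))             # power concentrated in the 5-7.5 Hz band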
Example #25
  plt.title("error between forwardly solved y-field from prediction and true input y-field for the 20th battery in test set with distance 1.59e"+str(key))
  plt.savefig(str(dist)+"/test_20_yfield_diff.png")

  plt.figure()
  imshow_center(np.squeeze(X_test[0,:,:,1])-zf)
  plt.title("error between forwardly solved z-field from prediction and true input y-field for the 20th battery in test set with distance 1.59e"+str(key))
  plt.savefig(str(dist)+"/test_20_zfield_diff.png")

  final_loss = custom_loss_rmse(test_labels_t2b, y_pred_ht2)
  print('final RMSE loss on test set:', final_loss.numpy())
  NRMSE = final_loss/K.mean(test_labels_t2b)
  print('final normalized RMSE loss (div mean) on the test set:', NRMSE.numpy())
  RMSE_range = final_loss /(tf.reduce_max(test_labels_t2b) - tf.reduce_min(test_labels_t2b))
  print('final normalized RMSE loss (div range) on the test set:', RMSE_range.numpy())
  test_arr = tf.keras.backend.flatten(test_labels_t2b).numpy()
  IQR = stats.iqr(test_arr)
  RMSE_IQR = final_loss/IQR
  print('final normalized RMSE loss (div IQR) on the test set:', RMSE_IQR.numpy())
  print('final norm of the difference tensor:', tf.norm(y_pred_ht2-test_labels_t2b).numpy())
  Boll_NRMSE = tf.norm(y_pred_ht2-test_labels_t2b) / tf.norm(test_labels_t2b)
  print('final Bollman normalized RMSE loss on the test set:', Boll_NRMSE.numpy())  

  specs_dict['final_RMSE'] = final_loss.numpy()
  specs_dict['NRMSE'] = NRMSE.numpy()
  specs_dict['RMSE_range'] = RMSE_range.numpy()
  specs_dict['RMSE_IQR'] = RMSE_IQR.numpy()
  specs_dict['Boll_NRMSE'] = Boll_NRMSE.numpy()

  result_dict[key] = specs_dict

  print(100*'|')
Example #26
def get_linguistic_metadata(wsdir, master_anno, idno_file):
    #print(wsdir + master_anno + idno_file)
    root_document = etree.parse(wsdir + master_anno + idno_file).getroot()
    #print(len(root_document))
    specific_namespaces = {
        'tei': 'http://www.tei-c.org/ns/1.0',
        'xi': 'http://www.w3.org/2001/XInclude',
        'cligs': 'https://cligs.hypotheses.org/ns/cligs'
    }

    poss = [
        "conjunction", "determiner", "noun", "verb", "adverb", "adjective",
        "adposition", "punctuation", "pronoun", "date", "number",
        "interjection"
    ]

    ling_measures = "\n"

    types_values = root_document.xpath("//tei:w//text()",
                                       namespaces=specific_namespaces)
    ling_measures = ling_measures + '\n\t\t\t\t<measure unit="types">' + str(
        len(set(types_values))) + r'</measure>'
    ling_measures += "\n"

    tags = ["s", "w"]
    for tag in tags:
        #print(tag)

        tag_elements = root_document.xpath("//tei:" + tag,
                                           namespaces=specific_namespaces)

        ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's">' + str(
            len(tag_elements)) + r'</measure>'
        #print(len(tag_elements))

        len_tag = []
        amount_act_verbs_text = []

        for tag_element in tag_elements:
            len_tag.append(
                len(" ".join(
                    tag_element.xpath(".//text()",
                                      namespaces=specific_namespaces))))
            if tag == "s":
                amount_active_verbs = len(
                    tag_element.xpath("./tei:w[@cligs:ctag='VMI']",
                                      namespaces=specific_namespaces))
                amount_active_verbs += len(
                    tag_element.xpath("./tei:w[@cligs:ctag='VSI']",
                                      namespaces=specific_namespaces))
                amount_act_verbs_text.append(amount_active_verbs)

        len_tag = np.array(len_tag)

        ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's.num.mean">' + str(
            "%.2f" % round(len_tag.mean(), 2)) + r'</measure>'
        ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's.num.std">' + str(
            "%.2f" % round(len_tag.std(), 2)) + r'</measure>'
        ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's.num.median">' + str(
            "%.2f" % round(np.percentile(len_tag, q=50), 2)) + r'</measure>'
        ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + tag + r's.num.iqr">' + str(
            "%.2f" % round(stats.iqr(len_tag), 2)) + r'</measure>'

        if tag == "s":
            amount_act_verbs_text = np.array(amount_act_verbs_text)

            ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ss.active.verbs.mean">' + str(
                "%.2f" %
                round(amount_act_verbs_text.mean(), 2)) + r'</measure>'
            ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ss.active.verbs.std">' + str(
                "%.2f" % round(amount_act_verbs_text.std(), 2)) + r'</measure>'
            ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ss.active.verbs.median">' + str(
                "%.2f" % round(np.percentile(amount_act_verbs_text, q=50),
                               2)) + r'</measure>'
            ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ss.active.verbs.iqr">' + str(
                "%.2f" %
                round(stats.iqr(amount_act_verbs_text), 2)) + r'</measure>'

            ling_measures += "\n"

    ling_measures += "\n"

    for pos in poss:

        pos_value = str(
            len(
                root_document.xpath("//tei:w[@pos='" + pos + "']",
                                    namespaces=specific_namespaces)))
        ling_measures = ling_measures + '\n\t\t\t\t<measure unit="' + pos + 's">' + pos_value + r'</measure>'

    ling_measures += "\n"

    nes = ["person", "organization", "location", "other"]
    for ne in nes:
        ne_value = str(
            len(
                root_document.xpath("//tei:w[@cligs:neclass='" + ne + "']",
                                    namespaces=specific_namespaces)))
        ling_measures = ling_measures + '\n\t\t\t\t<measure unit="ne.' + ne + 's">' + ne_value + r'</measure>'

    ling_measures += "\n"

    wnlexs = [
        'noun.plant', 'verb.communication', 'noun.food', 'verb.possession',
        'verb.cognition', 'noun.communication', 'noun.state', 'verb.stative',
        'noun.cognition', 'noun.time', 'verb.body', 'noun.person', 'adj.all',
        'noun.quantity', 'noun.phenomenon', 'verb.creation', 'adj.pert',
        'adv.all', 'noun.process', 'noun.artifact', 'verb.perception',
        'noun.feeling', 'verb.weather', 'noun.substance', 'noun.shape',
        'verb.competition', 'verb.motion', 'noun.animal', 'noun.act',
        'noun.body', 'noun.object', 'noun.motive', 'verb.social', 'noun.group',
        'verb.consumption', 'noun.possession', 'noun.Tops', 'noun.relation',
        'noun.attribute', 'verb.emotion', 'noun.location', 'noun.event',
        'verb.contact', 'xxx', 'verb.change'
    ]

    for wnlex in wnlexs:
        wnlex_value = str(
            len(
                root_document.xpath("//tei:w[@cligs:wnlex='" + wnlex + "']",
                                    namespaces=specific_namespaces)))
        ling_measures = ling_measures + '\n\t\t\t\t<measure unit="wnlex.' + wnlex + 's">' + wnlex_value + r'</measure>'

    return ling_measures
Example #27
def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
    """
    Compute the Epps-Singleton (ES) test statistic.

    Test the null hypothesis that two samples have the same underlying
    probability distribution.

    Parameters
    ----------
    x, y : array-like
        The two samples of observations to be tested. Input must not have more
        than one dimension. Samples can have different lengths.
    t : array-like, optional
        The points (t1, ..., tn) where the empirical characteristic function is
        to be evaluated. It should contain positive, distinct numbers. The default
        value (0.4, 0.8) is proposed in [1]_. Input must not have more than
        one dimension.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value based on the asymptotic chi2-distribution.

    See Also
    --------
    ks_2samp, anderson_ksamp

    Notes
    -----
    Testing whether two samples are generated by the same underlying
    distribution is a classical question in statistics. A widely used test is
    the Kolmogorov-Smirnov (KS) test which relies on the empirical
    distribution function. Epps and Singleton introduce a test based on the
    empirical characteristic function in [1]_.

    One advantage of the ES test compared to the KS test is that it does
    not assume a continuous distribution. In [1]_, the authors conclude
    that the test also has a higher power than the KS test in many
    examples. They recommend the use of the ES test for discrete samples as
    well as continuous samples with at least 25 observations each, whereas
    `anderson_ksamp` is recommended for smaller sample sizes in the
    continuous case.

    The p-value is computed from the asymptotic distribution of the test
    statistic which follows a `chi2` distribution. If the sample size of both
    `x` and `y` is below 25, the small sample correction proposed in [1]_ is
    applied to the test statistic.

    The default values of `t` are determined in [1]_ by considering
    various distributions and finding good values that lead to a high power
    of the test in general. Table III in [1]_ gives the optimal values for
    the distributions tested in that study. The values of `t` are scaled by
    the semi-interquartile range in the implementation, see [1]_.

    References
    ----------
    .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
       problem using the empirical characteristic function", Journal of
       Statistical Computation and Simulation 26, p. 177--203, 1986.

    .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
       - the Epps-Singleton two-sample test using the empirical characteristic
       function", The Stata Journal 9(3), p. 454--465, 2009.

    """

    x, y, t = np.asarray(x), np.asarray(y), np.asarray(t)
    # check if x and y are valid inputs
    if x.ndim > 1:
        raise ValueError('x must be 1d, but x.ndim equals {}.'.format(x.ndim))
    if y.ndim > 1:
        raise ValueError('y must be 1d, but y.ndim equals {}.'.format(y.ndim))
    nx, ny = len(x), len(y)
    if (nx < 5) or (ny < 5):
        raise ValueError('x and y should have at least 5 elements, but len(x) '
                         '= {} and len(y) = {}.'.format(nx, ny))
    if not np.isfinite(x).all():
        raise ValueError('x must not contain nonfinite values.')
    if not np.isfinite(y).all():
        raise ValueError('y must not contain nonfinite values.')
    n = nx + ny

    # check if t is valid
    if t.ndim > 1:
        raise ValueError('t must be 1d, but t.ndim equals {}.'.format(t.ndim))
    if np.less_equal(t, 0).any():
        raise ValueError('t must contain positive elements only.')

    # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
    # circular import
    from scipy.stats import iqr
    sigma = iqr(np.hstack((x, y))) / 2
    ts = np.reshape(t, (-1, 1)) / sigma

    # covariance estimation of ES test
    gx = np.vstack(
        (np.cos(ts * x), np.sin(ts * x))).T  # shape = (nx, 2*len(t))
    gy = np.vstack((np.cos(ts * y), np.sin(ts * y))).T
    cov_x = np.cov(gx.T, bias=True)  # the test uses biased cov-estimate
    cov_y = np.cov(gy.T, bias=True)
    est_cov = (n / nx) * cov_x + (n / ny) * cov_y
    est_cov_inv = np.linalg.pinv(est_cov)
    r = np.linalg.matrix_rank(est_cov_inv)
    if r < 2 * len(t):
        warnings.warn('Estimated covariance matrix does not have full rank. '
                      'This indicates a bad choice of the input t and the '
                      'test might not be consistent.')  # see p. 183 in [1]_

    # compute test statistic w distributed asympt. as chisquare with df=r
    g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
    w = n * np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))

    # apply small-sample correction
    if (max(nx, ny) < 25):
        corr = 1.0 / (1.0 + n**(-0.45) + 10.1 * (nx**(-1.7) + ny**(-1.7)))
        w = corr * w

    p = chi2.sf(w, r)

    return Epps_Singleton_2sampResult(w, p)
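# A minimal usage sketch of the test via the public scipy.stats entry point
# (the data here are made up for illustration):
import numpy as np
from scipy.stats import epps_singleton_2samp

rng = np.random.default_rng(0)
x = rng.normal(0.0, 1.0, 100)
y = rng.uniform(-2.0, 2.0, 120)

stat, pvalue = epps_singleton_2samp(x, y)
print(stat, pvalue)   # a small p-value suggests the samples come from different distributions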
Example #28
                                  'behavioural_results.npz'),
                     allow_pickle=True,
                     encoding='bytes') as fi:
            snare_deviation_now = fi['snare_deviation'][snareInlier[idx]]
            wdBlk_deviation_now = fi['wdBlk_deviation'][wdBlkInlier[idx]]

            # take only the trials where performance is not nan
            snare_finite = np.isfinite(snare_deviation_now)
            wdBlk_finite = np.isfinite(wdBlk_deviation_now)
            snare_inlier_now = snare_finite  #already filtered for snareInlier in line 41 and 96
            wdBlk_inlier_now = wdBlk_finite

            # take only the trials in range median ± 1.5*IQR
            if iqr_rejection:
                lb_snare = np.median(
                    snare_deviation_now[snare_finite]) - 1.5 * iqr(
                        snare_deviation_now[snare_finite])
                ub_snare = np.median(
                    snare_deviation_now[snare_finite]) + 1.5 * iqr(
                        snare_deviation_now[snare_finite])
                idx_iqr_snare = np.logical_and(snare_deviation_now > lb_snare,
                                               snare_deviation_now < ub_snare)
                snare_inlier_now = np.logical_and(snare_finite, idx_iqr_snare)
                lb_wdBlk = np.median(
                    wdBlk_deviation_now[wdBlk_finite]) - 1.5 * iqr(
                        wdBlk_deviation_now[wdBlk_finite])
                ub_wdBlk = np.median(
                    wdBlk_deviation_now[wdBlk_finite]) + 1.5 * iqr(
                        wdBlk_deviation_now[wdBlk_finite])
                idx_iqr_wdBlk = np.logical_and(wdBlk_deviation_now > lb_wdBlk,
                                               wdBlk_deviation_now < ub_wdBlk)
                wdBlk_inlier_now = np.logical_and(wdBlk_finite, idx_iqr_wdBlk)
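# The same median +/- 1.5*IQR rejection rule on a toy deviation vector, with the
# NaN handling made explicit (values below are made up for illustration):
import numpy as np
from scipy.stats import iqr

dev = np.array([0.02, -0.01, np.nan, 0.03, 0.50, -0.02, 0.01])
finite = np.isfinite(dev)
lb = np.median(dev[finite]) - 1.5 * iqr(dev[finite])
ub = np.median(dev[finite]) + 1.5 * iqr(dev[finite])
inlier = finite & (dev > lb) & (dev < ub)
print(inlier)   # the NaN trial and the 0.50 outlier are rejected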
Example #29
    def __init__(self, data, channels, sf, hypno, href, preload, use_mne,
                 downsample, kwargs_mne, annotations):
        """Init."""
        # ========================== LOAD DATA ==========================
        # Dialog window if data is None :
        if data is None:
            data = dialog_load(self, "Open dataset", '',
                               "Any EEG files (*.vhdr *.edf *.gdf *.bdf *.eeg "
                               "*.egi *.mff *.cnt *.trc *.set *.rec);;"
                               "BrainVision (*.vhdr);;EDF (*.edf);;"
                               "GDF (*.gdf);;BDF (*.bdf);;Elan (*.eeg);;"
                               "EGI (*.egi);;MFF (*.mff);;CNT (*.cnt);;"
                               "Micromed (*.trc);;EEGLab (*.set);;REC (*.rec)")
            upath = os.path.split(data)[0]
        else:
            upath = ''

        if isinstance(data, str):  # file is defined
            # ---------- USE SLEEP or MNE ----------
            # Find file extension :
            file, ext = get_file_ext(data)
            # Force to use MNE if preload is False :
            use_mne = True if not preload else use_mne
            # Get if the file has to be loaded using Sleep or MNE python :
            sleep_ext = ['.eeg', '.vhdr', '.edf', '.trc', '.rec']
            use_mne = True if ext not in sleep_ext else use_mne

            if use_mne:
                is_mne_installed(raise_error=True)

            # ---------- LOAD THE FILE ----------
            if use_mne:  # Load using MNE functions
                logger.debug("Load file using MNE-python")
                kwargs_mne['preload'] = preload
                args = mne_switch(file, ext, downsample, **kwargs_mne)
            else:  # Load using Sleep functions
                logger.debug("Load file using Sleep")
                args = sleep_switch(file, ext, downsample)
            # Get output arguments :
            (sf, downsample, dsf, data, channels, n, offset, annot) = args
            info = ("Data successfully loaded (%s):"
                    "\n- Sampling-frequency : %.2fHz"
                    "\n- Number of time points (before down-sampling): %i"
                    "\n- Down-sampling frequency : %.2fHz"
                    "\n- Number of time points (after down-sampling): %i"
                    "\n- Number of channels : %i"
                    )
            n_channels, n_pts_after = data.shape
            logger.info(info % (file + ext, sf, n, downsample, n_pts_after,
                                n_channels))
            PROFILER("Data file loaded", level=1)

        elif isinstance(data, np.ndarray):  # array of data is defined
            if not isinstance(sf, (int, float)):
                raise ValueError("When passing raw data, the sampling "
                                 "frequency parameter, sf, must either be an "
                                 "integer or a float.")
            file = annot = None
            offset = datetime.time(0, 0, 0)
            dsf, downsample = get_dsf(downsample, sf)
            n = data.shape[1]
            data = data[:, ::dsf]
        else:
            raise IOError("The data should either be a string which refer to "
                          "the path of a file or an array of raw data of shape"
                          " (n_electrodes, n_time_points).")

        # Keep variables :
        self._file = file
        self._annot_file = np.c_[merge_annotations(annotations, annot)]
        self._N = n
        self._dsf = dsf
        self._sfori = float(sf)
        self._toffset = offset.hour * 3600. + offset.minute * 60. + \
            offset.second
        time = np.arange(n)[::dsf] / sf
        self._sf = float(downsample) if downsample is not None else float(sf)

        # ========================== LOAD HYPNOGRAM ==========================
        # Dialog window for hypnogram :
        if hypno is None:
            hypno = dialog_load(self, "Open hypnogram", upath,
                                "Text file (*.txt);;Elan (*.hyp);;"
                                "CSV file (*.csv);;EDF+ file(*.edf);"
                                ";All files (*.*)")
            hypno = None if hypno == '' else hypno
        if isinstance(hypno, np.ndarray):  # array_like
            if len(hypno) == n:
                hypno = hypno[::dsf]
            else:
                raise ValueError("Then length of the hypnogram must be the "
                                 "same as raw data")
        if isinstance(hypno, str):  # (*.hyp / *.txt / *.csv)
            hypno, _ = read_hypno(hypno, time=time, datafile=file)
            # Oversample then downsample :
            hypno = oversample_hypno(hypno, self._N)[::dsf]
            PROFILER("Hypnogram file loaded", level=1)

        # ========================== CHECKING ==========================
        # ---------- DATA ----------
        # Check data shape :
        if data.ndim != 2:
            raise ValueError("The data must be a 2D array")
        nchan, npts = data.shape

        # ---------- CHANNELS ----------
        if (channels is None) or (len(channels) != nchan):
            warn("The number of channels must be " + str(nchan) + ". Default "
                 "channel names will be used instead.")
            channels = ['chan' + str(k) for k in range(nchan)]
        # Clean channel names :
        patterns = ['eeg', 'EEG', 'ref']
        chanc = []
        for c in channels:
            # Remove informations after . :
            c = c.split('.')[0]
            c = c.split('-')[0]
            # Exclude patterns :
            for i in patterns:
                c = c.replace(i, '')
            # Remove space :
            c = c.replace(' ', '')
            c = c.strip()
            chanc.append(c)

        # ---------- STAGE ORDER ----------
        # href checking :
        absref = ['art', 'wake', 'n1', 'n2', 'n3', 'rem']
        absint = [-1, 0, 1, 2, 3, 4]
        if href is None:
            href = absref
        elif (href is not None) and isinstance(href, list):
            # Force lower case :
            href = [k.lower() for k in href]
            # Check that all stage are present :
            for k in absref:
                if k not in href:
                    raise ValueError(k + " not found in href.")
            # Force capitalize :
            href = [k.capitalize() for k in href]
            href[href.index('Rem')] = 'REM'
        else:
            raise ValueError("The href parameter must be a list of string and"
                             " must contain 'art', 'wake', 'n1', 'n2', 'n3' "
                             "and 'rem'")
        # Conversion variable :
        absref = ['Art', 'Wake', 'N1', 'N2', 'N3', 'REM']
        conv = {absint[absref.index(k)]: absint[i] for i, k in enumerate(href)}

        # ---------- HYPNOGRAM ----------
        if hypno is None:
            hypno = np.zeros((npts,), dtype=np.float32)
        else:
            n = len(hypno)
            # Check hypno values :
            if (hypno.min() < -1.) or (hypno.max() > 4) or (n != npts):
                warn("\nHypnogram values must be comprised between -1 and 4 "
                     "(see Iber et al. 2007). Use:\n-1 -> Art (optional)\n 0 "
                     "-> Wake\n 1 -> N1\n 2 -> N2\n 3 -> N4\n 4 -> REM\nEmpty "
                     "hypnogram will be used instead")
                hypno = np.zeros((npts,), dtype=np.float32)

        # ---------- SCALING ----------
        # Assume that the inter-quartile amplitude of EEG data is ~50 uV
        iqr_chan = iqr(data[:, :int(data.shape[1] / 4)], axis=-1)
        bad_iqr = iqr_chan < 1.

        if np.any(bad_iqr):
            mult_fact = np.zeros_like(iqr_chan)
            iqr_chan[iqr_chan == 0.] = 1.
            mult_fact[bad_iqr] = np.floor(np.log10(50. / iqr_chan[bad_iqr]))
            data *= 10. ** mult_fact[..., np.newaxis]
            warn("Wrong channel data amplitude. ")

        # ---------- CONVERSION ----------
        # Convert data and hypno to be contiguous and float 32 (for vispy):
        self._data = vispy_array(data)
        self._hypno = vispy_array(hypno)
        self._time = vispy_array(time)
        self._channels = chanc
        self._href = href
        self._hconv = conv
        PROFILER("Check data", level=1)
Example #30
def evaluate_model_helper(df_input,
                          choice,
                          params,
                          seed,
                          scoring_metrics,
                          eval_model,
                          num_folds=10,
                          eval_method="robust",
                          verbose=0,
                          cv_generator=None):
    """
    Evaluate the model performance with the given parameters, cv and seed
    With the given metric list

    :param df_input: Pandas DataFrame, the original input dataset
    :param choice: String, one of "Metal", "Insulator", "MIT"
    :param params: Dictionary, the best parameters from hyperparameter tuning
    :param seed: Integer, the random seed for reproducibility
    :param scoring_metrics: List, a list of scoring metrics
    :param eval_model: sklearn model, the model to evaluate
    :param num_folds: Integer, the number of stratified folds (default: 10)
    :param eval_method: String, one of "robust", "standard"
    :param verbose: Int, if 1, print out the intermediate results
    :param cv_generator: Cross validator, if None will use stratified k-fold
    :return: Dictionary
    """

    X_features, y_labels = load_data(df_input, choice)
    if eval_model.__name__ == "LogisticRegression":
        X_features = RobustScaler().fit_transform(X_features)

    fit_params_dict = None
    # if (multiclass & XGBClassifier) | GradientBoostingClassifier, specify the sample weights
    if ((choice == "Multiclass") and (eval_model.__name__ == "XGBClassifier")) or \
            (eval_model.__name__ == "GradientBoostingClassifier"):
        fit_params_dict = {
            "sample_weight":
            compute_sample_weight(class_weight="balanced", y=y_labels)
        }

    # if cv_folds is not specified
    if not cv_generator:
        # initialize the stratified k-folds
        cv_generator = StratifiedKFold(n_splits=num_folds,
                                       shuffle=True,
                                       random_state=seed)

    # initialize the xgboost classifier with the tuned parameters
    model_to_eval = eval_model(**params[choice])
    # evaluate the tuned model with stratified k-fold cv
    cv_scores = cross_validate(model_to_eval,
                               X_features,
                               y_labels,
                               scoring=scoring_metrics,
                               cv=cv_generator,
                               error_score=np.nan,
                               fit_params=fit_params_dict)

    if verbose == 1:
        print(
            "\nEvaluating the {label} vs. non-{label} binary classifier (seed={rand_seed})"
            .format(label=choice, rand_seed=seed))
        if num_folds:
            print("For {} folds".format(num_folds))
        if eval_method == "robust":
            printout_lst = [
                "Median {}: {:0.2f} w/ IQR: {:0.2f}".format(
                    metric, np.nanmedian(cv_scores["test_" + metric]),
                    iqr(cv_scores["test_" + metric], nan_policy="omit"))
                for metric in scoring_metrics
            ]
        elif eval_method == "standard":
            printout_lst = [
                "Mean {}: {:0.2f} w/ std: {:0.2f}".format(
                    metric, np.nanmean(cv_scores["test_" + metric]),
                    np.nanstd(cv_scores["test_" + metric]))
                for metric in scoring_metrics
            ]

        print(*printout_lst, sep="\n")
        print("-----------------------------------\n")

    return {metric: cv_scores["test_" + metric] for metric in scoring_metrics}
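# Why the "robust" summary is the default here: with one failed or degenerate fold,
# median/IQR barely move while mean/std shift noticeably. A toy illustration
# (scores below are made up):
import numpy as np
from scipy.stats import iqr

scores = np.array([0.82, 0.85, np.nan, 0.79, 0.88, 0.31, 0.84])
print("Median {:0.2f} w/ IQR: {:0.2f}".format(
    np.nanmedian(scores), iqr(scores, nan_policy="omit")))
print("Mean {:0.2f} w/ std: {:0.2f}".format(
    np.nanmean(scores), np.nanstd(scores)))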
Example #31
### Statistical significance of the results
nruns = 1000
sep_limit = 0.02
p_median, p_mean = FragMent.Stat_Sig(nn_seps, "NNS", boundary, nruns,
                                     sep_limit)
ks_stat, p_ks = FragMent.KS_test(nn_seps, "NNS", boundary, nruns, sep_limit)
ad_stat, crit_vals, p_ad = FragMent.AD_test(nn_seps, "NNS", boundary, nruns,
                                            sep_limit)

### Report results
print " "
print " "
print "######## Nearest neighbour results ########"
print "The median and interquartile range of the distribution are: ", numpy.median(
    nn_seps), iqr(nn_seps)
print "The p-value using the median-interquartile range NHT is   : ", p_median
print "The mean and standard deviation of the distribution are   : ", numpy.mean(
    nn_seps), numpy.std(nn_seps)
print "The p-value using the mean-standard deviation NHT is      : ", p_mean
print "The p-values from the K-S and A-D test are                : ", p_ks, p_ad

### Perform a minimum spanning test on the data
mst_seps, mst = FragMent.MST(pos)

### Statistical significance of the results
p_median, p_mean = FragMent.Stat_Sig(mst_seps, "MST", boundary, nruns,
                                     sep_limit)
ks_stat, p_ks = FragMent.KS_test(mst_seps, "MST", boundary, nruns, sep_limit)
ad_stat, crit_vals, p_ad = FragMent.AD_test(mst_seps, "MST", boundary, nruns,
                                            sep_limit)
Example #32
def find_iqr(dframe):
    # Row-wise IQR, excluding each row's own (diagonal) entry.
    return [int(iqr([v for k, v in enumerate(dframe.iloc[i].values) if k != i]))
            for i in range(len(dframe))]
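# Toy usage of find_iqr on a small symmetric frame (values are made up); each row's
# diagonal entry is excluded before taking the IQR:
import pandas as pd
from scipy.stats import iqr

df_toy = pd.DataFrame([[0, 2, 4, 6],
                       [2, 0, 8, 10],
                       [4, 8, 0, 12],
                       [6, 10, 12, 0]])
print(find_iqr(df_toy))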
Example #33
    date_parser=lambda x: pd.to_datetime(float(x) + 28800000000000))
p = p.drop(columns=['name'])

d = p['Press'].values
l = p.index

mean = np.mean(d)
trimmean = stats.trim_mean(d, 0.2)
median = np.median(d)

meanv = np.array([np.mean(d)] * len(d))
trimmeanv = np.array([stats.trim_mean(d, 0.2)] * len(d))
medianv = np.array([np.median(d)] * len(d))

stdv = np.array([np.std(d)] * len(d))
iqrv = np.array([stats.iqr(d)] * len(d))
madv = np.array([stats.median_absolute_deviation(d)] * len(d))

print("std =", format(np.std(d), ".2f"), " iqr =", format(stats.iqr(d), ".2f"),
      " mad =", format(stats.median_absolute_deviation(d), ".2f"))

mean_up = meanv + stdv
mean_down = meanv - stdv

median_up = medianv + iqrv
median_down = medianv - iqrv

mad_up = trimmeanv + madv
mad_down = trimmeanv - madv

#plt.figure(figsize=(10,7))
Example #34
def performPrep(eeg, refChan, srate, linenoise, referenceType='robust'):
    dim = np.shape(eeg)
    if refChan != 0:
        eeg_chans = np.setdiff1d(
            range(0, dim[0]),
            refChan - 1)  #remove the reference channel from the eeg channels
        eeg = eeg[eeg_chans, :]

    #finding bad channels

    #finding channels with NaNs or constant values for long periods of time
    org_dim = np.shape(eeg)

    originalChannels = np.arange(org_dim[0])
    channelsInterpolate = originalChannels
    nanChannelMask = [False] * org_dim[0]
    noSignalChannelMask = [False] * org_dim[0]

    for i in range(0, org_dim[0]):
        nanChannelMask[i] = np.sum(np.isnan(eeg[i, :])) > 0
    for i in range(0, org_dim[0]):
        noSignalChannelMask[i] = robust.mad(eeg[i, :]) < 10**(-10) or np.std(
            eeg[i, :]) < 10**(-10)
    badChannelsfromNans = channelsInterpolate[nanChannelMask]
    badChannelsfromNoData = channelsInterpolate[noSignalChannelMask]
    for i in range(0, org_dim[0]):
        if nanChannelMask[i] == True or noSignalChannelMask[i] == True:
            eeg = np.delete(eeg, i, axis=0)

    channelsInterpolate = np.setdiff1d(
        channelsInterpolate,
        np.union1d(
            badChannelsfromNans,
            badChannelsfromNoData))  #channels to be used for interpolation
    evaluationChannels = channelsInterpolate
    new_dim = np.shape(eeg)

    # find channels that have abnormally high or low amplitude
    robustchanneldeviation = np.zeros(org_dim[0])
    badChannelFromDeviationMask = [False] * (new_dim[0])
    channeldeviation = np.zeros(new_dim[0])
    for i in range(0, new_dim[0]):
        channeldeviation[i] = 0.7413 * iqr(eeg[i, :])

    channeldeviationSD = 0.7413 * iqr(channeldeviation)
    channeldeviationMedian = np.nanmedian(channeldeviation)
    robustchanneldeviation[evaluationChannels] = np.divide(
        np.subtract(channeldeviation, channeldeviationMedian),
        channeldeviationSD)
    for i in range(0, new_dim[0]):
        badChannelFromDeviationMask[i] = abs(
            robustchanneldeviation[i]) > 5 or np.isnan(
                robustchanneldeviation[i])

    badChannelsfromDeviation = evaluationChannels[badChannelFromDeviationMask]

    #finding channels with high frequency noise
    if srate > 100:
        eeg = np.transpose(eeg)
        dim = np.shape(eeg)
        X = np.zeros((dim[0], dim[1]))
        B = filter_design(100,
                          A=np.array([1, 1, 0, 0]),
                          F=np.array([0, .36, 0.4, 1]),
                          srate=250)
        for i in range(0, dim[1]):
            X[:, i] = signal.filtfilt(B, 1, eeg[:, i])

        noisiness = np.divide(robust.mad(np.subtract(eeg, X)), robust.mad(X))
        noisinessmedian = np.nanmedian(noisiness)
        noiseSD = np.median(
            np.absolute(np.subtract(noisiness, np.median(noisiness)))) * 1.4826
        zscoreHFNoise = np.divide(np.subtract(noisiness, noisinessmedian),
                                  noiseSD)
        HFnoisemask = [False] * new_dim[0]
        for i in range(0, new_dim[0]):
            HFnoisemask[i] = zscoreHFNoise[i] > 5 or np.isnan(zscoreHFNoise[i])
        badChannelsfromHFnoise = evaluationChannels[HFnoisemask]
    else:
        # Sampling rate too low to assess high-frequency noise: skip this criterion.
        X = eeg
        noisinessmedian = 0
        noiseSD = 1
        zscoreHFNoise = np.zeros(new_dim[0])
        badChannelsfromHFnoise = np.array([], dtype=int)
    #finding channels by correlation
    correlationSeconds = 1  # default value
    correlationFrames = correlationSeconds * srate
    correlationWindow = np.arange(correlationFrames)
    correlationOffsets = np.arange(1, dim[0] - correlationFrames,
                                   correlationFrames)
    Wcorrelation = len(correlationOffsets)
    maximumCorrelations = np.ones((org_dim[0], Wcorrelation))
    drop_out = np.zeros((dim[1], Wcorrelation))
    channelCorrelation = np.ones((Wcorrelation, dim[1]))
    noiselevels = np.zeros((Wcorrelation, dim[1]))
    channelDeviations = np.zeros((Wcorrelation, dim[1]))
    drop = np.zeros((Wcorrelation, dim[1]))
    n = len(correlationWindow)
    XWin = np.reshape(np.transpose(X[0:n * Wcorrelation, :]),
                      (dim[1], n, Wcorrelation),
                      order='F')
    dataWin = np.reshape(np.transpose(eeg[0:n * Wcorrelation, :]),
                         (dim[1], n, Wcorrelation),
                         order='F')
    for k in range(0, Wcorrelation):
        eegportion = np.transpose(np.squeeze(XWin[:, :, k]))
        dataportion = np.transpose(np.squeeze(dataWin[:, :, k]))
        windowCorrelation = np.corrcoef(np.transpose(eegportion))
        abs_corr = np.abs(
            np.subtract(windowCorrelation,
                        np.diag(np.diag(windowCorrelation))))
        channelCorrelation[k, :] = np.quantile(
            abs_corr, 0.98, axis=0)  # problem is here is solved
        noiselevels[k, :] = np.divide(
            robust.mad(np.subtract(dataportion, eegportion)),
            robust.mad(eegportion))
        channelDeviations[k, :] = 0.7413 * iqr(dataportion, axis=0)

    for i in range(0, Wcorrelation):
        for j in range(0, dim[1]):
            drop[i, j] = int(
                np.isnan(channelCorrelation[i, j])
                or np.isnan(noiselevels[i, j]))
            if drop[i, j] == 1:
                channelDeviations[i, j] = 0
                noiselevels[i, j] = 0

    maximumCorrelations[evaluationChannels, :] = np.transpose(
        channelCorrelation)
    drop_out[:] = np.transpose(drop)
    noiselevels_out = np.transpose(noiselevels)
    channelDeviations_out = np.transpose(channelDeviations)
    thresholdedCorrelations = maximumCorrelations < 0.4
    thresholdedCorrelations = thresholdedCorrelations.astype(int)
    fractionBadCorrelationWindows = np.mean(thresholdedCorrelations, axis=1)
    fractionBadDropOutWindows = np.mean(drop_out, axis=1)

    badChannelsFromCorrelation = np.where(fractionBadCorrelationWindows > 0.01)
    badChannelsFromCorrelation_out = badChannelsFromCorrelation[:]
    badChannelsFromDropOuts = np.where(fractionBadDropOutWindows > 0.01)
    badChannelsFromDropOuts_out = badChannelsFromDropOuts[:]
    #medianMaxCorrelation = np.median(maximumCorrelations, 2);

    badChannelsfromSNR = np.union1d(badChannelsFromCorrelation_out,
                                    badChannelsfromHFnoise)
    noisyChannels = np.union1d(
        np.union1d(
            np.union1d(
                badChannelsfromDeviation,
                np.union1d(badChannelsFromCorrelation_out,
                           badChannelsFromDropOuts_out)), badChannelsfromSNR),
        np.union1d(badChannelsfromNans, badChannelsfromNoData))
    print(noisyChannels)
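# The 0.7413 factor used repeatedly above is the usual robust estimate of the
# standard deviation: for a Gaussian, IQR ~= 1.349 * sigma, so sigma ~= IQR / 1.349
# ~= 0.7413 * IQR. A quick check on synthetic data:
import numpy as np
from scipy.stats import iqr

rng = np.random.default_rng(0)
x = rng.normal(loc=0.0, scale=2.0, size=100000)
print(np.std(x))         # ~2.0
print(0.7413 * iqr(x))   # also ~2.0, but far less sensitive to outliers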
Example #35
"""

for col in num_cols:
    regents_df[col] = regents_df[col].apply(lambda x: float(x))

means = [np.mean(regents_df[col].astype(float)) for col in num_cols]
regents_stats = pd.DataFrame(means, index=num_cols, columns=['col_mean'])

regents_stats['stdev'] = [
    np.std(regents_df[col].astype(float)) for col in num_cols
]
regents_stats['col_median'] = [
    np.median(regents_df[col].astype(float)) for col in num_cols
]
regents_stats['iqr'] = [
    st.iqr(regents_df[col].astype(float)) for col in num_cols
]
regents_stats['five_num'] = [
    np.percentile(regents_df[col].astype(float), [0, 25, 50, 75, 100])
    for col in num_cols
]
regents_stats['deciles'] = [
    np.percentile(regents_df[col].astype(float),
                  [10, 20, 30, 40, 50, 60, 70, 80, 90]) for col in num_cols
]

print(regents_stats['col_median'])
algebra_scores = regents_df[regents_df['exam_name'] == 'common core algebra']
means = [np.mean(algebra_scores[col].astype(float)) for col in num_cols]
alg_stats = pd.DataFrame(means, index=num_cols, columns=['col_mean'])
Example #36
def _get_features_for_vector(vec, prefix):
    # Prepare frequently used values
    _len = len(vec)
    _max = max(vec)
    _pos_max = vec.index(_max)
    _min = min(vec)
    _pos_min = vec.index(_min)
    _range = abs(_max - _min)
    _var = variance(vec)
    _std = stdev(vec)
    _mean = mean(vec)
    _mode = mode(vec, axis=None)[0][0]
    _median = median(vec)
    features = {
        f'{prefix} | MAX': _max,
        f'{prefix} | MIN': _min,
        # f'{prefix} | POSITION OF MAX': _pos_max, # TOO BIG
        # f'{prefix} | POSITION OF MIN': _pos_min, # TOO BIG
        f'{prefix} | RELATIVE POSITION OF MAX': safe_div(_pos_max, _len),
        f'{prefix} | RELATIVE POSITION OF MIN': safe_div(_pos_min, _len),
        f'{prefix} | RANGE': _range,
        f'{prefix} | RELATIVE RANGE': safe_div(_range, _max),
        f'{prefix} | RELATIVE VARIATION RANGE': safe_div(_range, _mean),
        f'{prefix} | INTERQUARTILE RANGE': iqr(vec),
        f'{prefix} | RELATIVE INTERQUARTILE RANGE': safe_div(iqr(vec), _max),
        f'{prefix} | INTERDECILE RANGE': quantile(vec, 0.9) - quantile(vec, 0.1),
        f'{prefix} | RELATIVE INTERDECILE RANGE': safe_div(quantile(vec, 0.9) - quantile(vec, 0.1), _max),
        f'{prefix} | INTERPERCENTILE RANGE': quantile(vec, 0.99) - quantile(vec, 0.01),
        f'{prefix} | RELATIVE INTERPERCENTILE RANGE': safe_div(quantile(vec, 0.99) - quantile(vec, 0.01), _max),
        f'{prefix} | STUDENTIZED RANGE': safe_div(_range, _var),
        f'{prefix} | MEAN': _mean,
        # f'{prefix} | GEOMETRIC MEAN': gmean(vec), # always NaN
        # f'{prefix} | HARMONIC MEAN': harmonic_mean(vec), # harmonic mean does not support negative values
        f'{prefix} | MEAN EXCLUDING OUTLIERS (10)': trim_mean(vec, 0.1),
        f'{prefix} | MEAN EXCLUDING OUTLIERS (20)': trim_mean(vec, 0.2),
        f'{prefix} | MEAN EXCLUDING OUTLIERS (30)': trim_mean(vec, 0.3),
        f'{prefix} | MEAN EXCLUDING OUTLIERS (40)': trim_mean(vec, 0.4),
        f'{prefix} | MEAN EXCLUDING OUTLIERS (50)': trim_mean(vec, 0.5),
        f'{prefix} | MEDIAN': _median,
        f'{prefix} | MODE': _mode,
        f'{prefix} | VARIANCE': _var,
        f'{prefix} | STANDARD DEVIATION': _std,
        f'{prefix} | MEDIAN ABSOLUTE DEVIATION': median_absolute_deviation(vec),
        # f'{prefix} | GEOMETRIC STANDARD DEVIATION': gstd(vec), # The geometric standard deviation is defined for
        # strictly positive values only.
        f'{prefix} | RELATIVE STANDARD DEVIATION': safe_div(_std, _mean),
        f'{prefix} | INDEX OF DISPERSION': safe_div(_var, _mean),
        # f'{prefix} | 3rd MOMENT': moment(_var, 3), always 0
        # f'{prefix} | 4th MOMENT': moment(_var, 4), always 0
        # f'{prefix} | 5th MOMENT': moment(_var, 5), always 0
        # f'{prefix} | 6th MOMENT': moment(_var, 6), always 0
        f'{prefix} | KURTOSIS': kurtosis(vec),
        f'{prefix} | SKEWNESS': skew(vec),
        f'{prefix} | PEARSONS 1st SKEWNESS COEFFICIENT': safe_div((3 * (_mean - _mode)), _std),
        f'{prefix} | PEARSONS 2nd SKEWNESS COEFFICIENT': safe_div(3 * (_mean - _median), _std),
        f'{prefix} | 1st PERCENTILE': percentile(vec, 1),
        f'{prefix} | 5th PERCENTILE': percentile(vec, 5),
        f'{prefix} | 10th PERCENTILE': percentile(vec, 10),
        f'{prefix} | 20th PERCENTILE': percentile(vec, 20),
        f'{prefix} | 1st QUARTILE': percentile(vec, 25),
        f'{prefix} | 30th PERCENTILE': percentile(vec, 30),
        f'{prefix} | 40th PERCENTILE': percentile(vec, 40),
        f'{prefix} | 60th PERCENTILE': percentile(vec, 60),
        f'{prefix} | 70th PERCENTILE': percentile(vec, 70),
        f'{prefix} | 3rd QUARTILE': percentile(vec, 75),
        f'{prefix} | 80th PERCENTILE': percentile(vec, 80),
        f'{prefix} | 90th PERCENTILE': percentile(vec, 90),
        f'{prefix} | 95th PERCENTILE': percentile(vec, 95),
        f'{prefix} | 99th PERCENTILE': percentile(vec, 99),
        # f'{prefix} | SHANNON ENTROPY': entropy(vec), # MAKE NO SENSE HERE
        # f'{prefix} | MODULATION': _range / (_max + _min) # MAKE NO SENSE HERE
    }
    return features
Example #37
def trend(dow, intersection, direction, int_leg, new_dataframe,
          iqr_multiplier):

    ## trend: str str str str pd.DataFrame num => matplotlib plot

    ## requires: dow, intersection, direction, int_leg: strings as used in all previous functions
    ##           new_dataframe: pd.DataFrame containing two columns - 'datetime_bin' and 'volume'.
    ##                          The format of this dataframe is the same as the one returned by
    ##                          the grab function
    ##           iqr_multiplier: numeric multiple of the IQR used to set the trend bounds

    # grab data

    data = grab(dow, intersection, direction, int_leg)
    data['datetime_bin'] = pd.to_datetime(data['datetime_bin'])
    intervals = len(
        data.groupby(data['datetime_bin'].dt.strftime('%d')).count()
        ['datetime_bin'])  #number of periods

    rdata = ts(append(data.volume.values, new_dataframe.volume.values),
               frequency=96)

    # decompose

    rstring = """function(testdata){
                library(forecast)
                decomp <- stl(testdata, s.window = 'periodic')
                outdf<-as.data.frame(decomp$time.series)
                outdf
                }"""

    rfunc = robjects.r(rstring)
    r_df = rfunc(rdata)
    decomp_as_df = pandas2ri.ri2py(r_df)

    trendvalues = decomp_as_df['trend']  #all data including new data
    oldtrend = trendvalues[0:len(data) - 1]  #old data

    # Create bounds (via scipy.stats.iqr and numpy.percentile)

    pct = [percentile(oldtrend, 25),
           percentile(oldtrend, 75)]  #25th percentile and 75th
    iqrange = (iqr(oldtrend))
    lower_bound = pct[0] - (iqrange * iqr_multiplier)
    upper_bound = pct[1] + (iqrange * iqr_multiplier)

    # if more than a quarter of the new data sits outside the bounds, do the following:

    if list(lower_bound <= trendvalues[len(data):]).count(False) >= 0.25 * len(
            new_dataframe) or list(upper_bound >= trendvalues[len(data):]
                                   ).count(False) >= 0.25 * len(new_dataframe):

        # Plot Data With Bounds and data cutoff

        plt.ioff()
        plt.figure(figsize=(18, 10))
        plt.plot(trendvalues,
                 linewidth=2,
                 color='blue',
                 alpha=0.7,
                 label='Trend Volume')

        plt.axvline(x=(96 * intervals) - 1,
                    c='#FF00FF',
                    linewidth=4,
                    alpha=0.7,
                    linestyle='--',
                    label='New Data Cutoff')  # data cut off point
        plt.axhline(lower_bound, alpha=0.5, color='c')  #lower bound
        plt.axhline(upper_bound, alpha=0.5, color='c')  #upper bound
        plt.axhspan(lower_bound,
                    upper_bound,
                    alpha=0.1,
                    facecolor='c',
                    label='Trend Bounds')  #spread
        plt.title("%s Trendline with New Data (%s Leg, %s)" %
                  (intersection, int_leg, direction))
        plt.rc('font', **font)
        plt.ylabel("Volume Trend")
        plt.legend()
        g.trend_graph_count += 1  #update graph count

        plt.savefig(path + '\\trend_%s.png' % (g.trend_graph_count), dpi=300)
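
The bound check above leans on R's stl through rpy2. A pure-Python sketch of the same idea, assuming statsmodels' seasonal_decompose as a stand-in for stl and hypothetical old_volume/new_volume arrays of 15-minute counts (96 bins per day):

import numpy as np
from scipy.stats import iqr
from statsmodels.tsa.seasonal import seasonal_decompose

def trend_outside_bounds(old_volume, new_volume, period=96, iqr_multiplier=1.5):
    """Return True if at least 25% of the new data's trend leaves the IQR-based bounds."""
    series = np.concatenate([old_volume, new_volume]).astype(float)
    # Moving-average decomposition; the trend is NaN at both ends, so use nan-aware stats.
    trend = seasonal_decompose(series, model='additive', period=period).trend
    old_trend, new_trend = trend[:len(old_volume)], trend[len(old_volume):]

    q1, q3 = np.nanpercentile(old_trend, [25, 75])
    spread = iqr(old_trend, nan_policy='omit')
    lower, upper = q1 - iqr_multiplier * spread, q3 + iqr_multiplier * spread

    below = np.sum(new_trend < lower)  # NaN comparisons are False, i.e. counted as in-bounds
    above = np.sum(new_trend > upper)
    return below >= 0.25 * len(new_volume) or above >= 0.25 * len(new_volume)

This keeps the same decision rule (a quarter of the new points below the lower bound or above the upper bound) without the R dependency.
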
Example #38
0
    def find_bad_by_correlation(self,
                                correlation_secs=1.0,
                                correlation_threshold=0.4,
                                frac_bad=0.1):
        """Find correlation between the low frequency components of the EEG below 50 Hz.

        Correlation is done using a sliding non-overlapping time window. The maximum absolute correlation is
        as the 98th percentile of the absolute values of the correlations with the other channels
        If the maximum correlation is less than 0.4 then the channel is designated as bad by corre-
        lation.

        Parameters
        ----------
        correlation_secs: float
                          length of the correlation time window (default: 1 secs).
        correlation_threshold: float
                               correlation threshold below which channel is marked bad.
        frac_bad: float
                  percentage of data windows in which the correlation threshold was not surpassed and
                  if a channel gets a value of greater than 1%, it is designated bad.
        """
        self.find_bad_by_hfnoise()  # since filtering is performed there
        correlation_frames = correlation_secs * self.sample_rate
        correlation_window = np.arange(correlation_frames)
        correlation_offsets = np.arange(
            1, (self.new_dimensions[1] - correlation_frames),
            correlation_frames)
        w_correlation = len(correlation_offsets)
        maximum_correlations = np.ones(
            (self.original_dimensions[0], w_correlation))
        drop_out = np.zeros((self.new_dimensions[0], w_correlation))
        channel_correlation = np.ones((w_correlation, self.new_dimensions[0]))
        noiselevels = np.zeros((w_correlation, self.new_dimensions[0]))
        channel_deviations = np.zeros((w_correlation, self.new_dimensions[0]))
        drop = np.zeros((w_correlation, self.new_dimensions[0]))
        len_correlation_window = len(correlation_window)
        EEGData = np.transpose(self.EEGData)
        EEG_new_win = np.reshape(
            np.transpose(EEGData[0:len_correlation_window * w_correlation, :]),
            (self.new_dimensions[0], len_correlation_window, w_correlation),
            order="F",
        )
        data_win = np.reshape(
            np.transpose(self.EEGData_beforeFilt[0:len_correlation_window *
                                                 w_correlation, :]),
            (self.new_dimensions[0], len_correlation_window, w_correlation),
            order="F",
        )
        for k in range(0, w_correlation):
            eeg_portion = np.transpose(np.squeeze(EEG_new_win[:, :, k]))
            data_portion = np.transpose(np.squeeze(data_win[:, :, k]))
            window_correlation = np.corrcoef(np.transpose(eeg_portion))
            abs_corr = np.abs(
                np.subtract(window_correlation,
                            np.diag(np.diag(window_correlation))))
            channel_correlation[k, :] = np.quantile(abs_corr, 0.98, axis=0)
            noiselevels[k, :] = np.divide(
                robust.mad(np.subtract(data_portion, eeg_portion), c=1),
                robust.mad(eeg_portion, c=1),
            )
            channel_deviations[k, :] = 0.7413 * iqr(data_portion, axis=0)
        for i in range(0, w_correlation):
            for j in range(0, self.new_dimensions[0]):
                drop[i, j] = int(
                    np.isnan(channel_correlation[i, j])
                    or np.isnan(noiselevels[i, j]))
                if drop[i, j] == 1:
                    channel_deviations[i, j] = 0
                    noiselevels[i, j] = 0
        maximum_correlations[self.channels_interpolate, :] = np.transpose(
            channel_correlation)
        drop_out[:] = np.transpose(drop)
        thresholded_correlations = maximum_correlations < correlation_threshold
        thresholded_correlations = thresholded_correlations.astype(int)
        fraction_BadCorrelationWindows = np.mean(thresholded_correlations,
                                                 axis=1)
        fraction_BadDropOutWindows = np.mean(drop_out, axis=1)

        bad_correlation_channels_idx = np.argwhere(
            fraction_BadCorrelationWindows > frac_bad)
        bad_correlation_channels_name = self.ch_names_original[
            bad_correlation_channels_idx.astype(int)]
        self.bad_by_correlation = [i[0] for i in bad_correlation_channels_name]

        dropout_channels_idx = np.argwhere(
            fraction_BadDropOutWindows > frac_bad)
        dropout_channels_name = self.ch_names_original[
            dropout_channels_idx.astype(int)]
        self.bad_by_dropout = [i[0] for i in dropout_channels_name]
        return None
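
A condensed sketch of the windowed-correlation check described in the docstring above, leaving out the drop-out and noise-level bookkeeping; eeg is assumed to be an (n_channels, n_samples) array of already low-pass-filtered data:

import numpy as np

def bad_by_correlation_sketch(eeg, sfreq, correlation_secs=1.0,
                              correlation_threshold=0.4, frac_bad=0.1):
    """Return indices of channels that correlate poorly with the rest."""
    win = int(correlation_secs * sfreq)
    n_win = eeg.shape[1] // win
    max_corr = np.empty((eeg.shape[0], n_win))
    for k in range(n_win):
        seg = eeg[:, k * win:(k + 1) * win]
        corr = np.abs(np.corrcoef(seg))                   # channel-by-channel correlations
        np.fill_diagonal(corr, 0.0)                       # ignore self-correlation
        max_corr[:, k] = np.quantile(corr, 0.98, axis=0)  # 98th percentile per channel
    frac_uncorrelated = np.mean(max_corr < correlation_threshold, axis=1)
    return np.where(frac_uncorrelated > frac_bad)[0]
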
df.head()


# In[ ]:


df[f_num]


# In[ ]:


from scipy import stats

# inter-quartile range of each numeric feature
IQR = []
for i in f_num:
    IQR.append(stats.iqr(df[i], interpolation='midpoint'))
IQR


# In[ ]:


limits = dict()
j = 0
for i in f_num:
    Q1 = np.percentile(df[i], 25, interpolation='midpoint')
    Q3 = np.percentile(df[i], 75, interpolation='midpoint')
    #print(Q1, Q3)
    # Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers
    limits[i] = [Q1 - (1.5 * IQR[j]), Q3 + (1.5 * IQR[j])]
    j += 1
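

# In[ ]:


# A possible follow-up cell (an assumption, not part of the original notebook):
# use the Tukey fences stored in `limits` to flag out-of-range values per column,
# assuming pandas has been imported as pd earlier in the notebook.
outlier_mask = pd.DataFrame(False, index=df.index, columns=f_num)
for i in f_num:
    low, high = limits[i]
    outlier_mask[i] = (df[i] < low) | (df[i] > high)

outlier_mask.sum()  # count of flagged outliers in each column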