Example #1
 def trimFreq(self, fmin=False, fmax=False):
     self.loc_fmin = 0
     self.loc_fmax = -1
     if fmin:
         self.loc_fmin = np_amin(np_where(self.f > fmin))
     if fmax:
         self.loc_fmax = np_amin(np_where(self.f > fmax))
     if self.loc_fmax == -1:
         self.loc_fmax = self.f.size
     self.fTrim = self.f[self.loc_fmin:self.loc_fmax + 1]
     self.ampTrim = self.amp[self.loc_fmin:self.loc_fmax + 1]
     return
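The index lookup used above (first position where f exceeds a threshold) also works on standalone arrays; a minimal sketch with illustrative data, assuming numpy is imported as np:

import numpy as np

f = np.linspace(0.0, 5.0, 501)            # toy frequency axis
amp = np.exp(-f)                          # toy spectrum
fmin, fmax = 0.5, 2.4
loc_fmin = np.amin(np.where(f > fmin))    # first index above fmin
loc_fmax = np.amin(np.where(f > fmax))    # first index above fmax
f_trim = f[loc_fmin:loc_fmax + 1]
amp_trim = amp[loc_fmin:loc_fmax + 1]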
Example #2
 def trimTime(self, tmin=False, tmax=False):
     loc_tmin = 0
     loc_tmax = -1
     if tmin:
         loc_tmin = np_amin(np_where(self.t > tmin))
     if tmax:
         loc_tmax = np_amin(np_where(self.t > tmax))
     if loc_tmax == -1:
         loc_tmax = self.t.size
     self.t[:loc_tmax - loc_tmin] = self.t[loc_tmin:loc_tmax]
     self.t.resize(loc_tmax - loc_tmin, refcheck=False)
     self.wf = self.wf[loc_tmin:loc_tmax]
     self.FFT()
     return
Example #3
File: ttt.py Project: vhalis/2048
 def check_for_win(self):
     # 8 lines exist that can be the same symbol
     # Check verticals and horizontals
     # This only works for two dimensions right now
     p1 = set(np_where(self._tiles.flat == 1)[0])
     for combo in self.WINNING_COMBINATIONS:
         if combo.issubset(p1):
             self.winning_player = 1
             self.game_over()
     p2 = set(np_where(self._tiles.flat == 2)[0])
     for combo in self.WINNING_COMBINATIONS:
         if combo.issubset(p2):
             self.winning_player = 2
             self.game_over()
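For context, WINNING_COMBINATIONS is a collection of index sets over the flattened board; a hypothetical definition for a 3x3 board in row-major order (not taken from the project) might be:

WINNING_COMBINATIONS = [
    {0, 1, 2}, {3, 4, 5}, {6, 7, 8},    # rows
    {0, 3, 6}, {1, 4, 7}, {2, 5, 8},    # columns
    {0, 4, 8}, {2, 4, 6},               # diagonals
]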
Example #4
File: ttt.py Project: vhalis/2048
 def normalize_board(self, player_to_move, reshape=False):
     x = self._tiles.flat
     tile_copy = np_where((x == 0) | (x == player_to_move), x, -1)
     if reshape:
         # np_where over the flat iterator yields a 1-D array; restore the board shape here
         return tile_copy.reshape(self._tiles.shape)
     else:
         return tile_copy.ravel()
Example #5
 def rle_encode(self, img: np_ndarray):
     '''
     img: numpy array, 1 - mask, 0 - background
     Returns the run-length encoding as a space-separated string of 1-indexed (start, length) pairs
     '''
     pixels = img.flatten()
     pixels = np_concatenate([[0], pixels, [0]])
     runs = np_where(pixels[1:] != pixels[:-1])[0] + 1
     runs[1::2] -= runs[::2]
     return ' '.join(str(x) for x in runs)
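A matching decoder makes it easy to round-trip-test the encoder above; a minimal sketch (rle_decode is not part of the original example):

import numpy as np

def rle_decode(rle: str, shape):
    # Rebuild the binary mask from "start length start length ..." (starts are 1-indexed).
    mask = np.zeros(int(np.prod(shape)), dtype=np.uint8)
    nums = [int(x) for x in rle.split()]
    for start, length in zip(nums[::2], nums[1::2]):
        mask[start - 1:start - 1 + length] = 1
    return mask.reshape(shape)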
Example #6
def process_df(df):
    grouping_columns = ["event_cd", "start_bases_cd"]
    target_column = "end_bases_cd"

    ana_df = (df.query("inn_ct<=8").assign(
        event_cd=lambda row: np_where(row.event_cd == 2, 3, row.event_cd)
    ).assign(
        event_cd=lambda row: np_where(row.event_cd == 16, 14, row.event_cd)))

    count_df = (ana_df.groupby(grouping_columns + [target_column]).size().
                to_frame().reset_index()).rename({0: "event_ct"}, axis=1)

    return count_df.assign(
        start_first_base=lambda row: (row.start_bases_cd & 1).astype("bool"),
        start_second_base=lambda row: (row.start_bases_cd & 2).astype("bool"),
        start_third_base=lambda row: (row.start_bases_cd & 4).astype("bool"),
        end_first_base=lambda row: (row.end_bases_cd & 1).astype("bool"),
        end_second_base=lambda row: (row.end_bases_cd & 2).astype("bool"),
        end_third_base=lambda row: (row.end_bases_cd & 4).astype("bool"),
    )
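The & 1 / & 2 / & 4 assignments appear to decode a base-state bitmask in which bits 0, 1, and 2 stand for first, second, and third base; a quick standalone check under that assumption:

import numpy as np

start_bases_cd = np.array([0, 1, 3, 7])            # empty, first only, first+second, bases loaded
on_first = (start_bases_cd & 1).astype("bool")     # [False, True, True, True]
on_second = (start_bases_cd & 2).astype("bool")    # [False, False, True, True]
on_third = (start_bases_cd & 4).astype("bool")     # [False, False, False, True]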
Example #7
def process_df(df):
    batting_events_df = (
        df.query("event_cd>=20 or event_cd==14 or event_cd==16 or event_cd<=3")
        .assign(event_cd=lambda row: np_where(row.event_cd == 2, 3, row.event_cd))
        .assign(event_cd=lambda row: np_where(row.event_cd == 16, 14, row.event_cd)))

    agg_df = batting_events_df.groupby("bat_lineup_id").agg({"n": "sum"})

    batting_event_prob_df = (
        batting_events_df.merge(agg_df, on="bat_lineup_id")
        .assign(z=lambda row: row.n_x / row.n_y)
        .sort_values(["bat_lineup_id", "event_cd"])
        .groupby(["bat_lineup_id", "event_cd"])
        .sum()
        .query("event_cd>3"))

    b = batting_event_prob_df.reset_index().pivot(index="bat_lineup_id",
                                                  columns="event_cd",
                                                  values="z")
    b.columns = ["base_on_balls", "single", "double", "triple", "home_run"]
    return b
Example #8
    def mask_to_bbox(self, mask: np_ndarray):
        # Step 1 - Find the coordinates where the mask has a value different from 0.
        # The result is two arrays, arr_x = [x1, x2, ..., xn] and arr_y = [y1, y2, ..., yn],
        # matched pairwise: (x1, y1), (x2, y2), ..., (xn, yn)
        arr_y, arr_x = np_where(mask != 0)

        # Step 2 - Find the minima and the maxima of the 2 arrays
        y_min = np_amin(arr_y)
        y_max = np_amax(arr_y)
        x_min = np_amin(arr_x)
        x_max = np_amax(arr_x)
        return y_min, y_max, x_min, x_max
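A quick usage sketch of the same bounding-box pattern with a toy mask (values illustrative):

import numpy as np

mask = np.zeros((6, 8), dtype=np.uint8)
mask[2:5, 3:7] = 1                      # rectangular blob
arr_y, arr_x = np.where(mask != 0)
bbox = (np.amin(arr_y), np.amax(arr_y), np.amin(arr_x), np.amax(arr_x))
print(bbox)                             # (2, 4, 3, 6)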
Example #9
def remove_same_rows(n_pos, X_neg, X_pos, neg_comp_list):
    # Removing negative feature rows that exactly match any row in positives
    cout = 0
    for ind in range(n_pos):
        matching_inds = np_where((X_neg == X_pos[ind]).all(axis=1))
        X_neg = np_delete(X_neg, matching_inds, axis=0)
        for index in sorted(list(matching_inds[0]), reverse=True):
            cout += 1
            del neg_comp_list[index]
    print("No. of negs removed due to same feature vector = ", cout)
    n_neg = len(X_neg)
    return X_neg, neg_comp_list, n_neg
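The row-matching step on its own is just a broadcasted equality check; a small sketch with toy arrays:

import numpy as np

X_pos = np.array([[1, 0, 1]])
X_neg = np.array([[1, 0, 1], [0, 1, 1], [1, 0, 1]])
matching_inds = np.where((X_neg == X_pos[0]).all(axis=1))[0]   # array([0, 2])
X_neg = np.delete(X_neg, matching_inds, axis=0)                # only [0, 1, 1] remains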
Example #10
def svds(a, k=6, tol=0):
    if a.ndim != 2:
        raise ValueError("expect a matrix")
    n, p = a.shape
    comp_right = False
    if n > p:
        comp_right = True

    x_prod = None
    if (issparse(a) and comp_right):
        size = p
        multiply = LinearOperator(matvec=lambda v: a.T.dot(a.dot(v)),
                shape=(p, p))
    elif (issparse(a)):
        size = n
        multiply = LinearOperator(matvec=lambda v: a.dot(a.T.dot(v)),
                shape=(n, n))
    elif (comp_right):
        size = p
        x_prod = np_array(a.T.dot(a))
        multiply = LinearOperator(matvec=lambda v: x_prod.dot(v),
                shape=(p, p))
    else:
        size = n
        x_prod = np_array(a.dot(a.T))
        multiply = LinearOperator(matvec=lambda v: x_prod.dot(v),
                shape=(n, n))

    if (x_prod is not None and (size < 100 or k >= size / 2)):
        x_prod = np_array(x_prod)
        vals, vecs = linalg.eigh(x_prod)
        vals = vals[::-1][0:k]
        vecs = vecs[:,::-1][:,0:k]
    else:
        vals, vecs = eigsh(multiply, k=k, tol=tol)

    def rescale(x):
        x.set_cached(True)
        scal = fp_sqrt(np_sum(x * x, axis=0))
        return x.mapply_rows(scal, fp_bop_div)

    if (comp_right):
        v = fp_array(vecs)
        u = rescale(a.dot(vecs))
    else:
        u = fp_array(vecs)
        v = rescale(a.T.dot(vecs))
    s = np_where(vals > 0, np_sqrt(vals), 0)
    return u, s, v.T
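The comp_right branch is the usual Gram-matrix shortcut: eigendecompose the smaller of A^T A or A A^T, take square roots of the eigenvalues as singular values, and rescale A v (or A^T u) to recover the other set of singular vectors. A plain-NumPy sketch of the same idea, independent of the FlashPy types (fp_array, mapply_rows) used above:

import numpy as np

rng = np.random.default_rng(0)
a = rng.standard_normal((1000, 20))       # n > p, so work with the p x p Gram matrix
vals, v = np.linalg.eigh(a.T @ a)         # eigenvalues in ascending order
vals, v = vals[::-1], v[:, ::-1]          # reorder to descending
s = np.where(vals > 0, np.sqrt(vals), 0)  # singular values
u = (a @ v) / s                           # columns of a @ v have norm s, so this normalizes them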
Example #11
    def filter(self, **kwargs):

        # if empty then return self (already empty)
        if self.channel.size == 0:
            return self

        HRW_new = deepcopy(self)

        for key_filter in [
                'min_correlation', 'min_conf_nwp', 'min_conf_no_nwp',
                'cloud_type', 'level'
        ]:
            if key_filter in kwargs.keys():

                # if the argument given is None or an "all" keyword then skip this filter
                if kwargs[key_filter] in (None, 'all', 'ALL', 'A'):
                    continue

                n1 = str(HRW_new.channel.size)

                if key_filter == 'min_correlation':
                    inds = np_where(HRW_new.correlation > kwargs[key_filter])
                elif key_filter == 'min_conf_nwp':
                    inds = np_where(HRW_new.conf_nwp > kwargs[key_filter])
                elif key_filter == 'min_conf_no_nwp':
                    inds = np_where(HRW_new.conf_no_nwp > kwargs[key_filter])
                elif key_filter == 'cloud_type':
                    mask = np_in1d(HRW_new.cloud_type, kwargs[key_filter])
                    inds = np_where(mask)[0]
                elif key_filter == 'level':
                    if kwargs[key_filter] == 'H':    # high level: < 440hPa like in the ISCCP
                        inds = np_where(HRW_new.pressure < 44000)
                    elif kwargs[key_filter] == 'M':  # mid level: 440hPa ... 680hPa like in the ISCCP
                        inds = np_where(np_logical_and(44000 < HRW_new.pressure,
                                                       HRW_new.pressure < 68000))
                    elif kwargs[key_filter] == 'L':  # low level: > 680hPa like in the ISCCP
                        inds = np_where(68000 < HRW_new.pressure)

                HRW_new.subset(inds)
                print "    filter for " + key_filter + " = ", kwargs[
                    key_filter], ' (' + n1 + '->' + str(
                        HRW_new.channel.size) + ')'

        return HRW_new
Example #12
    def deConvolve(self,G_w,noise_dT=3,noise_avg=3,fMax=2.4):
        self.reGrid(noise_dT=noise_dT,noise_avg=noise_avg)
        self.tPumpDeconv=np_arange(np_amin(self.tPump),np_amax(self.tPump),
                                   self.tTHz[1]-self.tTHz[0])
        loc=np_amin(np_where(self.f >= fMax))
        for i in range(self.tPumpSkew.size):
            self.dTSkewFFT[i,:loc]=self.dTSkewFFT[i,:loc]/G_w[:loc]
            self.avgSkewFFT[i,:loc]=self.avgSkewFFT[i,:loc]/G_w[:loc]

        self.dTskew=np_irfft(self.dTSkewFFT,axis=1)
        self.avgSkewFFT=np_irfft(self.avgSkewFFT,axis=1)

        self.dTdeconv=unSkew(self.tTHz,self.tPump,self.tPumpSkew,self.dTskew)
        self.avgDeconv=unSkew(self.tTHz,self.tPump
                              ,self.tPumpSkew,self.avgSkewFFT)
        self.refDeconv=self.avgDeconv-self.dTdeconv
        self.pumpDeconv=self.avgDeconv+self.dTdeconv

        self.refFFTdeconv=np_rfft(self.refDeconv,axis=1)
        self.pumpFFTdeconv=np_rfft(self.pumpDeconv,axis=1)
        self.transDeconv=self.pumpFFTdeconv/self.refFFTdeconv
                                
        return
Example #13
def sample(bins, time, value):
    """
    Given that value[i] was observed at time[i],
    group the observations into bins, i.e.,
    *(bins[j], bins[j+1], ...)*

    The value for bin j is the average of all value[k]
    with bins[j] <= time[k] < bins[j+1].

    __Arguments__
    bins: _np.array_
        Endpoints of the bins.
        For n bins it shall be of length n + 1.
    time: _np.array_
        Times at which the values are observed.
    value: _np.array_
        Values at those times.

    __Returns__
    x: _np.array_
        Right endpoints of all the bins.
    y: _np.array_
        Average value in each bin.
    """
    bin_idx = np_digitize(time, bins) - 1
    value_sums = np_zeros(shape=len(bins) - 1, dtype=np_float32)
    value_cnts = np_zeros(shape=len(bins) - 1, dtype=np_float32)
    np_add.at(value_sums, bin_idx, value)
    np_add.at(value_cnts, bin_idx, 1)
    # ensure the graph has no holes: forward-fill empty bins from the previous bin
    zeros = np_where(value_cnts == 0)[0]
    assert value_cnts[0] > 0
    for z in zeros:
        value_sums[z] = value_sums[z - 1]
        value_cnts[z] = value_cnts[z - 1]
    return bins[1:], value_sums / value_cnts
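A minimal usage sketch with toy observations (numbers illustrative):

import numpy as np

bins = np.array([0.0, 1.0, 2.0, 3.0])
time = np.array([0.1, 0.4, 1.5, 2.2, 2.9])
value = np.array([2.0, 4.0, 6.0, 1.0, 3.0])
x, y = sample(bins, time, value)
print(x)   # [1. 2. 3.]
print(y)   # [3. 6. 2.]  -> per-bin averages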
Example #14
    def filter(self, **kwargs):

        # if empty then return self (already empty)
        if self.channel.size == 0:
            return self

        HRW_new = deepcopy(self)

        for key_filter in ['min_correlation', 'min_conf_nwp', 'min_conf_no_nwp', 'cloud_type', 'level']:
            if key_filter in kwargs.keys():
                
                # if the argument given is None or an "all" keyword then skip this filter
                if kwargs[key_filter] in (None, 'all', 'ALL', 'A'):
                    continue

                n1 = str(HRW_new.channel.size)

                if key_filter == 'min_correlation':
                    inds = np_where(HRW_new.correlation > kwargs[key_filter])
                elif key_filter == 'min_conf_nwp':
                    inds = np_where(HRW_new.conf_nwp    > kwargs[key_filter])
                elif key_filter == 'min_conf_no_nwp':
                    inds = np_where(HRW_new.conf_no_nwp > kwargs[key_filter])
                elif key_filter == 'cloud_type':
                    mask = np_in1d(HRW_new.cloud_type, kwargs[key_filter]) 
                    inds = np_where(mask)[0]
                elif key_filter == 'level':
                    if kwargs[key_filter] == 'H': # high level: < 440hPa like in the ISCCP
                        inds = np_where(HRW_new.pressure < 44000 ) 
                    elif kwargs[key_filter] == 'M': # mid level: 440hPa ... 680hPa like in the ISCCP
                        inds = np_where( np_logical_and(44000 < HRW_new.pressure, HRW_new.pressure < 68000) ) 
                    elif kwargs[key_filter] == 'L': # low level: > 680hPa like in the ISCCP
                        inds = np_where(68000 < HRW_new.pressure)

                HRW_new.subset(inds)
                print "    filter for "+key_filter+" = ", kwargs[key_filter],' ('+n1+'->'+str(HRW_new.channel.size)+')'

        return HRW_new
Example #15
def sig_to_eps(f, sigma):
    sigma = np_conjugate(sigma[np_where(f != 0)])
    f = f[np_where(f != 0)]
    w = 2 * np_pi * f * 1e12
    return f, 1 + 1j * sigma / (w * epsilon0)
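A usage sketch; the 1e12 factor suggests f is given in THz, and epsilon0 is assumed to be the vacuum permittivity (e.g. scipy.constants.epsilon_0):

import numpy as np
from scipy.constants import epsilon_0 as epsilon0

f = np.array([0.0, 0.5, 1.0, 2.0])                      # THz; the zero entry gets dropped
sigma = np.array([0.0, 90 + 10j, 80 + 20j, 60 + 30j])   # toy complex conductivity, S/m
f_nz, eps = sig_to_eps(f, sigma)                        # eps = 1 + 1j * conj(sigma) / (w * epsilon0)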
Example #16
def generate_intermediate_result(division):
    students = Student.objects.filter(division=division).exclude(vacant=True)
    results = []
    # Temporary
    student = Student.objects.first()
    students = [student]
    #####
    for index, student in enumerate(students):
        ass = Assessment.objects.filter(student=student).values(
            "exam__name", "exam__subject__name", "marks", "note")
        df = pd.DataFrame(ass)
        df.columns = ["exam", "subject", "marks", "note"]
        df["note"] = df["note"].replace({"ABSENT": "AB", "BLANK": "BLK"})
        df["marks"] = np_where(df["marks"] == -1, df["note"], df["marks"])
        df = df.drop(["note"], axis=1)

        theory = df[df["exam"] == "final_theory"].drop("exam", axis=1)
        oral = df[df["exam"] == "final_oral"].drop("exam", axis=1)
        final = theory.merge(oral, on="subject", suffixes=["_theory", "_oral"])
        final["marks"] = final["marks_theory"].map(
            str) + " + " + final["marks_oral"].map(str)
        final = final.drop(["marks_theory", "marks_oral"], axis=1)
        final["exam"] = "final"

        # Remove final_theory, final_oral
        df = df.drop(df[df["exam"].str.startswith("final")].index)
        ready_for_tabulation = pd.concat([df, final])

        data = []
        for s in ready_for_tabulation["subject"].unique():
            row = [s]
            sub_data = ready_for_tabulation[ready_for_tabulation["subject"] ==
                                            s]
            for e in ["unit_one", "terminal", "unit_two", "final"]:
                row.append(sub_data[sub_data["exam"] == e].marks.values[0])
            data.append(row)

        df = pd.DataFrame(
            data, columns=["Subject", "Unit 1", "Terminal", "Unit 2",
                           "Final"]).set_index("Subject")

        df["Total"] = df.apply(get_sub_total, axis=1)
        df["%"] = ceil(df["Total"] / 2)
        df["%"] = df["%"].round(2)

        ld = student.identifier == "LD"
        sports = student.sports > 0
        if sports:
            sports_marks = student.sportsdata_set.aggregate(
                Max("extra_marks")).get("extra_marks__max")
        else:
            sports_marks = 0

        result = apply_grace_marks(df,
                                   ld=ld,
                                   sports=sports,
                                   sports_marks=sports_marks)

        df = result["df"]
        status = result["status"]
        sports_remain_marks = result["sports_remain_marks"]

        if status == "FAIL":
            df.drop("Grace Marks", axis=1, inplace=True)
        else:
            uni = df["Grace Marks"].unique()
            if len(uni) == 1 and uni[0] == 0:
                df.drop("Grace Marks", axis=1, inplace=True)

        df.index.name = None
        total_marks = int(ceil(df.Total / 2).sum())

        # Add EVS
        evs_marks = PT_EVS_Assessment.objects.filter(
            student=student,
            exam__subject="EVS").aggregate(Sum("marks"))["marks__sum"]
        df = df.append(pd.Series({}, name="EVS", dtype=int))
        df.loc["EVS", "%"] = evs_marks
        df = df.fillna("")

        total_marks += evs_marks

        exam_conducted_of_marks = (len(df) - 1) * 100
        exam_conducted_of_marks += 50

        if sports:
            marks = " ".join([str(total_marks), "+", str(sports_remain_marks)])
            final_perc = round(
                (total_marks + sports_remain_marks) / exam_conducted_of_marks,
                2)  # CHECK, Round UP or Round Down
        else:
            marks = total_marks
            final_perc = round(total_marks / exam_conducted_of_marks, 2)

        final_perc *= 100

        # PT Grade
        pt_marks = PT_EVS_Assessment.objects.filter(
            student=student,
            exam__subject="PT").aggregate(Sum("marks"))["marks__sum"]
        pt_grade = get_pt_grade(pt_marks)

        non_int_color_cols = ["Unit 1", "Terminal", "Unit 2", "Final"]
        subjects = list(df.index)
        subjects.remove("EVS")
        html = df.style.applymap(
            color_cell, subset=(subjects, non_int_color_cols)).applymap(
                color_sub_perc, subset=(subjects, ["%"])).set_table_attributes(
                    "class='dataframe mystyle'").set_precision(2).render()

        env = Environment(loader=FileSystemLoader('./result'))
        template = env.get_template("student_result.html")
        template_vars = {
            "title": "Result",
            "name": student.name,
            "roll": student.roll_num,
            "division": division,
            "marks": marks,
            "final_perc": final_perc,
            "result": html,
            "status": status,
            "sports_remain_marks": sports_remain_marks,
            "sports": sports,
            "ld": ld,
            "pt_grade": pt_grade
        }

        html_out = template.render(template_vars)
        fp = "./student_results/{}/roll_{}.html".format(
            division, str(student.roll_num))
        default_storage.delete(fp)
        f = tempfile.TemporaryFile(mode="w+")
        f.write(html_out)
        default_storage.save(fp, f)
        print(fp)

        template_vars["id"] = index

        results.append(template_vars)

    env = Environment(loader=FileSystemLoader('./result'))
    template = env.get_template("inter_result.html")
    template_vars = {
        "title": "College Result",
        "division": division,
        "results": results
    }
    html_out = template.render(template_vars)
    file_path = "./results/intermediate_result.html"
    default_storage.delete(file_path)
    f = tempfile.TemporaryFile(mode="w+")
    f.write(html_out)

    file_name = default_storage.save(file_path, f)
    return file_path
Example #17
    def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
        """Calculate statistics for genomes.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of clusters to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature.
        no_coverage : boolean
            Flag indicating if coverage information should be used during clustering.
        no_pca : boolean
            Flag indicating if PCA of genomic signature should be calculated.
        iterations : int
            Iterations of clustering to perform.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('')
        self.logger.info('  Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.iteritems():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                signature_matrix.append(stats.signature)
            else:
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in xrange(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of tetranucleotide signatures
        if K != 0:
            if not no_pca:
                self.logger.info('  Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info('    First %d PCs capture %.1f%% of the variance.' % (num_components, sum(variance[0:num_components]) * 100))
    
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('  Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            print '  Whitening data.'
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info('  Partitioning genome into %d clusters.' % num_clusters)

        bError = True
        while bError:
            try:
                bError = False
                _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
            except ClusterError:
                bError = True

        for k in range(num_clusters):
            self.logger.info('    Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = seqs.keys()[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()
Example #18
    def kmeans(self, scaffold_stats, num_clusters, num_components, K,
               no_coverage, no_pca, iterations, genome_file, output_dir):
        """Cluster genome with k-means.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of clusters to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature
        no_coverage : boolean
            Flag indicating if coverage information should be used during clustering.
        no_pca : boolean
            Flag indicating if PCA of genomic signature should be calculated.
        iterations : int
            Iterations of clustering to perform.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                signature_matrix.append(stats.signature)
            else:
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in range(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of signatures
        if K != 0:
            if not no_pca:
                self.logger.info('Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info(
                    'First {:,} PCs capture {:.1f}% of the variance.'.format(
                        num_components,
                        sum(variance[0:num_components]) * 100))

                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            self.logger.info('Whitening data.')
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info(
            'Partitioning genome into {:,} clusters.'.format(num_clusters))

        bError = True
        while bError:
            try:
                bError = False
                _centroids, labels = kmeans2(genome_stats,
                                             num_clusters,
                                             iterations,
                                             minit='points',
                                             missing='raise')
            except ClusterError:
                bError = True

        for k in range(num_clusters):
            self.logger.info('Placed {:,} sequences in cluster {:,}.'.format(
                sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            fout = open(
                os.path.join(output_dir,
                             genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = list(seqs.keys())[i]  # dict views are not indexable in Python 3
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()