Example #1
def run_greedy(kToSelect, parallel):
    # get starting set of data
    predictors = [([], -1e10)]
    # loop through predictors and at each step,
    # add one predictor that increases R2 the most
    # and calculate R2
    for k in range(kToSelect):
        logging.info(k)
        best_k_predictors = predictors[-1][0]

        predictor_list = list(set(all_predictors) - set(best_k_predictors))

        def greedy_helper(predictor):
            k_plus_1 = list(best_k_predictors + [predictor])
            x_train = X1[:, k_plus_1]
            return get_class_rate(x_train, y_cat)

        if parallel:
            r2 = Parallel(n_jobs=-1,
                          verbose=50)(delayed(greedy_helper)(predictor)
                                      for predictor in predictor_list)
        else:
            r2 = []
            for predictor in predictor_list:
                r2.append(greedy_helper(predictor))

        best_k_plus_1 = best_k_predictors + [predictor_list[np.argmax(r2)]]
        predictors.append((best_k_plus_1, np.max(r2)))
        logging.info("%s %s %s" %
                     (str(k), str(best_k_plus_1), str(np.max(r2))))
    return predictors
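A self-contained sketch of the same pattern — score every remaining candidate in parallel with delayed, then keep the argmax — using a toy least-squares R² in place of get_class_rate and synthetic data in place of X1/y_cat (all names below are illustrative stand-ins, not the original project's API):

import numpy as np
from joblib import Parallel, delayed

def greedy_select(score_fn, n_features, k_to_select, n_jobs=-1):
    # score_fn(columns) -> float; higher is better (stand-in for the R2 oracle)
    selected = []
    for _ in range(k_to_select):
        candidates = [j for j in range(n_features) if j not in selected]
        scores = Parallel(n_jobs=n_jobs, prefer="threads")(
            delayed(score_fn)(selected + [j]) for j in candidates)
        selected.append(candidates[int(np.argmax(scores))])
    return selected

# toy usage: pick the 3 columns most predictive of a synthetic target
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
y = X[:, 2] + 0.5 * X[:, 7] + 0.1 * rng.normal(size=200)

def toy_score(cols):
    beta, *_ = np.linalg.lstsq(X[:, cols], y, rcond=None)
    resid = y - X[:, cols] @ beta
    return 1.0 - resid.var() / y.var()   # R^2 of an intercept-free fit

print(greedy_select(toy_score, X.shape[1], 3))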
Example #2
def pairwise_jaccard_similarity(set_per_row):
#    print (set_per_row)
    
    results = Parallel(n_jobs=-1, backend='threading')(
        delayed(jaccard_similarity)(i, j, set_per_row)
        for i in range(set_per_row.shape[0])
        for j in range(set_per_row.shape[0]))
    
    results = np.array(results)
    
    return results.reshape((set_per_row.shape[0],set_per_row.shape[0]))
Example #3
    def transform(self, X):
        """Transform a count matrix to ..
        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts

        Returns
        -------
        vectors : sparse matrix, [n_samples, n_features]
        sample's time-to-transform : sparse matrix, [n_samples] 
        """
        if self.selection_size is None:
            raise ValueError('must be fitted!')
        else:
            approach_all_finger = np.empty(
                (X.shape[0], self.selection_size * self.n_permutations),
                dtype=int)
            approach_all_time = np.zeros((approach_all_finger.shape))
            r = np.array(
                Parallel(
                    n_jobs=self.n_jobs, backend="multiprocessing",
                    verbose=0)(delayed(self.measure_selection_function)(
                        X[i, :].T, self.items_to_permute, self.n_permutations,
                        self.selection_size, self.selection_function)
                               for i in range(X.shape[0])))

            approach_all_finger = np.vstack(r[:, 0])
            approach_all_time = np.vstack(r[:, 1])

            return (approach_all_finger, approach_all_time)
Example #4
    def _find_best_w(self, x, y, random_state):
        xa = x[y == self.a]
        xb = x[y == self.b]

        points = [
            self.__get_random_point_on_sphere(x.shape[1], random_state)
            for _ in range(self.n_iters)
        ]
        if self.use_svc_weights:
            points.append(
                SVC(kernel='linear',
                    C=1,
                    random_state=random_state,
                    class_weight='balanced').fit(x, y).coef_[0])

        rets = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(maximize_entropy)(xa, xb, point, self.method,
                                      self.on_sphere, self.base_objective)
            for point in points)

        best_i = 0
        for i in range(len(points)):
            if self.verbose:
                print("Entropy with gamma {} starting from point[{}] = {}".format(
                    self.base_objective.gamma, i, -rets[i]["fun"]))
            if rets[i]["fun"] < rets[best_i]["fun"]:
                best_i = i
        return rets[best_i]["x"]
Example #5
def run_greedy(kToSelect, parallel=True):
    # get starting set of data
    predictors = [([], 0, 0)]
    # loop through predictors and at each step,
    # add one predictor that increases obj function the most
    # and calculate obj function
    for k in range(kToSelect):

        best_k_predictors = predictors[-1][0]
        r2 = []

        predictor_list = list(set(all_predictors) - set(best_k_predictors))
        if not parallel:
            for predictor in predictor_list:
                k_plus_1 = list(best_k_predictors + [predictor])

                if 'bayes' in dat:
                    r2.append(obj_fun(k_plus_1))
                else:
                    x_train = X[:, k_plus_1]
                    r2.append(obj_fun(x_train, y_cat))

        else:
            r2 = Parallel(n_jobs=-1)(
                delayed(evaluate_oracle)(best_k_predictors, a)
                for a in predictor_list)

        best_k_plus_1 = best_k_predictors + [predictor_list[np.argmax(r2)]]

        end = time.time()
        timePassed = end - start

        predictors.append((best_k_plus_1, np.max(r2), timePassed))

    return predictors
Example #6
def amortizedFilter(k, r, ep, OPT, X, debug=True, parallel=False):

    m = 10
    S = []
    y_adap = []
    for i in range(r):
        T = []
        logging.info('r=' + str(i))

        fS = oracle(S)
        fST = oracle(union(S, T))

        while ((fST - fS) < (ep / 20) * (OPT - fS)) and (len(union(S, T)) < k):

            # FILTER Step
            # this only changes X
            vs = estimateSet(X, union(S, T), m)
            while (vs < (1 - ep) * (OPT - fST) / r):
                if debug:
                    logging.info('inner while loop')

                # get marginal contribution
                if parallel:
                    marg_a = Parallel(n_jobs=-1, verbose=50)(
                        delayed(estimateMarginal)(X, union(S, T), a, m)
                        for a in X)
                else:
                    marg_a = [
                        estimateMarginal(X, union(S, T), a, m) for a in X
                    ]

                # Filter!
                Xnew = [
                    X[idx] for idx, el in enumerate(marg_a)
                    if el >= (1 + ep / 2) * (1 - ep) * (OPT - fST) / k
                ]
                X = Xnew

                # estimate if filtered set is good enough
                vs = estimateSet(X, union(S, T), m)
                if debug:
                    logging.info('Elements remaining: ' + str(len(X)))
                    logging.info('Check')
                    logging.info(vs < (1 - ep) * (OPT - fST) / r)

            R = randomSample(X, k / r)
            T = union(T, R)

            # T changes but S doesn't
            fST = oracle(union(S, T))

            if debug:
                logging.info('Outer Loop')
                logging.info(fST)

        S = union(S, T)
        fS = oracle(S)
        y_adap.append((len(S), fS))
    return y_adap
Example #7
def extract_all_features(data, n_jobs=-1):
    """
    Function extracting all available features to a numpy feature array.
    :param data: iterable containing domain name strings
    :param n_jobs: number of joblib workers (-1 uses all CPUs)
    :return: feature matrix as numpy array
    """
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    feature_matrix = parallel(
        delayed(extract_features)(d, ALL_FEATURES) for d in data)
    return np.array(feature_matrix)
Example #8
def search_light(X,
                 y,
                 estimator,
                 A,
                 score_func=None,
                 cv=None,
                 n_jobs=-1,
                 verbose=0):
    """Function for computing a search_light

    Parameters
    ----------
    X: array-like of shape at least 2D
        The data to fit.

    y: array-like
        The target variable to try to predict.

    estimator: estimator object implementing 'fit'
        The object to use to fit the data

    A : sparse matrix.
        adjacency matrix. Defines for each sample the neighboring samples
        following a given structure of the data.

    score_func: callable, optional
        callable taking as arguments the fitted estimator, the
        test data (X_test) and the test target (y_test) if y is
        not None.

    cv: cross-validation generator, optional
        A cross-validation generator. If None, a 3-fold cross
        validation is used or 3-fold stratified cross-validation
        when y is supplied.

    n_jobs: integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose: integer, optional
        The verbosity level. Default is 0.

    Returns
    -------
    scores: array-like of shape (number of rows in A)
        search_light scores
    """
    scores = np.zeros(len(A.rows), dtype=float)
    group_iter = GroupIterator(A.shape[0], n_jobs)
    scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_group_iter_search_light)(
            list_i, A.rows[list_i], estimator, X, y,
            A.shape[0], score_func, cv, verbose)
        for list_i in group_iter)
    return np.concatenate(scores)
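search_light hands whole blocks of rows to each worker rather than issuing one delayed call per row. A minimal, self-contained sketch of that chunking pattern, with a plain scoring function standing in for the nilearn helpers GroupIterator and _group_iter_search_light (which are assumed to exist elsewhere):

import numpy as np
from joblib import Parallel, delayed

def _score_block(row_indices, X):
    # stand-in for _group_iter_search_light: score every row in one block
    return np.array([X[i].mean() for i in row_indices])

def blocked_scores(X, n_jobs=4):
    # one contiguous block of row indices per job, like GroupIterator
    blocks = np.array_split(np.arange(X.shape[0]), n_jobs)
    scores = Parallel(n_jobs=n_jobs)(
        delayed(_score_block)(block, X) for block in blocks)
    return np.concatenate(scores)

X = np.random.rand(10, 5)
print(blocked_scores(X).shape)   # (10,)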
Example #9
    def fit(self, X, y=None, sample_weight=None, exposure=None):
        # For later
        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch,
                        max_nbytes=None)
        
        # Extract arguments
        fit_args = self._process_args(X=X, y=y, sample_weight=sample_weight,
                                      exposure=exposure)
        
        # Sort out cv parameters
        if self.cv == 1:
            cv = no_cv(X=X, y=y)
        else:
            if hasattr(self.cv, 'split'):
                cv_args = dict(X=X)
                if y is not None:
                    cv_args['y'] = np.ravel(y)
                cv = self.cv.split(**cv_args)
            else:
                cv_args = dict(X=X)
                if y is not None:
                    cv_args['y'] = shrinkd(1,np.asarray(y))
                cv = check_cv(self.cv, classifier=is_classifier(self.estimator), **cv_args)
                
        # Do the cross validation fits
#         print(valmap(lambda x: x.shape, fit_args))
#         print('num_folds = %d' % self.cv.get_n_splits(X=X))
        cv_fits = parallel(delayed(_fit_and_predict)(clone(self.estimator), fit_args, train, test, self.verbose) for train, test in cv)
        
        # Combine predictions from cv fits
        prediction = np.empty_like(y) if y is not None else np.empty(shape=X.shape[0])
        for fit in cv_fits:
            safe_assign_subset(prediction, fit[2], fit[1])
        
        # Store cross validation models
        self.cv_estimators_ = [fit[0] for fit in cv_fits]
        self.cv_indices_ = [fit[2] for fit in cv_fits]
        self.cv_predictions_ = prediction
        
        # If a metric was provided, compute the score
        if self.metric is not None:
            metric_args = {}
            if 'sample_weight' in fit_args:
                metric_args['sample_weight'] = fit_args['sample_weight']
            if 'exposure' in fit_args:
                metric_args['exposure'] = fit_args['exposure']
            self.score_ = safer_call(self.metric, y, self.cv_predictions_, **metric_args)
        
        # Fit on entire data set
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(**fit_args)
        return self
Example #10
    def fit_linear_nnls(self, X, y, sample_weight=None):
        if not isinstance(self.model, LinearRegression):
            raise ValueError(
                'Model is not LinearRegression; cannot call fit for linear NNLS'
            )
        n_jobs_ = self.model.n_jobs
        self.model.coef_ = []
        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True,
                         multi_output=True)

        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        X, y, X_offset, y_offset, X_scale = self.model._preprocess_data(
            X,
            y,
            fit_intercept=self.model.fit_intercept,
            normalize=self.model.normalize,
            copy=self.model.copy_X,
            sample_weight=sample_weight)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        if sp.issparse(X):
            if y.ndim < 2:
                # out = sparse_lsqr(X, y)
                out = lsq_linear(X, y, bounds=(0, np.inf))
                self.model.coef_ = out.x
                self.model._residues = out.cost  # 0.5 * ||Ax - b||**2
            else:
                # sparse_lstsq cannot handle y with shape (M, K)
                outs = Parallel(n_jobs=n_jobs_)(
                    delayed(lsq_linear)(X, y[:, j].ravel())
                    for j in range(y.shape[1]))
                self.model.coef_ = np.vstack([out.x for out in outs])
                self.model._residues = np.vstack([out.cost for out in outs])
        else:
            # self.model.coef_, self.model.cost_, self.model.fun_, self.model.optimality_, self.model.active_mask_,
            # self.model.nit_, self.model.status_, self.model.message_, self.model.success_\
            out = lsq_linear(X, y, bounds=(0, np.inf))
            self.model.coef_ = out.x
            self.model.coef_ = self.model.coef_.T

        if y.ndim == 1:
            self.model.coef_ = np.ravel(self.model.coef_)
        self.model._set_intercept(X_offset, y_offset, X_scale)
        return self.model
Example #11
    def fit(self, X, y=None, sample_weight=None, exposure=None):
        if self.cv == 1:
            cv = no_cv(X=X, y=y)
        else:
            if hasattr(self.cv, 'split'):
                cv = self.cv.split(X, y)
            else:
                cv = check_cv(self.cv, X=X, y=y, classifier=is_classifier(self.calibrator))
        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch)
        
        # Fit the estimator on each train set and predict with it on each test set
        fit_args = {'X': X}
        if y is not None:
            fit_args['y'] = y
        if self.est_weight and sample_weight is not None:
            fit_args['sample_weight'] = sample_weight
        if self.est_exposure and exposure is not None:
            fit_args['exposure'] = exposure
        
        # Do the cross validation fits
        cv_fits = parallel(delayed(_fit_and_predict)(clone(self.estimator), fit_args, train, test) for train, test in cv)
        
        # Combine predictions from cv fits
        prediction = np.empty_like(y)
        for fit in cv_fits:
            safe_assign_subset(prediction, fit[2], fit[1])
        
#         fit_predict_results = parallel(delayed(_fit_and_predict)(estimator=clone(self.estimator),
#                                        train=train, test=test, **fit_args) for train, test in cv)
#         
#         # Combine the predictions
#         prediction = np.empty_like(y)
#         for _, pred, _, test in zip(fit_predict_results, cv):
            
#         prediction = np.concatenate([pred for _, pred in cv_fits], axis=0)
        
        # Fit the calibrator on the predictions
        cal_args = {'X': prediction[:, None] if len(prediction.shape) == 1 else prediction, 
                    'y': y}
        if self.cal_weight and sample_weight is not None:
            cal_args['sample_weight'] = sample_weight
        if self.cal_exposure and exposure is not None:
            cal_args['exposure'] = exposure
        self.calibrator_ = clone(self.calibrator).fit(**cal_args)
        
        # Fit the estimator on the entire data set
        self.estimator_ = clone(self.estimator).fit(**fit_args)
        
        return self
Example #12
def classify_pcaps(path_to_pcaps_folder, clf: Clf, n_jobs=2):
    pcaps = []
    for (path, dirs, files) in walk(path_to_pcaps_folder):
        pcaps.extend(files)
        break

    pcaps = [path_to_pcaps_folder + '/' + pcap for pcap in pcaps if not pcap.endswith('.txt')]

    parallel = Parallel(n_jobs=n_jobs, verbose=1)

    parallel(
        delayed(_classify_pcap)(pcap, clf)
        for pcap in pcaps
        )
Example #13
def example_simple_function(self, active_process_path):
    """ simple function to calculate sqrt
  """
    active_process = self.portal_activities.unrestrictedTraverse(
        active_process_path)

    # Use CMFActivity as a backend for joblib
    with parallel_backend('CMFActivity', active_process=active_process):
        result = Parallel(n_jobs=2, pre_dispatch='all', timeout=30,
                          verbose=30)(delayed(sqrt)(i**2) for i in range(5))

    # Set result value and an id to the active result and post it
    result = ActiveResult(result=result)
    active_process.postResult(result)
    log("joblib activity result", result)
    return result
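The 'CMFActivity' backend above is registered by ERP5; with stock joblib the same pattern works with any built-in backend name. A minimal sketch using the standard 'threading' backend:

from math import sqrt
from joblib import Parallel, delayed, parallel_backend

# same call shape as above, but with a backend that ships with joblib
with parallel_backend('threading', n_jobs=2):
    result = Parallel(verbose=0)(delayed(sqrt)(i ** 2) for i in range(5))
print(result)   # [0.0, 1.0, 2.0, 3.0, 4.0]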
Example #14
File: melm.py Project: codeaudit/melm
    def _find_best_w(self, x, y, random_state):

        points = [self.__get_random_point_on_sphere(x.shape[1], random_state) for _ in range(self.n_starts)]
        if self.use_pca_weights:
            pca = decomposition.PCA(n_components=int(self.base_objective.k))
            pca.fit(x)
            points[-1] = pca.components_

        rets = Parallel(n_jobs=self.n_jobs)(
            delayed(maximize_entropy)(x, y, point, self.method, self.on_manifold, self.base_objective)
            for point in points)

        best_i = 0
        for i in range(len(points)):
            if rets[i]["fun"] < rets[best_i]["fun"]:
                best_i = i

        return rets[best_i]["x"].reshape(-1, self.base_objective.k).T
Example #15
def parallel_predict(estimator,
                     X,
                     n_jobs=1,
                     method='predict',
                     batches_per_job=3):
    """
    Run sklearn classifier prediction in parallel.
    """
    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs,
                     1)  # XXX: this should really be done by joblib
    n_batches = batches_per_job * n_jobs
    n_samples = len(X)
    batch_size = int(np.ceil(n_samples / n_batches))
    parallel = Parallel(n_jobs=n_jobs, backend="threading")
    results = parallel(
        delayed(_predict)(estimator, X, method, i, i + batch_size)
        for i in range(0, n_samples, batch_size))
    return np.concatenate(results)
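A stripped-down version of the same batching idea, with a dummy estimator standing in for the scikit-learn model and a local helper in place of _predict (both are illustrative, not the original project's code):

import numpy as np
from joblib import Parallel, delayed

class DummyEstimator:
    def predict(self, X):
        return np.asarray(X).sum(axis=1)

def _predict_batch(estimator, X, method, start, stop):
    # call estimator.predict / predict_proba / ... on one slice of X
    return getattr(estimator, method)(X[start:stop])

def parallel_predict_sketch(estimator, X, n_jobs=2, method='predict',
                            batches_per_job=3):
    n_samples = len(X)
    batch_size = int(np.ceil(n_samples / (batches_per_job * n_jobs)))
    results = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(_predict_batch)(estimator, X, method, i, i + batch_size)
        for i in range(0, n_samples, batch_size))
    return np.concatenate(results)

X = np.arange(20).reshape(10, 2)
print(parallel_predict_sketch(DummyEstimator(), X))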
Example #16
def generate_mappings_for_genes(genes_of_interest, batch_size, use_parallel):
    # filter gene names already present in the database
    genes_of_interest = filter_gene_names_present_in_database(
        genes_of_interest)

    # Create batches
    genes_of_interest_batches = [
        genes_of_interest[i:i + batch_size]
        for i in range(0, len(genes_of_interest), batch_size)
    ]
    n_batches = len(genes_of_interest_batches)
    n_genes = len(genes_of_interest)

    # Annotate the genes in batches
    _log.info("Starting the mapping of batched analysis of '" + str(n_genes) +
              "' over '" + str(n_batches) + "' batches")
    succeeded_genes = 0
    for batch_counter, gene_batch in enumerate(genes_of_interest_batches):
        _log.info("Starting batch '" + str(batch_counter + 1) + "' out of '" +
                  str(n_batches) + "', with '" + str(len(gene_batch)) +
                  "' genes")

        gene_mappings = []
        if use_parallel:
            gene_mappings = Parallel(
                n_jobs=CalculateNumberOfActiveThreads(batch_size))(
                    delayed(generate_gene_to_swissprot_mapping)(gene)
                    for gene in gene_batch)
        else:
            gene_mappings = [
                generate_gene_to_swissprot_mapping(gene) for gene in gene_batch
            ]

        # add the batches to the database
        for gene_mapping in gene_mappings:
            add_gene_mapping_to_database(gene_mapping)
            succeeded_genes += 1

        _log.info("Finished batch '" + str(batch_counter + 1) + "' out of '" +
                  str(n_batches) + "'")
    _log.info("Finished the mapping of batched analysis of '" + str(n_genes) +
              "' over '" + str(n_batches) + "' batches, resulting in '" +
              str(succeeded_genes) + "' successful gene mappings")
Example #17
File: pairwise.py Project: demonSong/DML
def _parallel_pairwise(X, Y, func, n_jobs, **kwds):

    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)

    if Y is None:
        Y = X

    if n_jobs == 1:
        # Special case to avoid picklability checks in delayed
        return func(X, Y, **kwds)

    # TODO: in some cases, backend='threading' may be appropriate
    fd = delayed(func)
    ret = Parallel(n_jobs=n_jobs,
                   verbose=0)(fd(X, Y[s], **kwds)
                              for s in gen_even_slices(Y.shape[0], n_jobs))

    return np.hstack(ret)
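The slicing helper used above is available from scikit-learn, so the column-wise split can be sketched independently of the surrounding metric-learning code (pairwise_fn below is a hypothetical stand-in for func):

import numpy as np
from joblib import Parallel, delayed
from sklearn.utils import gen_even_slices

def pairwise_fn(X, Y):
    # toy pairwise computation: squared Euclidean distances
    return ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)

X = np.random.rand(6, 3)
Y = np.random.rand(8, 3)
n_jobs = 2
# each job handles an even slice of Y's rows; results are stacked column-wise
ret = Parallel(n_jobs=n_jobs)(
    delayed(pairwise_fn)(X, Y[s]) for s in gen_even_slices(Y.shape[0], n_jobs))
print(np.hstack(ret).shape)   # (6, 8)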
Example #18
    def predict_proba(self, X, π=40):
        """For each x in X, provide vector of probs for each class.
            :param X: data, a sequence of URLs 
            :param π: int, threshold for parallelizing. Below this don't bother.
        
        URLs: JPL7 model assumes`get_html(url)` will retrieve HTML as required.
        
        Parallel logic modified from QingKaiKong. Also viewed pomegranate and scikit-issues.
            * http://qingkaikong.blogspot.com/2016/12/python-parallel-method-in-class.html
            * https://github.com/jmschrei/pomegranate/blob/master/pomegranate/parallel.pyx
            * https://github.com/scikit-learn/scikit-learn/issues/7448
        
        """
        # check_is_fitted(self, ['X_', 'y_'])
        # X = check_array(X)
        if isinstance(X, str):
            raise AttributeError(
                "predict_proba: X must be array-like, not string!")

        n, n_jobs = len(X), self.n_jobs
        if n < π:
            n_jobs = 1  # Not worth the overhead to parallelize
            batches = [X]  # score the whole input as a single batch
        else:
            starts = [i * n // n_jobs for i in range(n_jobs)]
            ends = starts[1:] + [n]
            batches = [X[start:end] for start, end in zip(starts, ends)]

        t0 = arrow.now()
        if n_jobs > 1:
            score = delayed(batch_score_urls)
            with Parallel(n_jobs=n_jobs) as parallel:
                results = parallel(score(batch, self) for batch in batches)
        else:
            results = (batch_score_urls(batch, self) for batch in batches)
        results = np.concatenate(
            [np.array([row for row in batch]) for batch in results])
        dt = arrow.now() - t0

        logging.info('TIMING: n_jobs = %d, t = %s, dt = **%3.3fs**' %
                     (n_jobs, t0.format('HH:mm:ss'), dt.total_seconds()))
        return results
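The start/end bookkeeping above generalises to any sequence. A self-contained sketch of the same contiguous-batch split and concatenation, with a stand-in scoring function instead of batch_score_urls and hypothetical URLs as input:

import numpy as np
from joblib import Parallel, delayed

def score_batch(batch):
    # stand-in for batch_score_urls: one row of "probabilities" per item
    return np.array([[len(item) % 5 / 5.0, 1 - len(item) % 5 / 5.0]
                     for item in batch])

urls = [f"http://example.com/{i}" for i in range(100)]   # hypothetical inputs
n, n_jobs = len(urls), 4
starts = [i * n // n_jobs for i in range(n_jobs)]
ends = starts[1:] + [n]
batches = [urls[start:end] for start, end in zip(starts, ends)]

with Parallel(n_jobs=n_jobs) as parallel:
    results = parallel(delayed(score_batch)(batch) for batch in batches)
print(np.concatenate(results).shape)   # (100, 2)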
Example #19
    def fit(self, X, y_clf, y_regression):
        """
        Fit the multiclass model.

        Parameters
        ----------
        X : numpy array of shape [n_samples,n_features]
            Training data
        y_clf : numpy array of shape [n_samples]
            Target classes for classification model
        y_regression: numpy array of shape [n_samples]
            Target values for regression model 
            
        Returns
        -------
        self : returns an instance of self.
        """

        X = safe_asarray(X)
        y_clf = np.asarray(y_clf)
        y_regression = np.asarray(y_regression)

        self.clf_model = self.clf.fit(X, y_clf)

        classes = set(y_clf)
        regr = self.regr

        def _generator():
            for class_ in classes:
                examples = y_clf == class_
                yield class_, X[examples], y_regression[examples], regr

        out = Parallel(self.n_jobs, self.verbose, self.pre_dispatch)(\
                delayed(_fit_helper)(*params) for params in _generator())

        self.regression_models = {}
        for class_, regr_model in out:
            self.regression_models[class_] = regr_model

        return self
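The fit above trains one regressor per class in parallel, keyed by class label. A minimal sketch of that split-by-class pattern, with scikit-learn's LinearRegression standing in for regr and a local helper in place of _fit_helper (both illustrative):

import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

def _fit_one_class(class_, X_class, y_class, regr):
    # stand-in for _fit_helper: fit a fresh regressor on one class's rows
    return class_, clone(regr).fit(X_class, y_class)

rng = np.random.default_rng(0)
X = rng.normal(size=(120, 4))
y_clf = rng.integers(0, 3, size=120)                 # class labels 0, 1, 2
y_regression = X @ [1.0, 0.0, -1.0, 2.0] + y_clf

out = Parallel(n_jobs=2)(
    delayed(_fit_one_class)(c, X[y_clf == c], y_regression[y_clf == c],
                            LinearRegression())
    for c in np.unique(y_clf))
regression_models = dict(out)
print(len(regression_models))   # 3, one model per class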
Example #20
    def predict_all_plain(self, domains, n_jobs=-1):
        """
        Predicts the label of each domain using all classifiers present
        :param domains: iterable containing domains as str
        :return: dictionary containing results (dga, svm/rf):label
        """
        feature_matrix = extract_all_features(domains)

        parallel = Parallel(n_jobs=n_jobs, verbose=1)

        res = parallel(
            delayed(_predict)(c, feature_matrix)
            for c in self.clfs
        )
        # TODO
        merged = {}
        for d in res:
            keys = list(d.keys())
            if keys[0] in merged:
                merged[keys[0]] += d[keys[0]]
            else:
                merged.update(d)
        return merged
Example #21
    def predict(self, X, return_class_prediction=False):
        """
        Predict using the multiclass regression model

        Parameters
        ----------
        X : numpy array of shape [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]
            Returns predicted values.
        """

        X = safe_asarray(X)
        y_clf_predicted = np.asarray(self.clf_model.predict(X))
        classes = set(y_clf_predicted)

        def _generator():
            for class_ in classes:
                examples = y_clf_predicted == class_
                yield examples, X[examples], self.regression_models[class_]

        out = Parallel(self.n_jobs, self.verbose, self.pre_dispatch)(\
                delayed(_predict_helper)(*params) for params in _generator())

        y_regr_predicted = None
        for examples, predicted in out:
            if y_regr_predicted is None:
                y_regr_predicted = np.zeros(X.shape[0], predicted.dtype)
            y_regr_predicted[examples] = predicted

        if return_class_prediction:
            return y_clf_predicted, y_regr_predicted
        else:
            return y_regr_predicted
Example #22
def getDomanListFeature(domain_list):
    parallel = Parallel(n_jobs=-1, verbose=1)
    feature_matrix = parallel(
        delayed(getFeature)(d, datetime.datetime.strptime(
            '20180507', "%Y%m%d")) for d in domain_list)
    return feature_matrix
Example #23
def load_sample_as_ir_task(sample_threshold=100, language_filter="ALL"):
    documents, queries, targets_pairs, source_dataframe, susp_dataframe, dataset_encoding = _load_as_ir_task_without_content(
        allow_queries_without_relevants=True, language_filter=language_filter)
    sample_targets_pairs = targets_pairs[
        targets_pairs.query_row_index < sample_threshold]
    sample_document_indexes = list(
        set(sample_targets_pairs.loc[:, 'document_col_index'].tolist()))
    sample_queries_indexes = list(
        set(sample_targets_pairs.loc[:, 'query_row_index'].tolist()))

    col, row = [], []
    for _, sample_target_row in sample_targets_pairs.iterrows():
        new_row_id = sample_queries_indexes.index(
            sample_target_row['query_row_index'])
        new_col_id = sample_document_indexes.index(
            sample_target_row['document_col_index'])
        col.append(new_col_id)
        row.append(new_row_id)
        # print(sample_target_row['document_col_index'], '=', new_col_id)
        # print(sample_target_row['query_row_index'], '=', new_row_id)

    col = list(col)
    row = list(row)
    data = np.ones(len(row))

    dataset_target = coo_matrix(
        (data, (row, col)),
        shape=(len(sample_queries_indexes), len(sample_document_indexes)))
    del col, row, data
    dataset_target = dataset_target.tolil()

    #     print(sample_targets_pairs)
    #     print(sample_targets_pairs.shape,'=',dataset_target.shape,len(row),len(col),len(data))

    documents_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(source_dataframe, source_dataframe.loc[
            documents.loc[source_col_id,
                          'dataframe_index'], 'reference'], dataset_encoding)
        for source_col_id in sample_document_indexes)
    #     print(len(documents_content))
    queries_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(susp_dataframe, susp_dataframe.loc[
            queries.loc[susp_row_id,
                        'dataframe_index'], 'reference'], dataset_encoding)
        for susp_row_id in sample_queries_indexes)
    #     print(len(queries_content))

    documents = pd.DataFrame({
        'dataframe_index':
        documents.loc[sample_document_indexes,
                      'dataframe_index'].values.tolist(),
        'content':
        documents_content
    })
    queries = pd.DataFrame({
        'dataframe_index':
        queries.loc[sample_queries_indexes, 'dataframe_index'].values.tolist(),
        'content':
        queries_content
    })

    del sample_document_indexes, sample_queries_indexes, sample_targets_pairs
    del source_dataframe, susp_dataframe

    del documents_content, queries_content

    #     exit()

    return queries, documents, dataset_target, dataset_encoding
Example #24
def _load_ir_task_content(documents, queries, targets_pairs, source_dataframe,
                          susp_dataframe, dataset_encoding):
    '''
        loading content from files
    '''

    col = targets_pairs.loc[:, 'document_col_index'].values.tolist()
    row = targets_pairs.loc[:, 'query_row_index'].values.tolist()
    data = np.ones(targets_pairs.shape[0])

    dataset_target = coo_matrix((data, (row, col)),
                                shape=(queries.shape[0], documents.shape[0]))
    del col, row, data
    dataset_target = dataset_target.tolil()

    for id_row, source_row in documents.iterrows():
        contenti = load_file_content(
            source_dataframe,
            source_dataframe.loc[source_row['dataframe_index'],
                                 'reference'], dataset_encoding)
        print(id_row, '=>', len(contenti))
        if (len(contenti) < 2):
            print(source_row)
            print(source_dataframe.loc[source_row['dataframe_index'],
                                       'reference'])
            #             print(__file_path(source_dataframe, source_dataframe.loc[source_row['dataframe_index'],'reference']))
            input('empty document!')

    for id_row, susp_row in queries.iterrows():
        contenti = load_file_content(
            susp_dataframe, susp_dataframe.loc[susp_row['dataframe_index'],
                                               'reference'], dataset_encoding)
        print(id_row, '=>', len(contenti))
        if (len(contenti) < 2):
            print(susp_row)
            print(susp_dataframe.loc[susp_row['dataframe_index'], 'reference'])
            #             print(__file_path(susp_dataframe, susp_dataframe.loc[susp_row['dataframe_index'],'reference']))
            input('empty query!')

    documents_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(source_dataframe, source_dataframe.loc[
            source_row['dataframe_index'], 'reference'], dataset_encoding)
        for _, source_row in documents.iterrows())
    queries_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(susp_dataframe, susp_dataframe.loc[
            susp_row['dataframe_index'], 'reference'], dataset_encoding)
        for _, susp_row in queries.iterrows())

    del source_dataframe, susp_dataframe

    documents = pd.DataFrame({
        'dataframe_index':
        documents.loc[:, 'dataframe_index'].values.tolist(),
        'content':
        documents_content
    })
    queries = pd.DataFrame({
        'dataframe_index':
        queries.loc[:, 'dataframe_index'].values.tolist(),
        'content':
        queries_content
    })

    del documents_content, queries_content
    return queries, documents, dataset_target, dataset_encoding
Example #25
def load_as_ir_task(allow_queries_without_relevants = True, language_filter = 'ALL'):
    '''
        11093 source documents (index) X 11093 suspicious documents (queries) (allowing queries without relevants)
    '''
    
    path = datasets_extractors['DATASETS_PATH']['pan_plagiarism_corpus_2011']
    if allow_queries_without_relevants:
        files_path = os.path.join(path,"ir_task_PAN11(%s)_allows.h5"%(language_filter))
    else:
        files_path = os.path.join(path,"ir_task_PAN11(%s).h5"%(language_filter))
    
    if os.path.exists(files_path):
        source_dataframe, susp_dataframe,dataset_encoding = load_to_pandas()
        del source_dataframe, susp_dataframe

        #load and return 
        queries = pd.read_hdf(files_path, 'queries')
        documents = pd.read_hdf(files_path, 'documents')
        targets_pairs = pd.read_hdf(files_path, 'targets_pairs')

        source_dataframe = pd.read_hdf(files_path,'source_dataframe')
        susp_dataframe = pd.read_hdf(files_path,'susp_dataframe')
        
    else:
        source_dataframe, susp_dataframe,dataset_encoding = load_to_pandas()

        if not allow_queries_without_relevants:
            non_plag_susp = susp_dataframe[pd.isnull(susp_dataframe.source_reference)]
            new_susp_dataframe = susp_dataframe[pd.notnull(susp_dataframe.source_reference)].reset_index()
            new_source_dataframe = source_dataframe.append(non_plag_susp, ignore_index=True)
            del source_dataframe,susp_dataframe, non_plag_susp
        else:
            new_source_dataframe = source_dataframe
            new_susp_dataframe = susp_dataframe
            
        source_dataframe = new_source_dataframe
        susp_dataframe = new_susp_dataframe

        documents = []
        queries = []
        targets_pairs = []
        
        queries_names = {}
        
        for source_id, source_row in new_source_dataframe.iterrows():
            if language_filter != "ALL" and source_row['lang'] != language_filter.lower() and source_row['lang'] is not None:
                continue
            documents.append(source_id)
            susp_documents = new_susp_dataframe[new_susp_dataframe.source_reference == source_row['reference']]
            
            '''
                groups by suspicious filename each source plagiarised slices of text (pandas indexes).
            '''
            grouped_susp_dataframe = susp_documents.groupby(['reference',])
             
            '''
                selecting one index to represent the query 
            '''
            for query_namei,valuesi in grouped_susp_dataframe.groups.items():
                 
                if query_namei in queries_names.keys():
                    targets_pairs.append([queries_names[query_namei],len(documents)-1])
                else:
                    queries_names[query_namei] = len(queries)
                    queries.append(valuesi[0])
                    targets_pairs.append([len(queries)-1,len(documents)-1])
            
        if allow_queries_without_relevants:                    
            empty_queries = susp_dataframe[pd.isnull(susp_dataframe.source_reference)].index.tolist()
            queries = queries + empty_queries
            del empty_queries
            
        del queries_names
            
        documents = pd.DataFrame(documents,columns=list(['dataframe_index']))
        queries = pd.DataFrame(queries,columns=list(['dataframe_index']))
        targets_pairs = pd.DataFrame(targets_pairs,columns=list(['query_row_index','document_col_index']))
        
         
        queries.to_hdf(files_path,'queries')
        documents.to_hdf(files_path,'documents')
        targets_pairs.to_hdf(files_path,'targets_pairs')
        source_dataframe.to_hdf(files_path,'source_dataframe')
        susp_dataframe.to_hdf(files_path,'susp_dataframe')


    '''
        loading content from files
    '''    

    col = targets_pairs.loc[:,'document_col_index'].values.tolist()
    row = targets_pairs.loc[:,'query_row_index'].values.tolist()
    data = np.ones(targets_pairs.shape[0])

    dataset_target = coo_matrix((data, (row, col)), shape=(queries.shape[0], documents.shape[0]))
    del col,row,data
    dataset_target = dataset_target.tolil()

#     for id_row,source_row in documents.iterrows():
#         contenti = load_file_content(source_dataframe, source_dataframe.loc[source_row['dataframe_index'],'reference'], dataset_encoding)
#         print(id_row,'=>',len(contenti))
#         if (len(contenti) < 2):
#             print(source_row)
#             print(source_dataframe.loc[source_row['dataframe_index'],'reference'])
#             print(__file_path(source_dataframe, source_dataframe.loc[source_row['dataframe_index'],'reference']))
#             input('empty document!')
# 
#     for id_row,susp_row in queries.iterrows():
#         contenti = load_file_content(susp_dataframe, susp_dataframe.loc[susp_row['dataframe_index'],'reference'], dataset_encoding)
#         print(id_row,'=>',len(contenti))
#         if (len(contenti) < 2):
#             print(susp_row)
#             print(susp_dataframe.loc[susp_row['dataframe_index'],'reference'])
#             print(__file_path(susp_dataframe, susp_dataframe.loc[susp_row['dataframe_index'],'reference']))
#             input('empty query!')

    documents_content = Parallel(n_jobs=2,backend="threading",verbose=1)(delayed(load_file_content)(source_dataframe, source_dataframe.loc[source_row['dataframe_index'],'reference'], dataset_encoding) for _,source_row in documents.iterrows())        
    queries_content = Parallel(n_jobs=2,backend="threading",verbose=1)(delayed(load_file_content)(susp_dataframe, susp_dataframe.loc[susp_row['dataframe_index'],'reference'], dataset_encoding) for _,susp_row in queries.iterrows())

    del source_dataframe, susp_dataframe
    
    documents = pd.DataFrame({'dataframe_index':documents.loc[:,'dataframe_index'].values.tolist(),'content':documents_content})
    queries = pd.DataFrame({'dataframe_index':queries.loc[:,'dataframe_index'].values.tolist(),'content':queries_content})
    
    del documents_content, queries_content
    return queries, documents, dataset_target, dataset_encoding
Example #26
def extract_all_features_for_2(data, n_jobs=-1):
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    feature_matrix = parallel(
        delayed(extract_features_2)(d, ALL_FEATURES) for d in data)
    return np.array(feature_matrix)
Example #27
def dash(k, r, ep, OPT, X, alpha, debug=True, parallel=False):

    m = 5
    S = []

    # cache values for plotting
    y_adap = []

    # for r iterations
    for i in range(r):
        T = []
        print(i)

        fS = oracle(S)
        fST = oracle(union(S, T))

        while ((fST - fS) < (ep / 20) *
               (OPT - fS)) and (len(union(S, T)) < k) and len(X) > 1:
            # FILTER Step
            # this only changes X
            vs = estimateSet(X, union(S, T), k, r, m)
            while (vs < alpha**(2) * (1 - ep) *
                   (OPT - fST) / r) and (len(X) > 0):
                # get marginal contribution
                if parallel:
                    marg_a = Parallel(n_jobs=-1)(
                        delayed(estimateMarginal)(X, union(S, T), a, k, r, m)
                        for a in X)
                else:
                    marg_a = [
                        estimateMarginal(X, union(S, T), a, k, r, m) for a in X
                    ]

                Xnew = [
                    X[idx] for idx, el in enumerate(marg_a)
                    if el >= alpha * (1 + ep / 2) * (1 - ep) * (OPT - fST) / k
                ]
                X = Xnew
                vs = estimateSet(X, union(S, T), k, r, m)

            # update sets
            R = randomSample(X, k / r)
            T = union(T, R)
            # T changes but S doesn't
            fST = oracle(union(S, T))

            # outer loop numbers
            print('Outer Loop: Val')
            print(len(union(S, T)))
            print(fST)

        S = union(S, T)
        fS = oracle(S)
        end = time.time()
        timePassed = end - start

        if 'bayes' in dat:
            y_adap.append((len(S), obj_fun(S), timePassed, fS))
        else:
            y_adap.append((len(S), obj_fun(X1[:, S], y_cat), timePassed, fS))
    return y_adap
    }]
    filename = 'results-m{}-n{}-tt{}.csv'.format(m, n, tt)
    print(filename)
    df = pd.DataFrame.from_records(data,
                                   columns=[
                                       'n', 'm', 'tt', 'ballots', 'alpha',
                                       'k_star', 'theta_star', 'k_final',
                                       'k_greedy'
                                   ])
    if S3:  # s3
        import boto3

        csv_buffer = BytesIO()
        df.to_csv(csv_buffer, encoding='utf-8')
        s3_resource = boto3.resource('s3')
        s3_resource.Object('bribery', filename).put(Body=csv_buffer.getvalue())
    else:
        abs_path = os.path.join(folder, filename)
        df.to_csv(abs_path, index=False)


if __name__ == '__main__':
    # for n in [2, 4, 8]:
    #     for m in [2, 4, 8]:
    #         alpha = borda(m)
    #         for tt in range(16):
    #             run_single(m, n, tt, alpha)
    Parallel(n_jobs=-1)(delayed(run_single)(m, n, tt, borda(m))
                        for n in [2, 4, 8, 16, 32] for m in [2, 4, 8, 16]
                        for tt in range(16))
Example #29
            allow_queries_without_relevants=False)
    elif dataset_name == "pan11":
        corpus_name, (
            suspicious_info, source_info, target, dataset_encoding
        ) = dataset_name, pan_plagiarism_corpus_2011_extractor.load_as_ir_task(
            allow_queries_without_relevants=False, language_filter="EN")
    elif "pan10" in dataset_name and "-samples" in dataset_name:
        corpus_name, (
            suspicious_info, source_info, target, dataset_encoding
        ) = dataset_name, pan_plagiarism_corpus_2010_extractor.load_sample_as_ir_task(
            sample_size, language_filter="EN")

    print('queries:', suspicious_info.shape, ' Documents:', source_info.shape)

    documents = Parallel(n_jobs=-1, backend="threading", verbose=1)(
        delayed(encode_dataframe_content)(si, dataset_encoding)
        for si in source_info['content'].values)
    queries = Parallel(n_jobs=-1, backend="threading", verbose=1)(
        delayed(encode_dataframe_content)(si, dataset_encoding)
        for si in suspicious_info['content'].values)

    del suspicious_info, source_info

    print(nns_df_paramaters)
    print(lsht_df_paramaters)

    #     exit()
    '''
        using scikit-learn : tokenization
    '''
    def fit(self, X, y=None, sample_weight=None, exposure=None):
        cv = check_cv(self.cv,
                      X=X,
                      y=y,
                      classifier=is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        combiner = check_score_combiner(self.estimator, self.score_combiner)
        parallel = Parallel(n_jobs=self.n_jobs,
                            verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)
        n_features = X.shape[1]
        data = self._process_args(X=X,
                                  y=y,
                                  sample_weight=sample_weight,
                                  exposure=exposure)
        feature_deletion_scores = []

        # Get cross-validated scores with all features present
        data_ = data.copy()
        col_X = self._baseline_feature_subset(X, n_features)
        data_['X'] = col_X
        full_scores = parallel(
            delayed(_fit_and_score)(clone(self.estimator), data_, scorer,
                                    train, test) for train, test in cv)
        self.score_ = combiner(full_scores)

        # For each feature, remove that feature and get the cross-validation scores
        for col in range(n_features):
            col_X = self._feature_subset(X, n_features, col)
            data_ = data.copy()
            data_['X'] = col_X
            scores = parallel(
                delayed(_fit_and_score)(clone(self.estimator), data_, scorer,
                                        train, test) for train, test in cv)
            #             test_features = np.ones(shape=n_features, dtype=bool)
            #             if col_X is not None:
            #                 data_ = data.copy()
            #                 data_['X'] = col_X
            #                 scores = parallel(delayed(_fit_and_score)(clone(self.estimator), data_, scorer,
            #                                               train, test)
            #                                       for train, test in cv)
            #
            #
            #             if n_features > 1:
            #                 test_features[col] = False
            #                 data_['X'] = X[:, test_features]
            #                 scores = parallel(delayed(_fit_and_score)(clone(self.estimator), data_, scorer,
            #                                           train, test)
            #                                   for train, test in cv)
            #             elif self.check_constant_model:
            #                 # If there's only one feature to begin with, do the fitting and scoring on a
            #                 # constant predictor.
            #                 data_['X'] = np.ones(shape=(X.shape[0], 1))
            #                 scores = parallel(delayed(_fit_and_score)(clone(self.estimator), data_, scorer,
            #                                           train, test)
            #                                   for train, test in cv)
            #             else:
            #                 scores = full_scores
            score = combiner(scores)
            feature_deletion_scores.append(score)

        # Higher scores are better.  Higher feature importance means the feature is more important.
        # This code reconciles these facts.
        self.feature_importances_ = self._calc_importances(
            np.array(feature_deletion_scores), self.score_)

        # Finally, fit on the full data set
        self.estimator_ = clone(self.estimator).fit(**data)

        # A fit method should always return self for chaining purposes
        return self
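The loop above implements leave-one-feature-out importances: drop one column, re-run the cross-validated fit, and compare against the full-feature score. A compact, self-contained sketch of that idea, using scikit-learn's cross_val_score instead of the estimator's private helpers (all names below are illustrative):

import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

def _cv_score(estimator, X, y):
    return cross_val_score(clone(estimator), X, y, cv=3).mean()

rng = np.random.default_rng(0)
X = rng.normal(size=(150, 5))
y = 3 * X[:, 0] - 2 * X[:, 3] + 0.1 * rng.normal(size=150)

est = Ridge()
full_score = _cv_score(est, X, y)
# drop one column at a time and re-score; big drops mean important features
deletion_scores = Parallel(n_jobs=-1)(
    delayed(_cv_score)(est, np.delete(X, col, axis=1), y)
    for col in range(X.shape[1]))
importances = full_score - np.array(deletion_scores)
print(np.argsort(importances)[::-1])   # columns 0 and 3 should rank highest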