def run_greedy(kToSelect, parallel):
    # get starting set of data
    predictors = [([], -1e10)]
    # loop through predictors and at each step,
    # add one predictor that increases R2 the most
    # and calculate R2
    for k in range(kToSelect):
        logging.info(k)
        best_k_predictors = predictors[-1][0]
        predictor_list = list(set(all_predictors) - set(best_k_predictors))

        def greedy_helper(predictor):
            k_plus_1 = list(best_k_predictors + [predictor])
            x_train = X1[:, k_plus_1]
            return get_class_rate(x_train, y_cat)

        if parallel:
            r2 = Parallel(n_jobs=-1, verbose=50)(
                delayed(greedy_helper)(predictor)
                for predictor in predictor_list)
        else:
            r2 = []
            for predictor in predictor_list:
                r2.append(greedy_helper(predictor))
        best_k_plus_1 = best_k_predictors + [predictor_list[np.argmax(r2)]]
        predictors.append((best_k_plus_1, np.max(r2)))
        logging.info("%s %s %s" % (str(k), str(best_k_plus_1), str(np.max(r2))))
    return predictors

def pairwise_jaccard_similarity(set_per_row):
    # print(set_per_row)
    results = Parallel(n_jobs=-1, backend='threading')(
        delayed(jaccard_similarity)(i, j, set_per_row)
        for i in range(set_per_row.shape[0])
        for j in range(set_per_row.shape[0]))
    results = np.array(results)
    return results.reshape((set_per_row.shape[0], set_per_row.shape[0]))

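# A minimal sketch of the jaccard_similarity helper parallelized above,
# assuming each entry of set_per_row can be treated as a Python set. This is
# an illustrative stand-in, not necessarily the project's implementation.
def jaccard_similarity(i, j, set_per_row):
    # Jaccard index |A & B| / |A | B|, defined as 0.0 when both sets are empty
    a, b = set(set_per_row[i]), set(set_per_row[j])
    union_size = len(a | b)
    return len(a & b) / union_size if union_size else 0.0
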
def transform(self, X):
    """Transform a count matrix to ..

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts

    Returns
    -------
    vectors : sparse matrix, [n_samples, n_features]
    sample's time-to-transform : sparse matrix, [n_samples]
    """
    if self.selection_size is None:
        raise ValueError('must be fitted!')
    else:
        approach_all_finger = np.empty(
            (X.shape[0], self.selection_size * self.n_permutations), int)
        approach_all_time = np.zeros(approach_all_finger.shape)
        r = np.array(
            Parallel(n_jobs=self.n_jobs, backend="multiprocessing", verbose=0)(
                delayed(self.measure_selection_function)(
                    X[i, :].T, self.items_to_permute, self.n_permutations,
                    self.selection_size, self.selection_function)
                for i in range(X.shape[0])))
        approach_all_finger = np.vstack(r[:, 0])
        approach_all_time = np.vstack(r[:, 1])
        return (approach_all_finger, approach_all_time)

def _find_best_w(self, x, y, random_state):
    xa = x[y == self.a]
    xb = x[y == self.b]
    points = [
        self.__get_random_point_on_sphere(x.shape[1], random_state)
        for _ in range(self.n_iters)
    ]
    if self.use_svc_weights:
        points.append(
            SVC(kernel='linear', C=1, random_state=random_state,
                class_weight='auto').fit(x, y).coef_[0])
    rets = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
        delayed(maximize_entropy)(xa, xb, point, self.method, self.on_sphere,
                                  self.base_objective) for point in points)
    best_i = 0
    for i in range(len(points)):
        if self.verbose:
            print("Entropy with gamma {} starting from point[{}] = {}".format(
                self.base_objective.gamma, i, -rets[i]["fun"]))
        if rets[i]["fun"] < rets[best_i]["fun"]:
            best_i = i
    return rets[best_i]["x"]

def run_greedy(kToSelect, parallel=True):
    # get starting set of data
    predictors = [([], 0, 0)]
    # loop through predictors and at each step,
    # add one predictor that increases obj function the most
    # and calculate obj function
    for k in range(kToSelect):
        best_k_predictors = predictors[-1][0]
        r2 = []
        predictor_list = list(set(all_predictors) - set(best_k_predictors))
        if not parallel:
            for predictor in predictor_list:
                k_plus_1 = list(best_k_predictors + [predictor])
                if 'bayes' in dat:
                    r2.append(obj_fun(k_plus_1))
                else:
                    x_train = X[:, k_plus_1]
                    r2.append(obj_fun(x_train, y_cat))
        else:
            r2 = Parallel(n_jobs=-1)(
                delayed(evaluate_oracle)(best_k_predictors, a)
                for a in predictor_list)
        best_k_plus_1 = best_k_predictors + [predictor_list[np.argmax(r2)]]
        end = time.time()
        timePassed = end - start
        predictors.append((best_k_plus_1, np.max(r2), timePassed))
    return predictors

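# A minimal sketch of the evaluate_oracle helper used in the parallel branch of
# run_greedy above, assuming it mirrors the serial branch: score the current
# predictor set extended by one candidate. dat, X, y_cat and obj_fun are the
# same module-level names the serial branch relies on.
def evaluate_oracle(best_k_predictors, candidate):
    k_plus_1 = list(best_k_predictors + [candidate])
    if 'bayes' in dat:
        return obj_fun(k_plus_1)
    return obj_fun(X[:, k_plus_1], y_cat)
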
def amortizedFilter(k, r, ep, OPT, X, debug=True, parallel=False):
    m = 10
    S = []
    y_adap = []
    for i in range(r):
        T = []
        logging.info('r=' + str(i))
        fS = oracle(S)
        fST = oracle(union(S, T))
        while ((fST - fS) < (ep / 20) * (OPT - fS)) and (len(union(S, T)) < k):
            # FILTER Step
            # this only changes X
            vs = estimateSet(X, union(S, T), m)
            while (vs < (1 - ep) * (OPT - fST) / r):
                if debug:
                    logging.info('inner while loop')
                # get marginal contribution
                if parallel:
                    marg_a = Parallel(n_jobs=-1, verbose=50)(
                        delayed(estimateMarginal)(X, union(S, T), a, m)
                        for a in X)
                else:
                    marg_a = [
                        estimateMarginal(X, union(S, T), a, m) for a in X
                    ]
                # Filter!
                Xnew = [
                    X[idx] for idx, el in enumerate(marg_a)
                    if el >= (1 + ep / 2) * (1 - ep) * (OPT - fST) / k
                ]
                X = Xnew
                # estimate if filtered set is good enough
                vs = estimateSet(X, union(S, T), m)
                if debug:
                    logging.info('Elements remaining: ' + str(len(X)))
                    logging.info('Check')
                    logging.info(vs < (1 - ep) * (OPT - fST) / r)
            R = randomSample(X, k / r)
            T = union(T, R)
            # T changes but S doesn't
            fST = oracle(union(S, T))
            if debug:
                logging.info('Outer Loop')
                logging.info(fST)
        S = union(S, T)
        fS = oracle(S)
        y_adap.append((len(S), fS))
    return y_adap

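# Minimal sketches of the set helpers amortizedFilter relies on, assuming S, T
# and X are plain Python lists of hashable elements. These are illustrative
# stand-ins, not the project's actual implementations.
import random


def union(S, T):
    # order-preserving union of two lists, without duplicates
    seen = set()
    return [el for el in list(S) + list(T) if not (el in seen or seen.add(el))]


def randomSample(X, size):
    # sample up to `size` elements of X uniformly at random, without replacement
    return random.sample(list(X), min(len(X), int(round(size))))
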
def extract_all_features(data, n_jobs=-1):
    """
    Function extracting all available features to a numpy feature array.
    :param data: iterable containing domain name strings
    :return: feature matrix as numpy array
    """
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    feature_matrix = parallel(
        delayed(extract_features)(d, ALL_FEATURES) for d in data)
    return np.array(feature_matrix)

def search_light(X, y, estimator, A, score_func=None, cv=None, n_jobs=-1,
                 verbose=0):
    """Function for computing a search_light

    Parameters
    ----------
    X: array-like of shape at least 2D
        The data to fit.

    y: array-like
        The target variable to try to predict.

    estimator: estimator object implementing 'fit'
        The object to use to fit the data

    A : sparse matrix.
        adjacency matrix. Defines for each sample the neighboring samples
        following a given structure of the data.

    score_func: callable, optional
        callable taking as arguments the fitted estimator, the test data
        (X_test) and the test target (y_test) if y is not None.

    cv: cross-validation generator, optional
        A cross-validation generator. If None, a 3-fold cross validation is
        used or 3-fold stratified cross-validation when y is supplied.

    n_jobs: integer, optional
        The number of CPUs to use to do the computation. -1 means 'all CPUs'.

    verbose: integer, optional
        The verbosity level. Default is 0

    Returns
    -------
    scores: array-like of shape (number of rows in A)
        search_light scores
    """
    scores = np.zeros(len(A.rows), dtype=float)
    group_iter = GroupIterator(A.shape[0], n_jobs)
    scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_group_iter_search_light)(list_i, A.rows[list_i], estimator, X,
                                          y, A.shape[0], score_func, cv,
                                          verbose) for list_i in group_iter)
    return np.concatenate(scores)

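# A minimal sketch of the GroupIterator consumed by search_light above,
# assuming it only splits the row indices 0..n_features-1 into one chunk per
# job; the real implementation may resolve n_jobs=-1 differently.
import multiprocessing


class GroupIterator(object):
    def __init__(self, n_features, n_jobs=1):
        self.n_features = n_features
        self.n_jobs = n_jobs if n_jobs > 0 else multiprocessing.cpu_count()

    def __iter__(self):
        # yield one array of row indices per parallel job
        for group in np.array_split(np.arange(self.n_features), self.n_jobs):
            yield group
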
def fit(self, X, y=None, sample_weight=None, exposure=None):
    # For later
    parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch, max_nbytes=None)

    # Extract arguments
    fit_args = self._process_args(X=X, y=y, sample_weight=sample_weight,
                                  exposure=exposure)

    # Sort out cv parameters
    if self.cv == 1:
        cv = no_cv(X=X, y=y)
    else:
        if hasattr(self.cv, 'split'):
            cv_args = dict(X=X)
            if y is not None:
                cv_args['y'] = np.ravel(y)
            cv = self.cv.split(**cv_args)
        else:
            cv_args = dict(X=X)
            if y is not None:
                cv_args['y'] = shrinkd(1, np.asarray(y))
            cv = check_cv(self.cv, classifier=is_classifier(self.estimator),
                          **cv_args)

    # Do the cross validation fits
    # print(valmap(lambda x: x.shape, fit_args))
    # print('num_folds = %d' % self.cv.get_n_splits(X=X))
    cv_fits = parallel(
        delayed(_fit_and_predict)(clone(self.estimator), fit_args, train, test,
                                  self.verbose) for train, test in cv)

    # Combine predictions from cv fits
    prediction = np.empty_like(y) if y is not None else np.empty(shape=X.shape[0])
    for fit in cv_fits:
        safe_assign_subset(prediction, fit[2], fit[1])

    # Store cross validation models
    self.cv_estimators_ = [fit[0] for fit in cv_fits]
    self.cv_indices_ = [fit[2] for fit in cv_fits]
    self.cv_predictions_ = prediction

    # If a metric was provided, compute the score
    if self.metric is not None:
        metric_args = {}
        if 'sample_weight' in fit_args:
            metric_args['sample_weight'] = fit_args['sample_weight']
        if 'exposure' in fit_args:
            metric_args['exposure'] = fit_args['exposure']
        self.score_ = safer_call(self.metric, y, self.cv_predictions_,
                                 **metric_args)

    # Fit on entire data set
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(**fit_args)
    return self

def fit_linear_nnls(self, X, y, sample_weight=None):
    if not isinstance(self.model, LinearRegression):
        raise ValueError(
            'Model is not LinearRegression, could not call fit for linear nnls')
    n_jobs_ = self.model.n_jobs
    self.model.coef_ = []
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    X, y, X_offset, y_offset, X_scale = self.model._preprocess_data(
        X, y, fit_intercept=self.model.fit_intercept,
        normalize=self.model.normalize, copy=self.model.copy_X,
        sample_weight=sample_weight)

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    if sp.issparse(X):
        if y.ndim < 2:
            # out = sparse_lsqr(X, y)
            # lsq_linear returns an OptimizeResult, so use .x for the solution
            # and .cost (0.5 * ||residual||^2) as the residual measure.
            out = lsq_linear(X, y, bounds=(0, np.inf))
            self.model.coef_ = out.x
            self.model._residues = out.cost
        else:
            # sparse_lstsq cannot handle y with shape (M, K)
            outs = Parallel(n_jobs=n_jobs_)(
                delayed(lsq_linear)(X, y[:, j].ravel())
                for j in range(y.shape[1]))
            self.model.coef_ = np.vstack([out.x for out in outs])
            self.model._residues = np.vstack([out.cost for out in outs])
    else:
        # self.model.coef_, self.model.cost_, self.model.fun_,
        # self.model.optimality_, self.model.active_mask_, self.model.nit_,
        # self.model.status_, self.model.message_, self.model.success_
        out = lsq_linear(X, y, bounds=(0, np.inf))
        self.model.coef_ = out.x
        self.model.coef_ = self.model.coef_.T

    if y.ndim == 1:
        self.model.coef_ = np.ravel(self.model.coef_)
    self.model._set_intercept(X_offset, y_offset, X_scale)
    return self.model

def fit(self, X, y=None, sample_weight=None, exposure=None):
    if self.cv == 1:
        cv = no_cv(X=X, y=y)
    else:
        if hasattr(self.cv, 'split'):
            cv = self.cv.split(X, y)
        else:
            cv = check_cv(self.cv, X=X, y=y,
                          classifier=is_classifier(self.calibrator))

    parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch)

    # Fit the estimator on each train set and predict with it on each test set
    fit_args = {'X': X}
    if y is not None:
        fit_args['y'] = y
    if self.est_weight and sample_weight is not None:
        fit_args['sample_weight'] = sample_weight
    if self.est_exposure and exposure is not None:
        fit_args['exposure'] = exposure

    # Do the cross validation fits
    cv_fits = parallel(
        delayed(_fit_and_predict)(clone(self.estimator), fit_args, train, test)
        for train, test in cv)

    # Combine predictions from cv fits
    prediction = np.empty_like(y)
    for fit in cv_fits:
        safe_assign_subset(prediction, fit[2], fit[1])

    # fit_predict_results = parallel(delayed(_fit_and_predict)(estimator=clone(self.estimator),
    #                                train=train, test=test, **fit_args) for train, test in cv)
    #
    # # Combine the predictions
    # prediction = np.empty_like(y)
    # for _, pred, _, test in zip(fit_predict_results, cv):
    #     prediction = np.concatenate([pred for _, pred in cv_fits], axis=0)

    # Fit the calibrator on the predictions
    cal_args = {'X': prediction[:, None] if len(prediction.shape) == 1 else prediction,
                'y': y}
    if self.cal_weight and sample_weight is not None:
        cal_args['sample_weight'] = sample_weight
    if self.cal_exposure and exposure is not None:
        cal_args['exposure'] = exposure
    self.calibrator_ = clone(self.calibrator).fit(**cal_args)

    # Fit the estimator on the entire data set
    self.estimator_ = clone(self.estimator).fit(**fit_args)
    return self

def classify_pcaps(path_to_pcaps_folder, clf: Clf, n_jobs=2):
    pcaps = []
    for (path, dirs, files) in walk(path_to_pcaps_folder):
        pcaps.extend(files)
        break
    pcaps = [path_to_pcaps_folder + '/' + pcap
             for pcap in pcaps if not pcap.endswith('.txt')]
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    parallel(delayed(_classify_pcap)(pcap, clf) for pcap in pcaps)

def example_simple_function(self, active_process_path):
    """ simple function to calculate sqrt """
    active_process = self.portal_activities.unrestrictedTraverse(
        active_process_path)
    # Use CMFActivity as a backend for joblib
    with parallel_backend('CMFActivity', active_process=active_process):
        result = Parallel(n_jobs=2, pre_dispatch='all', timeout=30, verbose=30)(
            delayed(sqrt)(i**2) for i in range(5))
    # Set result value and an id to the active result and post it
    result = ActiveResult(result=result)
    active_process.postResult(result)
    log("joblib activity result", result)
    return result

def _find_best_w(self, x, y, random_state):
    points = [
        self.__get_random_point_on_sphere(x.shape[1], random_state)
        for _ in range(self.n_starts)
    ]
    if self.use_pca_weights:
        pca = decomposition.PCA(n_components=int(self.base_objective.k))
        pca.fit(x)
        points[-1] = pca.components_
    rets = Parallel(n_jobs=self.n_jobs)(
        delayed(maximize_entropy)(x, y, point, self.method, self.on_manifold,
                                  self.base_objective) for point in points)
    best_i = 0
    for i in range(len(points)):
        if rets[i]["fun"] < rets[best_i]["fun"]:
            best_i = i
    return rets[best_i]["x"].reshape(-1, self.base_objective.k).T

def parallel_predict(estimator, X, n_jobs=1, method='predict',
                     batches_per_job=3):
    """ Run sklearn classifier prediction in parallel. """
    if n_jobs < 0:
        # XXX: this should really be done by joblib
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    n_batches = batches_per_job * n_jobs
    n_samples = len(X)
    batch_size = int(np.ceil(n_samples / n_batches))
    parallel = Parallel(n_jobs=n_jobs, backend="threading")
    results = parallel(
        delayed(_predict, check_pickle=False)(estimator, X, method, i,
                                              i + batch_size)
        for i in range(0, n_samples, batch_size))
    return np.concatenate(results)

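# A minimal sketch of the _predict helper batched by parallel_predict above,
# assuming it simply applies the requested prediction method to one slice of
# X; the project's real helper may differ.
def _predict(estimator, X, method, start, stop):
    # method is e.g. 'predict' or 'predict_proba'
    return getattr(estimator, method)(X[start:stop])
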
def generate_mappings_for_genes(genes_of_interest, batch_size, use_parallel):
    # filter gene names already present in the database
    genes_of_interest = filter_gene_names_present_in_database(genes_of_interest)

    # Create batches
    genes_of_interest_batches = [
        genes_of_interest[i:i + batch_size]
        for i in range(0, len(genes_of_interest), batch_size)
    ]
    n_batches = len(genes_of_interest_batches)
    n_genes = len(genes_of_interest)

    # Annotate the genes in batches
    _log.info("Starting the mapping of batched analysis of '" + str(n_genes) +
              "' over '" + str(n_batches) + "' batches")
    succeeded_genes = 0
    for batch_counter, gene_batch in enumerate(genes_of_interest_batches):
        _log.info("Starting batch '" + str(batch_counter + 1) + "' out of '" +
                  str(n_batches) + "', with '" + str(len(gene_batch)) +
                  "' genes")
        gene_mappings = []
        if use_parallel:
            gene_mappings = Parallel(
                n_jobs=CalculateNumberOfActiveThreads(batch_size))(
                    delayed(generate_gene_to_swissprot_mapping)(gene)
                    for gene in gene_batch)
        else:
            gene_mappings = [
                generate_gene_to_swissprot_mapping(gene)
                for gene in gene_batch
            ]

        # add the batches to the database
        for gene_mapping in gene_mappings:
            add_gene_mapping_to_database(gene_mapping)
            succeeded_genes += 1
        _log.info("Finished batch '" + str(batch_counter + 1) + "' out of '" +
                  str(n_batches) + "'")
    _log.info("Finished the mapping of batched analysis of '" + str(n_genes) +
              "' over '" + str(n_batches) + "' batches, resulting in '" +
              str(succeeded_genes) + "' successful gene mappings")

def _parallel_pairwise(X, Y, func, n_jobs, **kwds):
    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)

    if Y is None:
        Y = X

    if n_jobs == 1:
        # Special case to avoid picklability checks in delayed
        return func(X, Y, **kwds)

    # TODO: in some cases, backend='threading' may be appropriate
    fd = delayed(func)
    ret = Parallel(n_jobs=n_jobs, verbose=0)(
        fd(X, Y[s], **kwds)
        for s in gen_even_slices(Y.shape[0], n_jobs))
    return np.hstack(ret)

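# A small usage sketch for _parallel_pairwise, assuming func is any pairwise
# metric with the signature func(X, Y, **kwds) -> array of shape
# (n_samples_X, n_samples_Y), such as sklearn's euclidean_distances. Slices of
# Y are scored in parallel and stacked back together column-wise.
if __name__ == '__main__':
    import numpy as np
    from sklearn.metrics.pairwise import euclidean_distances

    X_demo = np.random.rand(100, 5)
    D = _parallel_pairwise(X_demo, None, euclidean_distances, n_jobs=2)
    print(D.shape)  # (100, 100)
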
def predict_proba(self, X, π=40):
    """For each x in X, provide vector of probs for each class.

    :param X: data, a sequence of URLs
    :param π: int, threshold for parallelizing. Below this don't bother.

    URLs: JPL7 model assumes `get_html(url)` will retrieve HTML as required.

    Parallel logic modified from QingKaiKong. Also viewed pomegranate and
    scikit-issues.
    * http://qingkaikong.blogspot.com/2016/12/python-parallel-method-in-class.html
    * https://github.com/jmschrei/pomegranate/blob/master/pomegranate/parallel.pyx
    * https://github.com/scikit-learn/scikit-learn/issues/7448
    """
    # check_is_fitted(self, ['X_', 'y_'])
    # X = check_array(X)
    if type(X) is str:
        raise AttributeError("predict_proba: X must be array-like, not string!")
    n, n_jobs = len(X), self.n_jobs
    if n < π:
        n_jobs = 1  # Not worth the overhead to parallelize
        batches = X
    else:
        starts = [i * n // n_jobs for i in range(n_jobs)]
        ends = starts[1:] + [n]
        batches = [X[start:end] for start, end in zip(starts, ends)]
    t0 = arrow.now()
    if n_jobs > 1:
        score = delayed(batch_score_urls)
        with Parallel(n_jobs=n_jobs) as parallel:
            results = parallel(score(batch, self) for batch in batches)
    else:
        results = (batch_score_urls(batch, self) for batch in batches)
    results = np.concatenate(
        [np.array([row for row in batch]) for batch in results])
    dt = arrow.now() - t0
    logging.info('TIMING: n_jobs = %d, t = %s, dt = **%3.3fs**' %
                 (n_jobs, t0.format('HH:mm:ss'), dt.total_seconds()))
    return results

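# A minimal sketch of the batch_score_urls helper used by predict_proba above.
# This only assumes its overall shape: it takes a batch of URLs plus the fitted
# model and returns one probability row per URL; score_url is a hypothetical
# per-URL scorer standing in for whatever the model actually provides.
def batch_score_urls(batch, model):
    # one class-probability row per URL in the batch
    return [model.score_url(url) for url in batch]  # score_url is hypothetical
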
def fit(self, X, y_clf, y_regression):
    """
    Fit the multiclass model.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training data
    y_clf : numpy array of shape [n_samples]
        Target classes for classification model
    y_regression : numpy array of shape [n_samples]
        Target values for regression model

    Returns
    -------
    self : returns an instance of self.
    """
    X = safe_asarray(X)
    y_clf = np.asarray(y_clf)
    y_regression = np.asarray(y_regression)

    self.clf_model = self.clf.fit(X, y_clf)
    classes = set(y_clf)
    regr = self.regr

    def _generator():
        for class_ in classes:
            examples = y_clf == class_
            yield class_, X[examples], y_regression[examples], regr

    out = Parallel(self.n_jobs, self.verbose, self.pre_dispatch)(
        delayed(_fit_helper)(*params) for params in _generator())

    self.regression_models = {}
    for class_, regr_model in out:
        self.regression_models[class_] = regr_model

    return self

def predict_all_plain(self, domains, n_jobs=-1):
    """
    Predicts the label of d using all classifiers present
    :param domains: iterable containing domains as str
    :return: dictionary containing results (dga, svm/rf): label
    """
    feature_matrix = extract_all_features(domains)
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    res = parallel(delayed(_predict)(c, feature_matrix) for c in self.clfs)
    # TODO
    merged = {}
    for d in res:
        keys = list(d.keys())
        if keys[0] in merged:
            merged[keys[0]] += d[keys[0]]
        else:
            merged.update(d)
    return merged

def predict(self, X, return_class_prediction=False):
    """
    Predict using the multiclass regression model

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]

    Returns
    -------
    C : array, shape = [n_samples]
        Returns predicted values.
    """
    X = safe_asarray(X)
    y_clf_predicted = np.asarray(self.clf_model.predict(X))
    classes = set(y_clf_predicted)

    def _generator():
        for class_ in classes:
            examples = y_clf_predicted == class_
            yield examples, X[examples], self.regression_models[class_]

    out = Parallel(self.n_jobs, self.verbose, self.pre_dispatch)(
        delayed(_predict_helper)(*params) for params in _generator())

    y_regr_predicted = None
    for examples, predicted in out:
        if y_regr_predicted is None:
            y_regr_predicted = np.zeros(X.shape[0], predicted.dtype)
        y_regr_predicted[examples] = predicted

    if return_class_prediction:
        return y_clf_predicted, y_regr_predicted
    else:
        return y_regr_predicted

def getDomanListFeature(domain_list):
    parallel = Parallel(n_jobs=-1, verbose=1)
    feature_matrix = parallel(
        delayed(getFeature)(d, datetime.datetime.strptime('20180507', "%Y%m%d"))
        for d in domain_list)
    return feature_matrix

def load_sample_as_ir_task(sample_threshold=100, language_filter="ALL"):
    documents, queries, targets_pairs, source_dataframe, susp_dataframe, dataset_encoding = _load_as_ir_task_without_content(
        allow_queries_without_relevants=True, language_filter=language_filter)

    sample_targets_pairs = targets_pairs[
        targets_pairs.query_row_index < sample_threshold]
    sample_document_indexes = list(
        set(sample_targets_pairs.loc[:, 'document_col_index'].tolist()))
    sample_queries_indexes = list(
        set(sample_targets_pairs.loc[:, 'query_row_index'].tolist()))

    col, row = [], []
    for _, sample_target_row in sample_targets_pairs.iterrows():
        new_row_id = sample_queries_indexes.index(
            sample_target_row['query_row_index'])
        new_col_id = sample_document_indexes.index(
            sample_target_row['document_col_index'])
        col.append(new_col_id)
        row.append(new_row_id)
        # print(sample_target_row['document_col_index'], '=', new_col_id)
        # print(sample_target_row['query_row_index'], '=', new_row_id)

    col = list(col)
    row = list(row)
    data = np.ones(len(row))

    dataset_target = coo_matrix(
        (data, (row, col)),
        shape=(len(sample_queries_indexes), len(sample_document_indexes)))
    del col, row, data
    dataset_target = dataset_target.tolil()
    # print(sample_targets_pairs)
    # print(sample_targets_pairs.shape, '=', dataset_target.shape, len(row), len(col), len(data))

    documents_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(
            source_dataframe,
            source_dataframe.loc[documents.loc[source_col_id, 'dataframe_index'], 'reference'],
            dataset_encoding) for source_col_id in sample_document_indexes)
    # print(len(documents_content))

    queries_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(
            susp_dataframe,
            susp_dataframe.loc[queries.loc[susp_row_id, 'dataframe_index'], 'reference'],
            dataset_encoding) for susp_row_id in sample_queries_indexes)
    # print(len(queries_content))

    documents = pd.DataFrame({
        'dataframe_index':
        documents.loc[sample_document_indexes, 'dataframe_index'].values.tolist(),
        'content': documents_content
    })
    queries = pd.DataFrame({
        'dataframe_index':
        queries.loc[sample_queries_indexes, 'dataframe_index'].values.tolist(),
        'content': queries_content
    })

    del sample_document_indexes, sample_queries_indexes, sample_targets_pairs
    del source_dataframe, susp_dataframe
    del documents_content, queries_content
    # exit()
    return queries, documents, dataset_target, dataset_encoding

def _load_ir_task_content(documents, queries, targets_pairs, source_dataframe,
                          susp_dataframe, dataset_encoding):
    ''' loading content from files '''
    col = targets_pairs.loc[:, 'document_col_index'].values.tolist()
    row = targets_pairs.loc[:, 'query_row_index'].values.tolist()
    data = np.ones(targets_pairs.shape[0])

    dataset_target = coo_matrix((data, (row, col)),
                                shape=(queries.shape[0], documents.shape[0]))
    del col, row, data
    dataset_target = dataset_target.tolil()

    for id_row, source_row in documents.iterrows():
        contenti = load_file_content(
            source_dataframe,
            source_dataframe.loc[source_row['dataframe_index'], 'reference'],
            dataset_encoding)
        print(id_row, '=>', len(contenti))
        if (len(contenti) < 2):
            print(source_row)
            print(source_dataframe.loc[source_row['dataframe_index'], 'reference'])
            # print(__file_path(source_dataframe, source_dataframe.loc[source_row['dataframe_index'], 'reference']))
            input('empty document!')

    for id_row, susp_row in queries.iterrows():
        contenti = load_file_content(
            susp_dataframe,
            susp_dataframe.loc[susp_row['dataframe_index'], 'reference'],
            dataset_encoding)
        print(id_row, '=>', len(contenti))
        if (len(contenti) < 2):
            print(susp_row)
            print(susp_dataframe.loc[susp_row['dataframe_index'], 'reference'])
            # print(__file_path(susp_dataframe, susp_dataframe.loc[susp_row['dataframe_index'], 'reference']))
            input('empty query!')

    documents_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(
            source_dataframe,
            source_dataframe.loc[source_row['dataframe_index'], 'reference'],
            dataset_encoding) for _, source_row in documents.iterrows())
    queries_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(
            susp_dataframe,
            susp_dataframe.loc[susp_row['dataframe_index'], 'reference'],
            dataset_encoding) for _, susp_row in queries.iterrows())
    del source_dataframe, susp_dataframe

    documents = pd.DataFrame({
        'dataframe_index': documents.loc[:, 'dataframe_index'].values.tolist(),
        'content': documents_content
    })
    queries = pd.DataFrame({
        'dataframe_index': queries.loc[:, 'dataframe_index'].values.tolist(),
        'content': queries_content
    })
    del documents_content, queries_content

    return queries, documents, dataset_target, dataset_encoding

def load_as_ir_task(allow_queries_without_relevants=True, language_filter='ALL'):
    '''
    11093 source documents (index) X 11093 suspicious documents (queries)
    (allowing queries without relevants)
    '''
    path = datasets_extractors['DATASETS_PATH']['pan_plagiarism_corpus_2011']
    if allow_queries_without_relevants:
        files_path = os.path.join(path, "ir_task_PAN11(%s)_allows.h5" % (language_filter))
    else:
        files_path = os.path.join(path, "ir_task_PAN11(%s).h5" % (language_filter))

    if os.path.exists(files_path):
        source_dataframe, susp_dataframe, dataset_encoding = load_to_pandas()
        del source_dataframe, susp_dataframe
        # load and return
        queries = pd.read_hdf(files_path, 'queries')
        documents = pd.read_hdf(files_path, 'documents')
        targets_pairs = pd.read_hdf(files_path, 'targets_pairs')
        source_dataframe = pd.read_hdf(files_path, 'source_dataframe')
        susp_dataframe = pd.read_hdf(files_path, 'susp_dataframe')
    else:
        source_dataframe, susp_dataframe, dataset_encoding = load_to_pandas()
        if not allow_queries_without_relevants:
            non_plag_susp = susp_dataframe[pd.isnull(susp_dataframe.source_reference)]
            new_susp_dataframe = susp_dataframe[pd.notnull(susp_dataframe.source_reference)].reset_index()
            new_source_dataframe = source_dataframe.append(non_plag_susp, ignore_index=True)
            del source_dataframe, susp_dataframe, non_plag_susp
        else:
            new_source_dataframe = source_dataframe
            new_susp_dataframe = susp_dataframe
        source_dataframe = new_source_dataframe
        susp_dataframe = new_susp_dataframe

        documents = []
        queries = []
        targets_pairs = []
        queries_names = {}
        for source_id, source_row in new_source_dataframe.iterrows():
            if language_filter != "ALL" and source_row['lang'] != language_filter.lower() and source_row['lang'] != None:
                continue
            documents.append(source_id)
            susp_documents = new_susp_dataframe[
                new_susp_dataframe.source_reference == source_row['reference']]
            '''
            groups by suspicious filename each source plagiarised slices of
            text (pandas indexes).
            '''
            grouped_susp_dataframe = susp_documents.groupby(['reference', ])
            ''' selecting one index to represent the query '''
            for query_namei, valuesi in grouped_susp_dataframe.groups.items():
                if query_namei in queries_names.keys():
                    targets_pairs.append([queries_names[query_namei], len(documents) - 1])
                else:
                    queries_names[query_namei] = len(queries)
                    queries.append(valuesi[0])
                    targets_pairs.append([len(queries) - 1, len(documents) - 1])

        if allow_queries_without_relevants:
            empty_queries = susp_dataframe[pd.isnull(susp_dataframe.source_reference)].index.tolist()
            queries = queries + empty_queries
            del empty_queries
        del queries_names

        documents = pd.DataFrame(documents, columns=list(['dataframe_index']))
        queries = pd.DataFrame(queries, columns=list(['dataframe_index']))
        targets_pairs = pd.DataFrame(targets_pairs,
                                     columns=list(['query_row_index', 'document_col_index']))

        queries.to_hdf(files_path, 'queries')
        documents.to_hdf(files_path, 'documents')
        targets_pairs.to_hdf(files_path, 'targets_pairs')
        source_dataframe.to_hdf(files_path, 'source_dataframe')
        susp_dataframe.to_hdf(files_path, 'susp_dataframe')

    ''' loading content from files '''
    col = targets_pairs.loc[:, 'document_col_index'].values.tolist()
    row = targets_pairs.loc[:, 'query_row_index'].values.tolist()
    data = np.ones(targets_pairs.shape[0])

    dataset_target = coo_matrix((data, (row, col)),
                                shape=(queries.shape[0], documents.shape[0]))
    del col, row, data
    dataset_target = dataset_target.tolil()

    # for id_row, source_row in documents.iterrows():
    #     contenti = load_file_content(source_dataframe, source_dataframe.loc[source_row['dataframe_index'], 'reference'], dataset_encoding)
    #     print(id_row, '=>', len(contenti))
    #     if (len(contenti) < 2):
    #         print(source_row)
    #         print(source_dataframe.loc[source_row['dataframe_index'], 'reference'])
    #         print(__file_path(source_dataframe, source_dataframe.loc[source_row['dataframe_index'], 'reference']))
    #         input('empty document!')
    #
    # for id_row, susp_row in queries.iterrows():
    #     contenti = load_file_content(susp_dataframe, susp_dataframe.loc[susp_row['dataframe_index'], 'reference'], dataset_encoding)
    #     print(id_row, '=>', len(contenti))
    #     if (len(contenti) < 2):
    #         print(susp_row)
    #         print(susp_dataframe.loc[susp_row['dataframe_index'], 'reference'])
    #         print(__file_path(susp_dataframe, susp_dataframe.loc[susp_row['dataframe_index'], 'reference']))
    #         input('empty query!')

    documents_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(
            source_dataframe,
            source_dataframe.loc[source_row['dataframe_index'], 'reference'],
            dataset_encoding) for _, source_row in documents.iterrows())
    queries_content = Parallel(n_jobs=2, backend="threading", verbose=1)(
        delayed(load_file_content)(
            susp_dataframe,
            susp_dataframe.loc[susp_row['dataframe_index'], 'reference'],
            dataset_encoding) for _, susp_row in queries.iterrows())
    del source_dataframe, susp_dataframe

    documents = pd.DataFrame({
        'dataframe_index': documents.loc[:, 'dataframe_index'].values.tolist(),
        'content': documents_content
    })
    queries = pd.DataFrame({
        'dataframe_index': queries.loc[:, 'dataframe_index'].values.tolist(),
        'content': queries_content
    })
    del documents_content, queries_content

    return queries, documents, dataset_target, dataset_encoding

def extract_all_features_for_2(data, n_jobs=-1):
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    feature_matrix = parallel(
        delayed(extract_features_2)(d, ALL_FEATURES) for d in data)
    return np.array(feature_matrix)

def dash(k, r, ep, OPT, X, alpha, debug=True, parallel=False):
    m = 5
    S = []
    # cache values for plotting
    y_adap = []
    # for r iterations
    for i in range(r):
        T = []
        print(i)
        fS = oracle(S)
        fST = oracle(union(S, T))
        while ((fST - fS) < (ep / 20) * (OPT - fS)) and (len(union(S, T)) < k) and len(X) > 1:
            # FILTER Step
            # this only changes X
            vs = estimateSet(X, union(S, T), k, r, m)
            while (vs < alpha**(2) * (1 - ep) * (OPT - fST) / r) and (len(X) > 0):
                # get marginal contribution
                if parallel:
                    marg_a = Parallel(n_jobs=-1)(
                        delayed(estimateMarginal)(X, union(S, T), a, k, r, m)
                        for a in X)
                else:
                    marg_a = [
                        estimateMarginal(X, union(S, T), a, k, r, m) for a in X
                    ]
                Xnew = [
                    X[idx] for idx, el in enumerate(marg_a)
                    if el >= alpha * (1 + ep / 2) * (1 - ep) * (OPT - fST) / k
                ]
                X = Xnew
                vs = estimateSet(X, union(S, T), k, r, m)
            # update sets
            R = randomSample(X, k / r)
            T = union(T, R)
            # T changes but S doesn't
            fST = oracle(union(S, T))
            # outer loop numbers
            print('Outer Loop: Val')
            print(len(union(S, T)))
            print(fST)
        S = union(S, T)
        fS = oracle(S)
        end = time.time()
        timePassed = end - start
        if 'bayes' in dat:
            y_adap.append((len(S), obj_fun(S), timePassed, fS))
        else:
            y_adap.append((len(S), obj_fun(X1[:, S], y_cat), timePassed, fS))
    return y_adap

    }]

    filename = 'results-m{}-n{}-tt{}.csv'.format(m, n, tt)
    print(filename)
    df = pd.DataFrame.from_records(data, columns=[
        'n', 'm', 'tt', 'ballots', 'alpha', 'k_star', 'theta_star', 'k_final',
        'k_greedy'
    ])

    if S3:
        # s3
        import boto3
        csv_buffer = BytesIO()
        df.to_csv(csv_buffer, encoding='utf-8')
        s3_resource = boto3.resource('s3')
        s3_resource.Object('bribery', filename).put(Body=csv_buffer.getvalue())
    else:
        abs_path = os.path.join(folder, filename)
        df.to_csv(abs_path, index=False)


if __name__ == '__main__':
    # for n in [2, 4, 8]:
    #     for m in [2, 4, 8]:
    #         alpha = borda(m)
    #         for tt in range(16):
    #             run_single(m, n, tt, alpha)
    Parallel(n_jobs=-1)(delayed(run_single)(m, n, tt, borda(m))
                        for n in [2, 4, 8, 16, 32]
                        for m in [2, 4, 8, 16]
                        for tt in range(16))

        allow_queries_without_relevants=False)
elif dataset_name == "pan11":
    corpus_name, (
        suspicious_info, source_info, target, dataset_encoding
    ) = dataset_name, pan_plagiarism_corpus_2011_extractor.load_as_ir_task(
        allow_queries_without_relevants=False, language_filter="EN")
elif "pan10" in dataset_name and "-samples" in dataset_name:
    corpus_name, (
        suspicious_info, source_info, target, dataset_encoding
    ) = dataset_name, pan_plagiarism_corpus_2010_extractor.load_sample_as_ir_task(
        sample_size, language_filter="EN")

print('queries:', suspicious_info.shape, ' Documents:', source_info.shape)

documents = Parallel(n_jobs=-1, backend="threading", verbose=1)(
    delayed(encode_dataframe_content)(si, dataset_encoding)
    for si in source_info['content'].values)
queries = Parallel(n_jobs=-1, backend="threading", verbose=1)(
    delayed(encode_dataframe_content)(si, dataset_encoding)
    for si in suspicious_info['content'].values)
del suspicious_info, source_info

print(nns_df_paramaters)
print(lsht_df_paramaters)
# exit()

''' using scikit-learn : tokenization '''

def fit(self, X, y=None, sample_weight=None, exposure=None):
    cv = check_cv(self.cv, X=X, y=y, classifier=is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    combiner = check_score_combiner(self.estimator, self.score_combiner)
    parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch)
    n_features = X.shape[1]
    data = self._process_args(X=X, y=y, sample_weight=sample_weight,
                              exposure=exposure)
    feature_deletion_scores = []

    # Get cross-validated scores with all features present
    data_ = data.copy()
    col_X = self._baseline_feature_subset(X, n_features)
    data['X'] = col_X
    full_scores = parallel(
        delayed(_fit_and_score)(clone(self.estimator), data_, scorer, train, test)
        for train, test in cv)
    self.score_ = combiner(full_scores)

    # For each feature, remove that feature and get the cross-validation scores
    for col in range(n_features):
        col_X = self._feature_subset(X, n_features, col)
        data_ = data.copy()
        data_['X'] = col_X
        scores = parallel(
            delayed(_fit_and_score)(clone(self.estimator), data_, scorer, train, test)
            for train, test in cv)

        # test_features = np.ones(shape=n_features, dtype=bool)
        # if col_X is not None:
        #     data_ = data.copy()
        #     data_['X'] = col_X
        #     scores = parallel(delayed(_fit_and_score)(clone(self.estimator), data_, scorer,
        #                                               train, test)
        #                       for train, test in cv)
        #
        # if n_features > 1:
        #     test_features[col] = False
        #     data_['X'] = X[:, test_features]
        #     scores = parallel(delayed(_fit_and_score)(clone(self.estimator), data_, scorer,
        #                                               train, test)
        #                       for train, test in cv)
        # elif self.check_constant_model:
        #     # If there's only one feature to begin with, do the fitting and scoring on a
        #     # constant predictor.
        #     data_['X'] = np.ones(shape=(X.shape[0], 1))
        #     scores = parallel(delayed(_fit_and_score)(clone(self.estimator), data_, scorer,
        #                                               train, test)
        #                       for train, test in cv)
        # else:
        #     scores = full_scores
        score = combiner(scores)
        feature_deletion_scores.append(score)

    # Higher scores are better. Higher feature importance means the feature is
    # more important. This code reconciles these facts.
    self.feature_importances_ = self._calc_importances(
        np.array(feature_deletion_scores), self.score_)

    # Finally, fit on the full data set
    self.estimator_ = clone(self.estimator).fit(**data)

    # A fit method should always return self for chaining purposes
    return self