def preprocess(pd):
    """Lowercase, lemmatise, filter, stem and re-join every text in a Series.

    Args:
        pd: Despite its name this is not the pandas module but the object
            holding the texts; it must expose ``.str`` and ``.apply`` the
            way a pandas Series of strings does.

    Returns:
        The transformed object: one space-joined string per input row,
        with each of the characters ``$ < > ? @ ` ' "`` replaced by a
        space.

    Relies on the module-level names ``lemmatizer``, ``w_tokenizer``,
    ``removing_words`` and ``stemmer``.
    """
    lowered = pd.str.lower()
    lemmas = lowered.apply(
        lambda text:
        [lemmatizer.lemmatize(tok) for tok in w_tokenizer.tokenize(text)])
    kept = lemmas.apply(
        lambda toks: [tok for tok in toks if tok not in removing_words])
    stems = kept.apply(lambda toks: [stemmer.stem(tok) for tok in toks])
    joined = stems.str.join(' ')
    # The pattern is a regex character class.  pandas >= 2.0 treats the
    # pattern literally unless regex=True is given, which would silently
    # leave every symbol in place, so the mode is spelled out.
    return joined.str.replace(
        '[{}]'.format('$<>?@`\'"'), ' ', regex=True)
def preprocess_lite(pd):
    """Strip punctuation, convert numbers and lemmatise each text in a Series.

    Args:
        pd: Despite its name this is not the pandas module but the object
            holding the texts; it must expose ``.str`` and ``.apply`` the
            way a pandas Series of strings does.

    Returns:
        The transformed object holding, per row, a *list* of lemmatised
        tokens longer than one character (the tokens are not re-joined).

    Relies on the module-level names ``w_tokenizer``, ``convert_numbers``
    and ``lemmatizer``.
    """
    lowered = pd.str.lower()
    # The pattern is a regex character class covering ASCII punctuation,
    # newline and tab.  pandas >= 2.0 matches patterns literally unless
    # regex=True is passed, so the mode is made explicit.
    cleaned = lowered.str.replace(
        '[{}]'.format('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n\t'),
        ' ',
        regex=True)
    tokens = cleaned.apply(lambda text: list(w_tokenizer.tokenize(text)))
    converted = tokens.apply(convert_numbers)
    # Re-join and tokenise again so that whatever convert_numbers produced
    # goes through the tokenizer before lemmatisation.
    rejoined = converted.str.join(' ')
    lemmas = rejoined.apply(
        lambda text:
        [lemmatizer.lemmatize(tok) for tok in w_tokenizer.tokenize(text)])
    return lemmas.apply(lambda toks: [tok for tok in toks if len(tok) > 1])
def callLambda(self, item, reload=False):
    """Compute the derived column ``item`` on every per-year frame.

    ``item`` is expected to look like ``"<prefix>@<field>"``; the part
    after the first ``@`` selects the entries of the module-level
    ``prereqMaster`` and ``lambdaMaster`` tables.  Items without an ``@``
    are ignored.

    Args:
        item: Name of the column to compute.
        reload: When true, recompute even if ``item`` is already a column
            of ``self.mainPanda``.

    Returns:
        None.  ``self.dictpandas`` is updated in place.
    """
    # Membership test: the previous identity comparison against the column
    # array was always true, which made ``reload`` meaningless.
    if item in self.mainPanda.columns and not reload:
        return
    if "@" not in item:
        return

    parts = item.split("@")
    field = parts[1]

    # Make sure everything the row function depends on is computed first.
    for prq in prereqMaster[field]:
        self.callitem(prq)

    for an in self.anhos:
        frame = self.dictpandas[an]
        # Apply row-wise on the frame itself; the pandas module has no
        # ``apply`` function.
        # NOTE(review): the result of ``apply`` replaces the stored frame
        # wholesale, as before -- confirm the row function returns full
        # rows rather than a single value.
        self.dictpandas[an] = frame.apply(lambdaMaster[field], axis=1)
def preprocess(pd):
    """Reduce every text in a Series to its lemmatised nouns and adjectives.

    Steps, in order: lowercase, replace every non-letter with a space,
    tokenise, lemmatise (default part of speech, then as verb), drop stop
    words, drop tokens of three characters or fewer, keep only tokens
    whose ``nltk.pos_tag`` tag is an adjective or noun tag, and join the
    survivors with single spaces.

    Args:
        pd: Despite its name this is not the pandas module but the object
            holding the texts; it must expose ``.str`` and ``.apply`` the
            way a pandas Series of strings does.

    Returns:
        The transformed object with one space-joined string per row.

    Relies on the module-level names ``w_tokenizer``, ``lemmatizer``,
    ``stop_words`` and ``nltk``.
    """
    kept_tags = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')

    lowered = pd.str.lower()
    # '[^a-zA-Z]' is a regex; pandas >= 2.0 matches patterns literally
    # unless regex=True is passed, so the mode is made explicit.
    letters_only = lowered.str.replace('[^a-zA-Z]', ' ', regex=True)
    tokens = letters_only.apply(
        lambda text: list(w_tokenizer.tokenize(text)))
    rejoined = tokens.str.join(' ')
    lemmas = rejoined.apply(
        lambda text:
        [lemmatizer.lemmatize(tok) for tok in w_tokenizer.tokenize(text)])
    verb_lemmas = lemmas.apply(
        lambda toks: [lemmatizer.lemmatize(tok, 'v') for tok in toks])
    no_stops = verb_lemmas.apply(
        lambda toks: [tok for tok in toks if tok not in stop_words])
    long_enough = no_stops.apply(
        lambda toks: [tok for tok in toks if len(tok) > 3])
    nouns_adjs = long_enough.apply(
        lambda toks:
        [word for word, tag in nltk.pos_tag(toks) if tag in kept_tags])
    return nouns_adjs.apply(" ".join)
def calcular_proporcion_facturacion_inferior_25(pd):
    """Attach, per ``grupo``, the share of rows flagged as below the 25th percentile.

    A boolean-like column ``menor_percentil25`` is added to the input (the
    input frame is modified in place).  For each ``grupo`` the flags are
    summed and divided by ``count - 1``; NaN results are replaced by 0.
    The per-group value is then merged back onto every row.

    Args:
        pd: Despite its name this is not the pandas module but a DataFrame
            with at least the columns ``grupo`` and ``percentil25``.

    Returns:
        A new DataFrame: the input rows (including ``menor_percentil25``)
        plus the column ``porcentaje_fact_menor_percentil_25``.

    Relies on the module-level helper ``is_menor_25porciento``.
    """
    # NOTE(review): both arguments are the same column, so the helper
    # compares the percentile with itself -- the first argument was
    # presumably meant to be the invoicing amount; confirm with the caller.
    pd['menor_percentil25'] = pd.apply(
        lambda fila: is_menor_25porciento(fila['percentil25'],
                                          fila['percentil25']),
        axis=1)

    # Aggregate the single Series so the result keeps flat column names
    # ('sum', 'count').  Aggregating the two-column frame produced
    # two-level columns, and merging those into the single-level input is
    # rejected by current pandas.
    resumen = (pd.groupby('grupo')['menor_percentil25']
               .agg(['sum', 'count'])
               .reset_index())

    # The denominator excludes one row per group.  A one-row group divides
    # by zero: 0/0 gives NaN (mapped to 0 below), while a positive sum
    # gives inf, which fillna leaves untouched.
    proporcion = resumen['sum'] / (resumen['count'] - 1)
    resumen['porcentaje_fact_menor_percentil_25'] = proporcion.fillna(0)

    return pd.merge(
        resumen[['grupo', 'porcentaje_fact_menor_percentil_25']],
        how='inner',
        on='grupo')
def quantize(x, step):
    """Return the signed number of ``step``-sized bins nearest to ``x``.

    The magnitude ``|x| / step`` is rounded to the nearest integer with
    halves rounded away from zero, and the sign of ``x`` is restored.  The
    result is the bin index, not the value scaled back by ``step``.

    Args:
        x: A scalar, NumPy array or pandas Series of numbers.
        step: Width of one quantisation bin.

    Returns:
        ``sign(x) * floor(|x| / step + 0.5)``, with the same shape as ``x``.
    """
    # NumPy ufuncs work element-wise on scalars, arrays and Series alike,
    # so no ``apply`` wrapper is needed.
    sign = np.sign(x)
    quantized = np.floor(np.abs(x) / step + 0.5)
    return sign * quantized
# Scratch script: build a small frame of numeric strings and add a product
# column computed row by row.
df.dtypes  # bare attribute access; has no effect outside an interactive session
dates = pd.date_range("20161201", periods=7)
# NOTE(review): from here on ``pd`` is the DataFrame below and no longer the
# pandas module; any later ``pd.<function>`` call will hit the frame.  The
# name is kept because code after this block may rely on it.
pd = pd.DataFrame([
    {
        "a": "1",
        "b": "2",
        "c": "3"
    },
    {
        "a": "2",
        "b": "5",
        "c": "11"
    },
    {
        "a": "3",
        "b": "2",
        "c": "30"
    },
    {
        "a": "4",
        "b": "5",
        "c": "3"
    },
])
ss = {"1": "11111", "2": "2222", "3": "3333", "4": "4444"}
# Values are stored as strings, so convert before multiplying.
pd["d"] = pd.apply(lambda x: int(x["a"]) * int(x["b"]), axis=1)
# Function-call form: valid on Python 3 and, for a single argument,
# prints the same thing on Python 2.
print(pd.columns)
def pd_to_int(h2o, pd):
    """Pair ``h2o`` with a 0/1 encoding of ``pd``.

    Args:
        h2o: Returned unchanged as the first element of the result.
        pd: Despite its name this is not the pandas module but an object
            with an ``apply`` method (e.g. a pandas Series); each value it
            passes to the callback becomes 1 when truthy and 0 otherwise.

    Returns:
        A tuple ``(h2o, encoded)``.
    """

    def _as_flag(value):
        if value:
            return 1
        return 0

    encoded = pd.apply(_as_flag)
    return h2o, encoded
def funtionICP(X, Y, ExpInd, alpha=0.1, mode="asymptotic", intercept=False):
    """Estimate coefficients, confidence intervals and p-values across environments.

    NOTE(review): judging by the error messages and the trailing comment
    this appears to be a partial, statement-by-statement port of an R
    routine called ``hiddenICP``.  Several statements still use R idioms
    that fail in Python; they are flagged inline and must be resolved
    against the reference implementation before this function can run.

    Args:
        X: Predictors; must be a ``np.ndarray`` or ``pd.DataFrame`` with
            more rows than columns.
        Y: Response; must be a ``np.ndarray``.
        ExpInd: Environment indicator.  Either a non-list vector with one
            label per observation, or a list (validated by the ``else``
            branch below).
        alpha: Level used when computing ``addvar`` via ``norm.ppf``.
        mode: Not referenced anywhere in the body.
        intercept: When true, a column of ones is prepended to ``X`` and
            the results are trimmed again before returning.

    Returns:
        ``retobj``, built from betahat, maximinCoefficients, ConfInt,
        pvalues, colnames and alpha (see the note on the construction).

    Raises:
        ValueError: For an unsupported type of ``X`` or ``Y``, or when
            ``X`` has no more rows than columns.
        Exception: For the ``ExpInd`` validation failures below.
    """
    # NOTE(review): Python lists have no ``isnumeric`` method and
    # ``np.asmatrix`` takes no ``ncol`` argument, so a list input raises
    # here instead of being converted.
    if isinstance(X, list) and X.isnumeric():
        X = np.asmatrix(X, ncol=1)
    if not isinstance(X, np.ndarray) and not isinstance(X, pd.DataFrame):
        raise ValueError("'X' must be a matrix or data frame")
    if not isinstance(Y, np.ndarray):
        raise ValueError("'Y' must be a vector")
    if X.shape[0] <= X.shape[1]:
        raise ValueError(
            "hiddenICP not suitable for high-dimensional data (at the moment) \n -- need row > column but have nrow(X)= {} and ncol(X)={}"
            .format(X.shape[0], X.shape[1]))
    if not isinstance(ExpInd, list):  # ExpInd is a vector of per-observation labels
        if len(ExpInd) != len(Y):
            raise Exception(
                "if `ExpInd' is a vector, it needs to have the same length as `Y'"
            )
        uni = np.unique(ExpInd)
        if len(uni) == 1:
            # NOTE(review): ``uni`` has exactly one element here, so
            # ``uni[1]`` raises IndexError before the message is built.
            raise Exception(
                "There is just one environment ('ExpInd'= {} for all observations) and the method needs at least two distinct environments sep = "
                .format(uni[1]))
        # NOTE(review): ``min`` over a Counter iterates its keys, so this
        # compares the smallest label, not the smallest per-environment
        # count that the message below talks about.
        if min(Counter(ExpInd)) <= 2:
            print("\nOut put of 'table(ExpInd)':\n ")
            print(Counter(ExpInd))
            raise Exception(
                "one environment has just one or two observations (as supllied by 'ExpInd'); there need to be at least 3 (and ideally dozens) of observations in each environment; the out put of 'table(ExpInd)' is given below to show the number of observations in each unique environment as supplied by 'ExpInd'"
            )
        K = len(uni)
        ExpIndNEW = list()
        for uc in range(0, K):
            # NOTE(review): assigning by index into an empty list raises
            # IndexError; ``np.where`` with one argument returns a tuple,
            # which does not accept new attributes via ``setattr``.
            ExpIndNEW[uc] = np.where(ExpInd == uni[uc])
            setattr(ExpIndNEW[uc], "value", uni[uc])
        ExpInd = ExpIndNEW  # from here on ExpInd is a list, one entry per environment
        del ExpIndNEW
    else:  # ExpInd was passed as a list
        if min(ExpInd) < 1:
            raise Exception(
                "if `ExpInd' is a list with indicies of observations, \n minimal entry has to be at least 1 but is {}"
                .format(min(ExpInd)))
        if max(ExpInd) > len(Y):
            raise Exception(
                "if `ExpInd' is a list with indicies of observations, \n maximal entry has to be at most equal \n to the length {} of the observations but is {}"
                .format(len(Y), max(ExpInd)))
    X = pd.DataFrame(X)
    # The bare string below is untranslated R-style code kept as an inert
    # expression statement; it is evaluated and discarded.
    '''if len(ucol = set(X.shape[1])) < 
    min(3, X.shape[1]) : colnames(X) = paste("Variable",1:X.shape[1],sep="_")'''
    colX = X.columns
    if intercept:
        # Prepend a constant column of ones.
        X = np.column_stack((np.repeat(1, X.shape[0]), X))
    K = len(ExpInd)
    p = X.shape[1]
    n = X.shape[0]
    kc = 1
    # With exactly two environments a single pass is made.
    if K > 2:
        KC = K
    else:
        KC = 1
    # NOTE(review): ``np.zeros`` receives ``shape`` both positionally (0)
    # and by keyword, which raises TypeError.
    ConfInt = np.zeros(0, shape=[2, p])
    pvalues = np.repeat(1, p)
    for kc in range(0, KC):
        ins = ExpInd[kc]
        # NOTE(review): this indexes the two-element tuple ``(1, n)``;
        # presumably meant "all observations except ``ins``" -- confirm.
        out = (1, n)[-ins]
        # Differences between the ``ins`` and ``out`` terms, each divided
        # by its length.
        # NOTE(review): ``X[ins:]`` is a slice starting at ``ins``, not a
        # selection of the rows listed in ``ins`` -- confirm intent.
        DS = (np.transpose(X[ins:]).dot(X[ins:])) / len(ins) - (np.transpose(
            X[out:]).dot(X[out:])) / len(out)
        Drho = (np.transpose(X[ins:]).dot(Y[ins])) / len(ins) - (np.transpose(
            X[out:]).dot(Y[out])) / len(out)
        # NOTE(review): DSI and betahat are computed by the identical
        # expression; DSI was presumably meant to be the inverse of DS.
        DSI = np.linalg.solve(DS, Drho)
        betahat = pd.to_numeric(np.linalg.solve(DS, Drho))
        # NOTE(review): the loop index starts at 0, so on the first pass
        # the ``else`` branch reads ``betahatall`` before it is assigned.
        if kc == 1:
            betahatall = betahat
        else:
            betahatall = betahatall + betahat
        Zin = np.zeros(shape=[len(ins), p])
        Zout = np.zeros(shape=[len(out), p])
        for i in range(0, len(ins)):
            tmp = DSI * X[ins[i], ]
            Zin[i, ] = pd.to_numeric(-tmp * sum(tmp * Drho) + Y[ins[i]] * tmp)
        for i in range(0, len(out)):
            tmp = DSI * X[out[i], ]
            Zout[i, ] = pd.to_numeric(-tmp * sum(tmp * Drho) + Y[out[i]] * tmp)
        # NOTE(review): ``math.sqrt`` and the built-in ``min``/``max`` are
        # scalar functions but are handed array-valued expressions below;
        # element-wise NumPy equivalents were presumably intended.
        sigmavec = math.sqrt(
            np.diag((np.cov(Zin) / len(ins) + np.cov(Zout) / len(out))))
        # NOTE(review): ``norm`` is presumably scipy.stats.norm, whose
        # ``cdf`` takes no ``df`` argument -- confirm which distribution
        # was intended.
        pvalues = min(
            pvalues, 2 * K *
            (1 - norm.cdf(abs(betahat) / max(pow(10, -10), sigmavec), df=n - 1)))
        addvar = norm.ppf(max(0.5, 1 - alpha / (2 * K))) * sigmavec
        maximineffectsN = np.sign(betahat) * max(0, abs(betahat) - addvar)
        # NOTE(review): ``ConfInt[1:]`` / ``ConfInt[2:]`` are slices from
        # rows 1 and 2 onward of a two-row array, and the built-in
        # ``max``/``min`` get ``True`` as a third positional argument --
        # confirm the intended rows and keyword.
        ConfInt[1:] = max(ConfInt[1:], betahat - addvar, True)
        ConfInt[2:] = min(ConfInt[2:], betahat + addvar, True)
        if kc == 1:
            maximineffects = maximineffectsN
        else:
            # Keep, per variable, the effect with the larger magnitude.
            # NOTE(review): the range runs from 1 to p inclusive, so it
            # skips index 0 and reaches index p.
            for varc in range(1, p + 1):
                if abs(maximineffectsN[varc]) > abs(maximineffects[varc]):
                    maximineffects[varc] = maximineffectsN[varc]
    # Average the accumulated estimates over the passes made.
    betahat = betahatall / KC
    maximinCoefficients = maximineffects
    if intercept:
        # NOTE(review): ``[-1]`` selects the last element and ``[:-1]``
        # drops the last row; presumably these were meant to drop the
        # intercept entry added above -- confirm.
        betahat = betahat[-1]
        maximinCoefficients = maximinCoefficients[-1]
        ConfInt = ConfInt[:-1]
        pvalues = pvalues[-1]
    # NOTE(review): the pandas module has no ``apply`` function;
    # presumably each column of ConfInt should be sorted in decreasing
    # order -- confirm.
    ConfInt = pd.apply(ConfInt, 2, result_type='sort', decreasing=True)
    # NOTE(review): the built-in ``list`` accepts no keyword arguments, so
    # this raises TypeError; a dict was presumably intended.
    retobj = list(betahat=betahat,
                  maximinCoefficients=maximinCoefficients,
                  ConfInt=ConfInt,
                  pvalues=pvalues,
                  colnames=colX,
                  alpha=alpha)
    #class(retobj) <- "hiddenInvariantCausalPrediction"
    return retobj
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) File "pandas/_libs/hashtable_class_helper.pxi", line 1367, in pandas._libs.hashtable.PyObjectHashTable.get_labels TypeError: unhashable type: 'list' >>> pd.get_dummies(Y[1]) She by seashells seashore. sells the 0 1 0 0 0 0 0 1 0 0 0 0 1 0 2 0 0 1 0 0 0 3 0 1 0 0 0 0 4 0 0 0 0 0 1 5 0 0 0 1 0 0 >>> Y[1] ['She', 'sells', 'seashells', 'by', 'the', 'seashore.'] >>> pd.get_dummies <function get_dummies at 0x10dfe1ea0> >>> pd.apply(get_dummies(Y)) Traceback (most recent call last): File "<pyshell#19>", line 1, in <module> pd.apply(get_dummies(Y)) AttributeError: module 'pandas' has no attribute 'apply' >>> Y.apply(get_dummies) Traceback (most recent call last): File "<pyshell#20>", line 1, in <module> Y.apply(get_dummies) NameError: name 'get_dummies' is not defined >>> Y.apply(pd.get_dummies) 0 I by seashells seashore. sell the 0 1... 1 She by seashells seashore. sells the 0... dtype: object
def quantize(x, step):
    """Map ``x`` to its nearest quantisation level, expressed in steps.

    Computes ``sign(x) * floor(|x| / step + 0.5)``: the magnitude is
    rounded to the nearest whole number of steps (halves away from zero)
    and the original sign is re-applied.  The result counts steps; it is
    not multiplied back by ``step``.

    Args:
        x: A scalar, NumPy array or pandas Series of numbers.
        step: Width of one quantisation bin.

    Returns:
        The signed step count, with the same shape as ``x``.
    """
    # Use the NumPy ufuncs directly on ``x``; they are element-wise for
    # arrays and Series and also accept plain scalars.
    levels = np.floor(np.abs(x) / step + 0.5)
    return np.sign(x) * levels
def pd_to_int(h2o, pd):
    """Return ``h2o`` together with ``pd`` converted to 0/1 flags.

    Args:
        h2o: Passed through untouched as the first tuple element.
        pd: Not the pandas module but an object offering ``apply`` (for
            example a pandas Series); the callback turns each value it
            receives into 1 if truthy, else 0.

    Returns:
        The two-element tuple ``(h2o, flags)``.
    """
    flags = pd.apply(lambda value: int(bool(value)))
    return (h2o, flags)