def test_Hungarian(self):
    lectEx = [[14, 5, 8, 7], [2, 12, 6, 5], [7, 8, 3, 9], [2, 4, 6, 10]]
    assign = DataFrame([
        [0, 1, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
    ])
    assign.columns = assign.columns + 1
    assign.index = assign.index + 1
    reducedDF = DataFrame([
        [10, 0, 3, 0],
        [0, 9, 3, 0],
        [5, 5, 0, 4],
        [0, 1, 3, 5],
    ]).applymap(lambda x: float(x))
    reducedDF.columns = reducedDF.columns + 1
    reducedDF.index = reducedDF.index + 1
    h = OR.Hungarian.new(lectEx)
    j = h.solve(echo=False)
    assert j.reducedDF.equals(reducedDF)
    assert j.assignedDF.equals(assign)
def etl(self, test_data, agg_time, type):
    from numpy import nan
    if isinstance(test_data, list):
        data = DataFrame(test_data)
        if type == 0:
            data.columns = [
                'serial_number', 'manufacturer', 'vendor', 'collect_time',
                'mca_id', 'transaction'
            ]
        elif type == 1:
            data.columns = [
                'collect_time', '1_hwerr_f', '1_hwerr_e', '2_hwerr_c',
                '2_sel', '3_hwerr_n', '2_hwerr_s', '3_hwerr_m', '1_hwerr_st',
                '1_hw_mem_c', '3_hwerr_p', '2_hwerr_ce', '3_hwerr_as', '1_ke',
                '2_hwerr_p', '3_hwerr_kp', '1_hwerr_fl', '3_hwerr_r',
                '_hwerr_cd', '3_sup_mce_note', '3_cmci_sub', '3_cmci_det',
                '3_hwerr_pi', '3_hwerr_o', '3_hwerr_mce_l', 'serial_number',
                'manufacturer', 'vendor'
            ]
        elif type == 2:
            data.columns = [
                'serial_number', 'manufacturer', 'vendor', 'memory',
                'rankid', 'bankid', 'collect_time', 'row', 'col'
            ]
        # Round collect_time up to the aggregation window so rows in the same
        # window group together (the original assigned to data[-1], which only
        # created an unused column and left the groupby on raw timestamps).
        data['collect_time'] = pd.to_datetime(data['collect_time']).dt.ceil(agg_time)
        group_data = data.groupby(['serial_number', 'collect_time'],
                                  as_index=False).agg('sum')
        return group_data
    else:
        return None
def _finalize_output(self, frame: DataFrame) -> DataFrame:
    """
    Processes data read in based on kwargs.

    Parameters
    ----------
    frame: DataFrame
        The DataFrame to process.

    Returns
    -------
    DataFrame
        The processed DataFrame.
    """
    num_cols = len(frame.columns)
    multi_index_named = True
    if self.header is None:
        if self.names is None:
            if self.prefix is not None:
                self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
            elif self.header is None:
                self.names = range(num_cols)
        if len(self.names) != num_cols:
            # usecols is passed through to pyarrow, we only handle index col here
            # The only way self.names is not the same length as number of cols is
            # if we have int index_col. We should just pad the names(they will get
            # removed anyways) to expected length then.
            self.names = list(
                range(num_cols - len(self.names))) + self.names
            multi_index_named = False
        frame.columns = self.names
    # we only need the frame not the names
    # error: Incompatible types in assignment (expression has type
    # "Union[List[Union[Union[str, int, float, bool], Union[Period, Timestamp,
    # Timedelta, Any]]], Index]", variable has type "Index")  [assignment]
    frame.columns, frame = self._do_date_conversions(  # type: ignore[assignment]
        frame.columns, frame)
    if self.index_col is not None:
        for i, item in enumerate(self.index_col):
            if is_integer(item):
                self.index_col[i] = frame.columns[item]
            else:
                # String case
                if item not in frame.columns:
                    raise ValueError(f"Index {item} invalid")
        frame.set_index(self.index_col, drop=True, inplace=True)
        # Clear names if headerless and no name given
        if self.header is None and not multi_index_named:
            frame.index.names = [None] * len(frame.index.names)

    if self.kwds.get("dtype") is not None:
        try:
            frame = frame.astype(self.kwds.get("dtype"))
        except TypeError as e:
            # GH#44901 reraise to keep api consistent
            raise ValueError(e)
    return frame
def _finalize_output(self, frame: DataFrame) -> DataFrame:
    """
    Processes data read in based on kwargs.

    Parameters
    ----------
    frame: DataFrame
        The DataFrame to process.

    Returns
    -------
    DataFrame
        The processed DataFrame.
    """
    num_cols = len(frame.columns)
    multi_index_named = True
    if self.header is None:
        if self.names is None:
            if self.prefix is not None:
                self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
            elif self.header is None:
                self.names = range(num_cols)
        if len(self.names) != num_cols:
            # usecols is passed through to pyarrow, we only handle index col here
            # The only way self.names is not the same length as number of cols is
            # if we have int index_col. We should just pad the names(they will get
            # removed anyways) to expected length then.
            self.names = list(
                range(num_cols - len(self.names))) + self.names
            multi_index_named = False
        frame.columns = self.names
    # we only need the frame not the names
    frame.columns, frame = self._do_date_conversions(frame.columns, frame)
    if self.index_col is not None:
        for i, item in enumerate(self.index_col):
            if is_integer(item):
                self.index_col[i] = frame.columns[item]
            else:
                # String case
                if item not in frame.columns:
                    raise ValueError(f"Index {item} invalid")
        frame.set_index(self.index_col, drop=True, inplace=True)
        # Clear names if headerless and no name given
        if self.header is None and not multi_index_named:
            frame.index.names = [None] * len(frame.index.names)

    if self.kwds.get("dtype") is not None:
        frame = frame.astype(self.kwds.get("dtype"))
    return frame
def get_data_frame(self):
    data = DataFrame(self.data)
    data.columns = ['学校', '考试方式', '院系所', '', '专业', '学习方式',
                    '研究方向', '指导教师', '拟招生人数', '备注']
    data.drop(labels='', axis=1, inplace=True)
    data.to_csv(self.provinceName + "研究生招生信息.csv",
                encoding="utf_8_sig", index=False)
def get_result(self):
    if self._is_series:
        if self.axis == 0:
            new_data = com._concat_compat([x.get_values() for x in self.objs])
            name = com._consensus_name_attr(self.objs)
            return Series(new_data, index=self.new_axes[0],
                          name=name).__finalize__(self, method='concat')
        else:
            data = dict(zip(range(len(self.objs)), self.objs))
            index, columns = self.new_axes
            tmpdf = DataFrame(data, index=index)
            if columns is not None:
                tmpdf.columns = columns
            return tmpdf.__finalize__(self, method='concat')
    else:
        mgrs_indexers = []
        for obj in self.objs:
            mgr = obj._data
            indexers = {}
            for ax, new_labels in enumerate(self.new_axes):
                if ax == self.axis:
                    # Suppress reindexing on concat axis
                    continue

                obj_labels = mgr.axes[ax]
                if not new_labels.equals(obj_labels):
                    indexers[ax] = obj_labels.reindex(new_labels)[1]

            mgrs_indexers.append((obj._data, indexers))

        new_data = concatenate_block_managers(
            mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=True)

        return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(
            self, method='concat')
def get_result(self):
    if self._is_series:
        if self.axis == 0:
            new_data = com._concat_compat([x.get_values() for x in self.objs])
            name = com._consensus_name_attr(self.objs)
            return Series(new_data, index=self.new_axes[0],
                          name=name).__finalize__(self, method='concat')
        else:
            data = dict(zip(range(len(self.objs)), self.objs))
            index, columns = self.new_axes
            tmpdf = DataFrame(data, index=index)
            if columns is not None:
                tmpdf.columns = columns
            return tmpdf.__finalize__(self, method='concat')
    else:
        mgrs_indexers = []
        for obj in self.objs:
            mgr = obj._data
            indexers = {}
            for ax, new_labels in enumerate(self.new_axes):
                if ax == self.axis:
                    # Suppress reindexing on concat axis
                    continue

                obj_labels = mgr.axes[ax]
                if not new_labels.equals(obj_labels):
                    indexers[ax] = obj_labels.reindex(new_labels)[1]

            mgrs_indexers.append((obj._data, indexers))

        new_data = concatenate_block_managers(
            mgrs_indexers, self.new_axes, concat_axis=self.axis,
            copy=self.copy)
        if not self.copy:
            new_data._consolidate_inplace()

        return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(
            self, method='concat')
def get_trends(self, df: DataFrame):
    ans = {}
    df["date"] = pd.to_datetime(
        df["date"]).apply(lambda x: (datetime.today() - x).days)
    df.columns = df.columns.to_series().replace({"date": "days"})
    df = df[df["days"] <= 720]
    drop_asins = (df.groupby("asin").size().sort_values()[
        df.groupby("asin").size().sort_values() <= 60])
    df = df[~df["asin"].isin(drop_asins.index)]
    for index in range(df["category_id"].nunique()):
        current_category_id = df["category_id"].unique()[index]
        sumcoef = []
        for asin in df[df["category_id"] == current_category_id]["asin"].unique():
            try:
                sts = sm.tsa.seasonal_decompose(
                    df[df["asin"] == f"{asin}"]['ranking'], period=30)
                X = sts.trend[sts.trend.notna()].index
                y = sts.trend[sts.trend.notna()].values.astype("int")
                model = linear_model.LinearRegression()
                model.fit(np.array(X).reshape(-1, 1), y)
                sumcoef.append(model.coef_)
            except:
                print(f"An exception occurred with asin {asin}")
        ans[current_category_id] = np.array(sumcoef).mean()
    finans = {
        k: v
        for k, v in sorted(ans.items(), key=lambda item: item[1])
    }
    return finans
def processCSVMatrix(file):
    with open(file, 'r') as csvfile:
        dialect = Sniffer().sniff(csvfile.readline())
    df = DataFrame()
    for chunk in read_csv(file, sep=dialect.delimiter, mangle_dupe_cols=True,
                          index_col=False, chunksize=1000):
        df = concat([df, chunk], ignore_index=True)
    nodes = df.columns.values.tolist()
    nodes.pop(0)
    df["Unnamed: 0"] = nodes
    df = df.rename(columns={'Unnamed: 0': 'name'})
    df = df.set_index(keys='name')

    # Remove underscores in names
    names = df.columns.tolist()
    names = [name.replace('_', ' ') for name in names]
    df.columns = names
    df.set_index([df.columns], inplace=True)
    return df
def classifyTestData(testFilePath, modelRoot):
    """
    This method calls the traverseDecisionTreeModel() to classify the test data on the trained model
    and generate Confusion matrix and error at the given depth
    :param testFilePath: Path to the test file
    :param modelRoot: Root node of the decision tree of the trained model
    """
    correctlyClassifiedInstances = 0
    incorrectlyClassifiedInstances = 0
    testDataList = []
    input = open(testFilePath, 'rU')
    csvObject = csv.reader(input)
    label = featureList[len(featureList) - 1]
    classLabels = featureAndValueMapping.get(label)
    classLabelCount = len(classLabels)
    ConfusionMatrix = [[0 for x in range(int(classLabelCount))] for x in range(int(classLabelCount))]
    for row in csvObject:
        # classify against the trained model root that was passed in
        predictedLabel = traverseDecisionTreeModel(row, modelRoot)
        ConfusionMatrix[int(row[len(row) - 1]) - 1][int(predictedLabel) - 1] += 1
        if predictedLabel == row[len(row) - 1]:
            correctlyClassifiedInstances += 1
        else:
            incorrectlyClassifiedInstances += 1
    df = DataFrame(ConfusionMatrix)
    df.columns = classLabels
    df.index = classLabels
    print "Confusion Matrix :: \n"
    print df
    print "Correctly Classified Instance ", correctlyClassifiedInstances
    print "Incorrectly Classified Instance ", incorrectlyClassifiedInstances
def excel_save(List, sheet_title):
    df = DataFrame(List)
    df.columns = ['排名', 'AV号', 'UP名', '标题', '综合评分', '总播放量', '投币数量', '弹幕总数']
    writer = pd.ExcelWriter('B站{0}综合排行榜前100视频.xlsx'.format(sheet_title))
    df.to_excel(excel_writer=writer, index=False, encoding='utf-8',
                sheet_name=sheet_title)
    writer.save()
    writer.close()
def get_result(self):

    # series only
    if self._is_series:

        # stack blocks
        if self.axis == 0:
            new_data = com._concat_compat([x._values for x in self.objs])
            name = com._consensus_name_attr(self.objs)
            return (Series(new_data, index=self.new_axes[0],
                           name=name,
                           dtype=new_data.dtype)
                    .__finalize__(self, method='concat'))

        # combine as columns in a frame
        else:
            data = dict(zip(range(len(self.objs)), self.objs))
            index, columns = self.new_axes
            tmpdf = DataFrame(data, index=index)
            # checks if the column variable already stores valid column
            # names (because set via the 'key' argument in the 'concat'
            # function call. If that's not the case, use the series names
            # as column names
            if (columns.equals(Index(np.arange(len(self.objs)))) and
                    not self.ignore_index):
                columns = np.array([data[i].name
                                    for i in range(len(data))],
                                   dtype='object')
                indexer = isnull(columns)
                if indexer.any():
                    columns[indexer] = np.arange(len(indexer[indexer]))
            tmpdf.columns = columns
            return tmpdf.__finalize__(self, method='concat')

    # combine block managers
    else:
        mgrs_indexers = []
        for obj in self.objs:
            mgr = obj._data
            indexers = {}
            for ax, new_labels in enumerate(self.new_axes):
                if ax == self.axis:
                    # Suppress reindexing on concat axis
                    continue

                obj_labels = mgr.axes[ax]
                if not new_labels.equals(obj_labels):
                    indexers[ax] = obj_labels.reindex(new_labels)[1]

            mgrs_indexers.append((obj._data, indexers))

        new_data = concatenate_block_managers(
            mgrs_indexers, self.new_axes, concat_axis=self.axis,
            copy=self.copy)
        if not self.copy:
            new_data._consolidate_inplace()

        return (self.objs[0]._from_axes(new_data, self.new_axes)
                .__finalize__(self, method='concat'))
def ListoDF(data):
    if isinstance(data, list):
        Df = DataFrame(data)              # convert to a DataFrame
        Df.columns = Df.iloc[0, :]        # use the first row as the column names
        Df.drop(0, axis=0, inplace=True)  # drop the first row
    else:
        Df = data
    return Df
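# Minimal usage sketch for ListoDF above, assuming `from pandas import DataFrame`
# is already in scope; the sample rows are made up for illustration only.
rows = [['name', 'score'], ['alice', 90], ['bob', 85]]
df = ListoDF(rows)                 # first row becomes the header, the rest become data
print(df.columns.tolist())         # ['name', 'score']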
def output_keywords(wordlist):
    wordlist = list(map(list, wordlist))
    data = DataFrame(wordlist)
    data.columns = ['關鍵詞', '分數']
    data['分數'] = data['分數'] * 10000
    data.sort_values("分數", inplace=True, ascending=False)
    data = data.round({'分數': 2})
    return data
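# Minimal usage sketch for output_keywords above, assuming the input is an iterable
# of (keyword, weight) pairs such as a keyword extractor might return; the sample
# pairs below are invented for illustration only.
pairs = [('數據', 0.0321), ('分析', 0.0154)]
ranked = output_keywords(pairs)    # weights scaled by 10000, sorted descending, rounded to 2 decimals
print(ranked)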
def fix_tax(q):
    # q = opts.i
    taxl = ['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__']
    data1 = pd.read_csv(q, header=0, sep="\t")
    taxwz = data1.columns.tolist().index('taxonomy')  # position of the taxonomy column
    # replace Unknown placeholders
    data1.iloc[:, taxwz].replace("__Unknown_\w*", "__Unknown", regex=True, inplace=True)
    # split the taxonomy string into levels
    data2 = [i.split(";") for i in data1.iloc[:, taxwz]]
    bb = []
    for j in range(len(data2)):
        bb.append(data2[j] + [""] * (7 - len(data2[j])))
    data3 = DataFrame(bb)
    data3.columns = taxl
    regex = re.compile(r'(;s__[a-zA-Z0-9_-]*)', flags=re.IGNORECASE)
    data3['tax'] = data1.iloc[:, taxwz].replace(regex, "")
    data3['OTU_ID'] = data1.iloc[:, 0]
    # pick out the rows that need fixing
    re1 = re.compile(
        "uncultured|Incertae_Sedis$|Unknown|Subgroup_|norank|Family_",
        flags=re.IGNORECASE)
    # rows whose phylum-to-genus levels contain the flagged keywords
    bl1 = data3['tax'].apply(lambda x: bool(re1.search(x)))
    re2 = re.compile(r'[dpcofgs]__', re.I)
    data4_1 = data3.loc[bl1, :].replace(re2, "").drop(['tax'], axis=1)
    # data4_1.to_csv("test.txt",header=0,index=0)
    for i in range(1, data4_1.shape[1] - 2):  # i = 4
        for k in range(data4_1.shape[0]):  # k = 278
            if bool(re.search(data4_1.iloc[k, i] + "$", data4_1.iloc[k, i - 1], re.I)):
                data4_1.iloc[k, i] = data4_1.iloc[k, i - 1]
            elif bool(re.search(re1, data4_1.iloc[k, i])):
                data4_1.iloc[k, i] = data4_1.iloc[k, i - 1] + "_" + data4_1.iloc[k, i]
    for i in range(0, data4_1.shape[1] - 1):  # i = 4
        for k in range(data4_1.shape[0]):  # k = 278
            if data4_1.iloc[k, i] != "":
                data4_1.iloc[k, i] = data4_1.columns[i] + data4_1.iloc[k, i]
    data4_1['tax'] = data4_1.iloc[:, :-1].apply(
        lambda row: ';'.join(row.values.astype(str)), axis=1)  # merge the levels back together
    data4_2 = data3.loc[~bl1, :]
    regex2 = re.compile(r'(_[0-9]*$)')
    data5 = pd.concat([data4_1[data4_2.columns.tolist()],
                       data4_2]).replace(regex2, "")
    data5.index = data5["OTU_ID"]
    data6 = np.array(data5.loc[data1.iloc[:, 0].tolist(), :]).tolist()
    data7 = DataFrame([[data6[pp][-1], ";".join(data6[pp][0:7])]
                       for pp in range(len(data6))])
    data7.replace(";+", ";", regex=True, inplace=True)
    data7.replace(";$", "", regex=True, inplace=True)
    data1.iloc[:, taxwz] = data7.iloc[:, 1]
    data1.to_csv("fix_" + q, sep='\t', na_rep='', index=0)  # ,header=0
def process(self, instrucction, table: DataFrame, name):
    try:
        if isinstance(self.condition, Relop) or isinstance(
                self.condition, LogicalOperators):
            value = self.condition.process(instrucction)
            if isinstance(value, list):
                list_alias = value[0]
                table = self.create_temporal_tables(list_alias, value[2])
                query = value[1]
                table = table.query(query)
                table.columns = self.change_name_column(
                    table.columns.tolist(), value[2])
            else:
                table = table.query(value)
        elif isinstance(self.condition, LikeClause):
            value = self.condition.process(instrucction)
            table = table.query(value)
        elif isinstance(self.condition, Between):
            value = self.condition.process(instrucction)
            table = table.query(value)
        elif isinstance(self.condition, isClause):
            value = self.condition.process(instrucction)
            table = table.query(value)
        elif isinstance(self.condition, InClause):
            value = self.condition.process(instrucction)
            table = table.query(value)
        elif isinstance(self.condition, ExistsClause):
            value = self.condition.process(instrucction)
            try:
                value_aux = value
                result = table.columns.intersection(value_aux.columns)
                list_col = list(result)
                table = table[list_col].isin(value_aux[list_col])
            except:
                desc = "FATAL ERROR, murio porque usaste where con columnas de otra tabla, F"
                ErrorController().add(34, 'Execution', desc, 0, 0)
        elif isinstance(self.condition, list):
            not_c = self.condition[0]
            condition = self.condition[1]
            value = condition.process(instrucction)
            try:
                value_aux = value
                result = table.columns.intersection(value_aux.columns)
                list_col = list(result)
                table = ~table[list_col].isin(value_aux[list_col])
            except:
                desc = "FATAL ERROR, murio porque usaste where con columnas de otra tabla, F"
                ErrorController().add(34, 'Execution', desc, 0, 0)
        # at last, persist the filtered table and return it
        print(table)
        storage_columns(table.values.tolist(), table.columns.tolist(), 0, 0)
        storage_table(table.values.tolist(), table.columns.tolist(), name, 0, 0)
        return table
    except:
        desc = "FATAL ERROR, murio en Where, F"
        ErrorController().add(34, 'Execution', desc, 0, 0)
def names_to_bigquery():
    # upload 2008to2017
    for i in range(len(years)):
        data = pd.read_csv(f'./data/yob{years[i]}.txt')
        data = DataFrame(data)
        data.columns = ['name', 'gender', 'count']
        data.to_gbq(destination_table=f'mm.{years[i]}',
                    project_id=project_id)
        print(f"uploaded{years[i]}")
def get_result(self):

    # series only
    if self._is_series:

        # stack blocks
        if self.axis == 0:
            # concat Series with length to keep dtype as much
            non_empties = [x for x in self.objs if len(x) > 0]
            if len(non_empties) > 0:
                values = [x._values for x in non_empties]
            else:
                values = [x._values for x in self.objs]

            new_data = com._concat_compat(values)
            name = com._consensus_name_attr(self.objs)
            return (Series(new_data, index=self.new_axes[0], name=name,
                           dtype=new_data.dtype)
                    .__finalize__(self, method='concat'))

        # combine as columns in a frame
        else:
            data = dict(zip(range(len(self.objs)), self.objs))
            index, columns = self.new_axes
            tmpdf = DataFrame(data, index=index)
            tmpdf.columns = columns
            return tmpdf.__finalize__(self, method='concat')

    # combine block managers
    else:
        mgrs_indexers = []
        for obj in self.objs:
            mgr = obj._data
            indexers = {}
            for ax, new_labels in enumerate(self.new_axes):
                if ax == self.axis:
                    # Suppress reindexing on concat axis
                    continue

                obj_labels = mgr.axes[ax]
                if not new_labels.equals(obj_labels):
                    indexers[ax] = obj_labels.reindex(new_labels)[1]

            mgrs_indexers.append((obj._data, indexers))

        new_data = concatenate_block_managers(
            mgrs_indexers, self.new_axes, concat_axis=self.axis,
            copy=self.copy)
        if not self.copy:
            new_data._consolidate_inplace()

        return (self.objs[0]._from_axes(new_data, self.new_axes)
                .__finalize__(self, method='concat'))
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string using passed regular expression

    Parameters
    ----------
    pat : string
        Pattern or regular expression
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)

    Notes
    -----
    Compare to the string method match, which returns re.match objects.
    """
    regex = re.compile(pat, flags=flags)
    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")
    elif regex.groups == 1:
        def f(x):
            if not isinstance(x, compat.string_types):
                return None
            m = regex.search(x)
            if m:
                return m.groups()[0]  # may be None
            else:
                return None
    else:
        empty_row = Series(regex.groups * [None])

        def f(x):
            if not isinstance(x, compat.string_types):
                return empty_row
            m = regex.search(x)
            if m:
                return Series(list(m.groups()))  # may contain None
            else:
                return empty_row
    result = arr.apply(f)
    result.replace({None: np.nan}, inplace=True)
    if regex.groups > 1:
        result = DataFrame(result)  # Don't rely on the wrapper; name columns.
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
    else:
        result.name = regex.groupindex.get(0)
    return result
def getPercentile(self, df, trg_percentile):
    percentile = df.quantile(trg_percentile, axis=0)
    percentile_df = DataFrame(percentile)
    column_name = trg_percentile * 100
    percentile_df.columns = [str(column_name)]
    return percentile_df
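# Minimal usage sketch for getPercentile above, assuming pandas is available and
# `obj` stands in for an instance of the owning class (the method only uses its
# arguments); the sample frame is invented for illustration only.
import pandas as pd
frame = pd.DataFrame({'latency_ms': [10, 20, 30, 40, 50]})
p95 = obj.getPercentile(frame, 0.95)   # one-column DataFrame named "95.0" (trg_percentile * 100 as a string)
print(p95)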
def write_to_csv(self):
    nw_df = DataFrame(list(self.lst))
    nw_df.columns = ['Redirect count', 'ssl_classification', 'url_length', 'hostname_length',
                     'subdomain_count', 'at_sign_in_url', 'exe_extension_in_request_url',
                     'exe_extension_in_landing_url', 'ip_as_domain_name',
                     'no_of_slashes_in requst_url', 'no_of_slashes_in_landing_url',
                     'no_of_dots_in_request_url', 'no_of_dots_in_landing_url', 'tld_value',
                     'age_of_domain', 'age_of_last_modified', 'content_length',
                     'same_landing_and_request_ip', 'same_landing_and_request_url']
    frames = [self.df['label'], self.df2['label']]
    new_df = pd.concat(frames)
    new_df = new_df.reset_index()
    nw_df['label'] = new_df['label']
    nw_df.to_csv('dataset1.csv', sep=',', encoding='latin-1')
def Main_Mode_Function(df, GroupKey, JoinKey, Var, Leakage=False):
    tmp = copy.deepcopy(df)
    tmp_list = [list(tmp[JoinKey]),
                list(tmp[GroupKey]),
                list(tmp[Var].astype(str)),
                [0] * tmp.shape[0]]
    data = []
    for j in range(len(tmp_list[0])):
        data.append([tmp_list[i][j] for i in range(len(tmp_list))])
    GroupKey_bool = None
    start_point = 0
    for i in range(len(data)):
        if data[i][1] != GroupKey_bool:
            if (Leakage == True and i > 0):
                save_point = i
                Leakage_Mode(data, mode_dic, start_point, save_point)
                start_point = save_point
            GroupKey_bool = data[i][1]
            mode_dic = {}
            mode_dic[data[i][2]] = mode_dic.get(data[i][2], 0) + 1
            data[i][2] = [data[i][2]]
            data[i][3] = len([data[i][2]])
        else:
            mode_dic[data[i][2]] = mode_dic.get(data[i][2], 0) + 1
            mode_value = Get_Mode(mode_dic)
            data[i][2] = mode_value
            data[i][3] = len(mode_value)
    save_point = i + 1
    Leakage_Mode(data, mode_dic, start_point, save_point)
    data = DataFrame(data)
    if Leakage == False:
        data.columns = [str(JoinKey), str(GroupKey), Var + "_mode", Var + "_mode_count"]
    else:
        data.columns = [str(JoinKey), str(GroupKey), Var + "_mode", Var + "_mode_count", Var + "_mode_Leakage"]
    globals()[Var + "_ModeFrame"] = data
    print("Done: " + Var + "_ModeFrame")
def getTraceData(self, reportShortName, conceptName, CIK, periodType):
    rs = FactDao.getFactValues2(reportShortName=reportShortName,
                                conceptName=conceptName, CIK=CIK,
                                periodType=periodType)
    rows = rs.fetchall()
    if (len(rows) != 0):
        df = DataFrame(rows)
        df.columns = rs.keys()
        trace = go.Scatter(x=df["date_"], y=df["value"], name=conceptName)
        return trace
    else:
        raise Exception("No data found " + conceptName)
def get_result(self):
    if self._is_series and self.axis == 0:
        new_data = com._concat_compat([x.values for x in self.objs])
        name = com._consensus_name_attr(self.objs)
        return Series(new_data, index=self.new_axes[0], name=name)
    elif self._is_series:
        data = dict(itertools.izip(xrange(len(self.objs)), self.objs))
        tmpdf = DataFrame(data, index=self.new_axes[0])
        tmpdf.columns = self.new_axes[1]
        return tmpdf
    else:
        new_data = self._get_concatenated_data()
        return self.objs[0]._from_axes(new_data, self.new_axes)
def bingxing(filename):
    start = end = time.clock()
    host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=ppPj2gyINjoYiqkhsjAnyYDC&client_secret=2Q6tsZrbGsE60pXuoxg5o5AOUDCSMaLP'
    header = {'Content-Type': 'application/json; charset=UTF-8'}
    r = requests.post(host, headers=header)
    r = json.loads(r.text)
    Access_token = r['access_token']
    f = open(filename, 'rb')
    img = base64.b64encode(f.read())
    data = {"image": img, "templateSign": "7dc32854acac2c3bac8d3bb599ceaeca"}
    ocr_host = 'https://aip.baidubce.com/rest/2.0/solution/v1/iocr/recognise?access_token=' + Access_token
    ocr_header = {
        'Content-Type': 'application/x-www-form-urlencoded',
        "apikey": "ppPj2gyINjoYiqkhsjAnyYDC"
    }
    img = requests.post(ocr_host, headers=ocr_header, data=data)
    img = json.loads(img.text)
    ocr_res = img["data"]["ret"]
    sim_res = [i['word'] for i in ocr_res]
    testdata = DataFrame(sim_res[1::2]).T
    testdata.columns = sim_res[0::2]
    # normalize the OCR field names to the names the model was trained on
    # (the original dict listed '嗜酸性粒细胞比' twice; the duplicate key is dropped)
    testdata = testdata.rename(
        columns={
            '中性细胞比率': '中性粒细胞百分比',
            '淋巴细胞(%)': '淋巴细胞百分比',
            '嗜酸性粒细胞比': '嗜酸性粒细胞百分比',
            '中性细胞数': '中性粒细胞计数',
            '淋巴细胞值': '淋巴细胞数计数',
            '单核细胞百分比': '单核细胞',
            '嗜酸性粒细胞': '嗜酸性粒细胞计数',
            '嗜碱性粒细胞': '嗜碱性粒细胞计数',
            '红细胞平均体积': '平均红细胞体积',
            '平均血红蛋白量': '平均血红蛋白',
            '红细胞分布宽度': '红细胞分布宽度变异系数',
            '平均血小板体积': '血小板平均体积',
            '血小板分布宽度': '血小板平均分布宽度'
        })
    testdata = testdata.apply(pd.to_numeric, errors='ignore')
    xtest = testdata[np.array(rowname)[clf.feature_importances_ >= 0.03]]
    # print(xtest)
    prob = model.predict_proba(xtest).tolist()[0]
    if model.predict(xtest):
        print('该人得有肾病,概率为%f' % prob[1])
    else:
        print('该人未得肾病,概率为%f' % prob[0])
    end = time.clock()
    print('运行时间为' + str(end - start) + '秒')
def runModelOnTest(testFilePath):
    classLabels = featureAndValueMapping.get(featureList[len(featureList) - 1])
    classLabelCount = len(classLabels)
    ConfusionMatrix = [[0 for x in range(int(classLabelCount))] for x in range(int(classLabelCount))]
    input = open(testFilePath, 'rU')
    csvObject = csv.reader(input)
    for row in csvObject:
        predictedLabel = classify(row[:len(row) - 1])
        ConfusionMatrix[int(row[len(row) - 1])][int(predictedLabel)] += 1
        # print "Actual label : "+row[len(row)- 1]+"Class label : "+classify(row[:len(row)-1])
    df = DataFrame(ConfusionMatrix)
    df.columns = classLabels
    df.index = classLabels
    print df
def get_idList(self, bw_id=None):
    with open(self.filename, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        idList = [row['bw_id'] for row in reader]
    if self.temp:
        # Drop duplicates to avoid re-crawling posts repeated by reposts;
        # a set() would reorder the ids and break resuming from a checkpoint.
        df = DataFrame(idList)
        df.columns = ['bw_id']
        df = df.drop_duplicates(keep='last')
        idList = df['bw_id']
        idList = idList.tolist()
    if bw_id:
        pos = idList.index(bw_id)  # bw_id must be a string
        idList = idList[pos + 1:]
    return idList
def toDB_record(functionname, list, remark):
    try:
        start = datetime.datetime.now()
        if (functionname != 'get_rise'):
            _excutesql("delete from t_record where date=" + c.DATE +
                       " AND type=" + functionname)
    finally:
        if list:
            df = DataFrame(list)
            df.columns = ['code']
            df.insert(0, 'date', c.DATE)
            df.insert(2, 'type', functionname)
            df.insert(3, 'remark', remark)
            df.to_sql('t_record', c.ENGINE, if_exists='append')
        end = datetime.datetime.now()
        print("TODB: " + str(end - start))
def get_result(self):
    if self._is_series and self.axis == 0:
        new_data = com._concat_compat([x.get_values() for x in self.objs])
        name = com._consensus_name_attr(self.objs)
        new_data = self._post_merge(new_data)
        return Series(new_data, index=self.new_axes[0], name=name)
    elif self._is_series:
        data = dict(zip(range(len(self.objs)), self.objs))
        index, columns = self.new_axes
        tmpdf = DataFrame(data, index=index)
        if columns is not None:
            tmpdf.columns = columns
        return tmpdf
    else:
        new_data = self._get_concatenated_data()
        new_data = self._post_merge(new_data)
        return self.objs[0]._from_axes(new_data, self.new_axes)
def do_load(self):
    all_unimported = IncomingSalesforceRecord.get_unimported()
    object_types = all_unimported.select(IncomingSalesforceRecord.object_type).distinct()
    for obj in object_types:
        unimported_recs = all_unimported.select().where(
            IncomingSalesforceRecord.object_type == obj.object_type)
        unimported_dicts = [json.loads(rec.record) for rec in unimported_recs]
        for d in unimported_dicts:
            d['url'] = d['attributes']['url']
            del d['attributes']
            for k, v in d.iteritems():
                if isinstance(v, dict):
                    d[k] = json.dumps(v)
        df = DataFrame(unimported_dicts)
        df.columns = [colname.lower() for colname in df.columns]
        table_name = 'sf_%s' % (obj.object_type.lower())
        logger.info('Writing records for Salesforce object %s to db table %s' % (obj.object_type, table_name))
        df.to_sql(table_name, self.engine, flavor='postgresql',
                  if_exists='replace', index=False, index_label=None)
def save2DB():
    with open(
            "/Users/admin/Desktop/doc/finance/multifactor/data/industry/sina_config_data.txt",
            'r') as f:
        configstr = f.read().replace("\\'", "'")
    ldict = json.loads(configstr)
    # Shenwan (SWS) level-2 industry nodes
    ind = ldict[1][0][1][3][1]
    allNodes = []
    # for ind2 in ind:
    #     ind3 = ind2[1]
    makeTopNode(ind, allNodes, "热门概念")
    data = {'indcode'}
    # print(ldict)
    pdind = DataFrame(allNodes)
    pdind.columns = [
        'indcode', 'indname', 'level', 'par_indcode', 'par_indname',
        'classname'
    ]
    # pdmean.to_sql('statistic2', engine)
    pdind.to_sql('industry', engine, if_exists='append')
def parse():
    df_data = pd.read_excel('bin.xlsx')
    # head() returns the first 5 rows by default
    data = df_data.head()
    print(data)
    svc_dict = list()
    for row in df_data.itertuples():
        name = getattr(row, '发卡行名称')
        length = getattr(row, '长度')
        val = getattr(row, '取值')
        c_type = getattr(row, '卡种')
        datepat = re.compile(r'\(.*?\)')
        b_name = re.sub(datepat, '1', name.replace('\n', ''))
        body = (b_name.replace('1', ''), length, val, c_type)
        svc_dict.append(body)
    df_list = DataFrame(svc_dict)
    df_list.columns = ['发卡行名称', '长度', '取值', '卡种']
    df_list.to_csv('bin.csv', encoding='utf_8_sig')
    print("finished")
def parse():
    df_list = DataFrame()
    for url in urls:
        response = requests.get(url, headers=headers).text  # response body as a string
        json_str = json.loads(response)
        # top-level title
        title = json_str['info']['title']
        print(title)
        service_path = json_str['paths']
        svc_dict = list()
        for svc, data in service_path.items():
            req = data.get('post')
            req_method = 'post'
            if req == '' or req is None:
                req = data.get('get')
                req_method = 'get'
            if req == '' or req is None:
                req = data.get('put')
                req_method = 'put'
            if req == '' or req is None:
                req = data.get('delete')
                req_method = 'delete'
            if req is not None:
                body = (title, svc, req.get('summary'), req_method)
                svc_dict.append(body)
        if df_list.empty:
            df_list = DataFrame(svc_dict)
        else:
            df_list = df_list.append(DataFrame(svc_dict))
    df_list.columns = ['title', 'url', 'description', 'method']
    if os.path.exists(file_name):
        os.remove(file_name)
    df_list.to_csv('svc.csv', encoding='utf_8_sig')
    print("finished")
def ll_to_grid(ll_data_2g):
    """
    grid_num is numbered starting from 1
    :param ll_data_2g:
    :return:
    """
    # y_box_num = int((haversine(lb_Longitude, lb_Latitude, lb_Longitude, rt_Latitude))/per_len) + 1
    # X_box_num = int((haversine(lb_Longitude, lb_Latitude, rt_Longitude, lb_Latitude))/per_len) + 1
    # print(X_box_num)
    # print(y_box_num)
    # print(ll_data_2g)
    ll_data_2g_list = ll_data_2g.as_matrix().tolist()
    for row in ll_data_2g_list:
        lon = row[2]
        lat = row[3]
        # grid_index = calculate_grid(lb_Latitude, lb_Longitude, lat, lon)
        y_length = haversine(lb_Longitude, lb_Latitude, lb_Longitude, lat)
        X_length = haversine(lb_Longitude, lb_Latitude, lon, lb_Latitude)
        y = int(y_length / per_len)
        X = int(X_length / per_len)
        if y_length % per_len != 0:
            y += 1
        if X_length % per_len != 0:
            X += 1
        grid_num = X + (y - 1) * X_box_num
        row.append(grid_num)
    indexs = ll_data_2g.columns.values.tolist()
    indexs.append('grid_num')
    train_data = DataFrame(ll_data_2g_list)
    train_data.columns = indexs
    # print(train_data)
    return train_data
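# Worked sketch of the grid numbering used in ll_to_grid above, with made-up
# numbers: a point falling in column X and row y (both 1-based, cells of size
# per_len, X_box_num cells per row) gets grid_num = X + (y - 1) * X_box_num.
X_box_num = 100            # assumed grid width in cells
X, y = 3, 2                # e.g. third cell of the second row
grid_num = X + (y - 1) * X_box_num
print(grid_num)            # 103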
def export_analyze_result(self, arg_symbol):
    if not os.path.exists(DEF_EXPORT_FOLDER_NAME):
        os.makedirs(DEF_EXPORT_FOLDER_NAME)
    main_data = DataFrame(list(self.__data))
    main_data.columns = DEF_STOCK_COULMN_NAME
    result_data = []
    result_data.append(main_data)
    result_data += self.__result
    df = pd.concat(result_data, axis=1)
    filename = datetime.today().strftime(arg_symbol + "_%Y%m%d-%H%M%S.csv")
    try:
        df.to_csv(DEF_EXPORT_FOLDER_NAME + os.sep + filename,
                  sep=',', encoding='utf-8')
    except:
        return None
    return filename
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string using passed regular expression

    Parameters
    ----------
    pat : string
        Pattern or regular expression
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)

    Examples
    --------
    A pattern with one group will return a Series. Non-matches will be NaN.

    >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
    0      1
    1      2
    2    NaN
    dtype: object

    A pattern with more than one group will return a DataFrame.

    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')

    A pattern may contain optional groups.

    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)')

    Named groups will become column names in the result.

    >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')

    """
    regex = re.compile(pat, flags=flags)
    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")
    elif regex.groups == 1:
        def f(x):
            if not isinstance(x, compat.string_types):
                return None
            m = regex.search(x)
            if m:
                return m.groups()[0]  # may be None
            else:
                return None
    else:
        empty_row = Series(regex.groups * [None])

        def f(x):
            if not isinstance(x, compat.string_types):
                return empty_row
            m = regex.search(x)
            if m:
                return Series(list(m.groups()))  # may contain None
            else:
                return empty_row
    result = arr.apply(f)
    result.replace({None: np.nan}, inplace=True)
    if regex.groups > 1:
        result = DataFrame(result)  # Don't rely on the wrapper; name columns.
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
    else:
        result.name = regex.groupindex.get(0)
    return result
def parse(self,
          sheet_name=0,
          header=0,
          names=None,
          index_col=None,
          usecols=None,
          squeeze=False,
          dtype=None,
          true_values=None,
          false_values=None,
          skiprows=None,
          nrows=None,
          na_values=None,
          verbose=False,
          parse_dates=False,
          date_parser=None,
          thousands=None,
          comment=None,
          skipfooter=0,
          convert_float=True,
          mangle_dupe_cols=True,
          **kwds):

    _validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(OrderedDict.fromkeys(sheets).keys())

    output = OrderedDict()

    for asheetname in sheets:
        if verbose:
            print("Reading sheet {sheet}".format(sheet=asheetname))

        if isinstance(asheetname, compat.string_types):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        usecols = _maybe_convert_usecols(usecols)

        if sheet.nrows == 0:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = _fill_mi_header(data[row],
                                                         control_row)

                if index_col is not None:
                    header_name, _ = _pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == '' or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        has_index_names = is_list_like(header) and len(header) > 1

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(data,
                                names=names,
                                header=header,
                                index_col=index_col,
                                has_index_names=has_index_names,
                                squeeze=squeeze,
                                dtype=dtype,
                                true_values=true_values,
                                false_values=false_values,
                                skiprows=skiprows,
                                nrows=nrows,
                                na_values=na_values,
                                parse_dates=parse_dates,
                                date_parser=date_parser,
                                thousands=thousands,
                                comment=comment,
                                skipfooter=skipfooter,
                                usecols=usecols,
                                mangle_dupe_cols=mangle_dupe_cols,
                                **kwds)

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)
                elif compat.PY2:
                    output[asheetname].columns = _maybe_convert_to_string(
                        output[asheetname].columns)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def strat_maLong_maShort(
    df=readYahoo("SPY"),
    maLongDays=10,
    maShortDays=3,
    closeCol="Close",
    highCol="High",
    lowCol="Low",
    openCol="Open",
    signOfTrade=1,
    printit=True,
    block=False,
):
    """ execute strategy which enters and exit based on Moving Average crossovers
        Example:
        from pystrats.state_strats import strat_maLong_maShort as ss
        dfretfinal = ss() #strat_maLong_maShort()
        print dfretfinal
        print dfretfinal['ret'].mean()
    """
    close = np.array(df[closeCol])
    high = np.array(df[highCol])
    low = np.array(df[lowCol])
    open = np.array(df[openCol])
    date = np.array(df["Date"])

    ma10 = rolling_mean(close, maLongDays)
    ma9 = rolling_mean(close, maLongDays - 1)
    ma3 = rolling_mean(close, maShortDays)
    ma2 = rolling_mean(close, maShortDays - 1)

    n = len(df)
    nl = n - 1

    # pMa10 = dsInsert(ma10[0:nl],0,None)
    # pMa9 = dsInsert(ma9[0:nl],0,None)
    # pMa3 = dsInsert(ma3[0:nl],0,None)
    # pMa2 = dsInsert(ma2[0:nl],0,None)
    pMa10 = np.insert(ma10[0:nl], 0, None)
    pMa9 = np.insert(ma9[0:nl], 0, None)
    pMa3 = np.insert(ma3[0:nl], 0, None)
    pMa2 = np.insert(ma2[0:nl], 0, None)

    pClose = np.insert(close[0:nl], 0, None)
    pHigh = np.insert(high[0:nl], 0, None)
    pLow = np.insert(low[0:nl], 0, None)

    # initialize state vector
    state = np.array([1] * n)

    # loop
    start_i = maLongDays + 1
    for i in range(start_i, n):
        if (pClose[i] < pMa10[i]) & (state[i - 1] == 1) & (high[i] > pMa9[i]):
            state[i] = 2
        elif (state[i - 1] == 2) & (low[i] > pMa2[i]):
            state[i] = 2
        elif (state[i - 1] == 2) & (low[i] <= pMa2[i]):
            state[i] = 1

    pState = np.insert(state[0:nl], 0, 1)

    # create entry conditions
    # 1. initial entry (state 1 to state 2)
    e1_2 = np.array((pState == 1) & (state == 2))
    e2_2 = np.array((pState == 2) & (state == 2))
    e2_1 = np.array((pState == 2) & (state == 1))

    dfret = DataFrame([date, pHigh, pLow, pClose, pMa10, pMa9, pMa3, pMa2]).T
    dfret.columns = ["Date", "pHigh", "pLow", "pClose", "pMa10", "pMa9", "pMa3", "pMa2"]

    # create daily entry prices
    dailyEntryPrices = np.array([0] * n)
    # default entry
    dailyEntryPrices = asb(dailyEntryPrices, pMa9, e1_2)
    useCloseOnEntry = e1_2 & (low > pMa9)
    dailyEntryPrices = asb(dailyEntryPrices, close, useCloseOnEntry)
    dailyEntryPrices = asb(dailyEntryPrices, pClose, e2_2)
    dailyEntryPrices = asb(dailyEntryPrices, pClose, e2_1)
    dfret["entry"] = dailyEntryPrices

    # create DAILY settle prices, which are either 0 or the Close
    # dfret$Close <- close
    dailySettlePrices = np.array([0] * n)
    dailySettlePrices = asb(dailySettlePrices, close, e1_2)  # <- close[w1_2]
    dailySettlePrices = asb(dailySettlePrices, close, e2_2)  # dailySettlePrices[w2_2] <- close[w2_2]
    dailySettlePrices = asb(dailySettlePrices, pMa2, e2_1)   # dailySettlePrices[w2_1] <- pMa2[w2_1]

    # adjust for situations where the high is below the pMa2, so you get out at the close
    useCloseOnExit = e2_1 & (high < pMa2)
    dailySettlePrices = asb(
        dailySettlePrices, close, useCloseOnExit
    )  # dailySettlePrices[useCloseOnExit] <- close[useCloseOnExit]

    dfret["exit"] = dailySettlePrices
    dfret["ret"] = dfret["exit"] / dfret["entry"] - 1

    dfret["ret"].fillna(0)
    dfretfinal = dfret.dropna(0)  # dfretfinal <- dfret[-badrows(dfret),]

    if printit:
        retDf = DataFrame({"Date": dfretfinal["Date"], "ret": dfretfinal["ret"]})
        returnsPerformance(retDf, block=block)
    return dfretfinal
stream = (line.decode('cp1251').strip().encode('utf-8') for line in stdin)

# tee the stream to get the metadata for title
stream, stream_2 = tee(stream)
title = get_metadata(stream_2)['TITLE']

df = DataFrame()
for cur_data in iter_contextual_atom_data(stream):
    current = DataFrame.from_dict([cur_data])
    df = df.append(current, ignore_index=False)

index_cols = list(df.columns.values)
index_cols.remove('value')

df.set_index(index_cols, inplace=True)
df.columns = [title]

# create removable temp file for use with HDFStore
tmpfile = NamedTemporaryFile().name
store = HDFStore(tmpfile)
store['default'] = df
store.close()

# put h5 file to stdout
with open(tmpfile, 'rb') as f:
    print f.read()

# temp file is automatically removed