def makeOriginDataCsv(cls, cur=None, start_date=None, end_date=None,
                      basic_path=None, output_file=None, stock_id=None):
    # Initialize the source file path and the output file path
    if cur is None or start_date is None or end_date is None or output_file is None or stock_id is None:
        return None
    if basic_path is None:
        basic_path = os.path.dirname(os.path.abspath(__file__))
    output_path = os.path.join(basic_path, output_file)
    VTool.makeDirs(files=[output_path])

    cur.execute(
        "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id = '%s' and date between '%s' and '%s' "
        % (stock_id, start_date, end_date))
    data = cur.fetchall()
    if len(data) == 0:
        return None

    res = []
    for d in data:
        res.append([
            int(d[0]), int(d[1]), str(d[2]), float(d[3]), float(d[4]),
            float(d[5]), float(d[6]), float(d[7]), float(d[8]),
            float(d[9]), float(d[10])
        ])
    new_data = list(zip(*res))
    origin_data = {
        'id': new_data[0],
        'stock_id': new_data[1],
        'date': new_data[2],
        'opening': new_data[3],
        'closing': new_data[4],
        'difference': new_data[5],
        'percentage_difference': new_data[6],
        'lowest': new_data[7],
        'highest': new_data[8],
        'volume': new_data[9],
        'amount': new_data[10]
    }

    # Load the raw data, keeping only the columns we need
    total_data = DataFrame(origin_data)
    total_data.sort_values(by=['stock_id', 'date'], inplace=True)

    # Group by stock code and compute the day-over-day return in percent
    g_stock_num = total_data.groupby(by=["stock_id"])
    total_data["rate"] = 100 * (g_stock_num.shift(0)["closing"] /
                                g_stock_num.shift(1)["closing"] - 1)
    for i in total_data.index:
        total_data.loc[i, 'rate'] = str(np.round(float(total_data['rate'][i]), 2))

    # Reorder the columns in preparation for building the input/output form
    columns = [
        "stock_id", "date", "opening", "closing", "difference",
        "percentage_difference", "lowest", "highest", "volume", "amount",
        "rate"
    ]
    total_data = total_data[columns]

    def func_train_data(data_one_stock_num):
        # pandas calls apply() on the first group twice; skip the first call
        if cls.groupby_skip == False:
            cls.groupby_skip = True
            return None
        print("Processing stock code: %06s" % data_one_stock_num.name)
        data = {k: [] for k in columns}
        for i in range(len(data_one_stock_num.index) - 1):
            for k in data:
                data[k].append(data_one_stock_num.iloc[i][k])
        # Append each group's rows; the header row is written once below
        pd.DataFrame(data).to_csv(output_path, index=False, header=False,
                                  mode="a", columns=columns)

    total_data1 = total_data.dropna()
    total_data2 = total_data1.drop(total_data1[(total_data1.rate == 'nan')].index)
    g_stock_num = total_data2.groupby(by=["stock_id"])

    # Clear the output file and initialize the column names
    pd.DataFrame({k: [] for k in columns}).to_csv(output_path, index=False,
                                                  columns=columns)
    cls.groupby_skip = False
    g_stock_num.apply(func_train_data)
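A hedged usage sketch for the loader above. The class name StockData, the pymysql driver, and every connection detail and file name are assumptions; any MySQL-style DB-API cursor should behave the same way.

# Hypothetical driver code; pymysql and all connection settings are assumptions,
# and StockData stands in for whatever class hosts makeOriginDataCsv.
import pymysql

conn = pymysql.connect(host="localhost", user="root", password="secret", db="stocks")
cur = conn.cursor()
StockData.makeOriginDataCsv(cur=cur, start_date="2017-01-01", end_date="2017-12-31",
                            output_file="origin_600000.csv", stock_id="600000")
conn.close()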
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        fill_value: bool | float | int
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        # TODO: overload concat with Literal for axis
        out = cast(DataFrame, out)
        return out

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
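A minimal sketch of the public entry point that funnels into _get_dummies_1d above; the sample data is made up for illustration.

import pandas as pd

s = pd.Series(["a", "b", "a", None])
# Dense dummies; dummy_na adds an indicator column for missing values
print(pd.get_dummies(s, prefix="cat", dummy_na=True))
# sparse=True exercises the SparseArray branch of the helper
print(pd.get_dummies(s, prefix="cat", sparse=True))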
def parse(
    self,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    **kwds,
):
    validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(dict.fromkeys(sheets).keys())

    output = {}

    for asheetname in sheets:
        if verbose:
            print(f"Reading sheet {asheetname}")

        if isinstance(asheetname, str):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        usecols = maybe_convert_usecols(usecols)

        if not data:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = fill_mi_header(data[row], control_row)

                if index_col is not None:
                    header_name, _ = pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        has_index_names = is_list_like(header) and len(header) > 1

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(
                data,
                names=names,
                header=header,
                index_col=index_col,
                has_index_names=has_index_names,
                squeeze=squeeze,
                dtype=dtype,
                true_values=true_values,
                false_values=false_values,
                skiprows=skiprows,
                nrows=nrows,
                na_values=na_values,
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                comment=comment,
                skipfooter=skipfooter,
                usecols=usecols,
                mangle_dupe_cols=mangle_dupe_cols,
                **kwds,
            )

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname
                    ].columns.set_names(header_names)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
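For comparison, a small sketch of how this parser is typically reached through the public API; the workbook and sheet names are placeholders.

import pandas as pd

# Passing a list of sheet names returns a dict of DataFrames, mirroring
# the ret_dict branch in parse() above
frames = pd.read_excel("book.xlsx", sheet_name=["Sheet1", "Sheet2"], header=0)
print(frames["Sheet1"].head())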
def _stack_multi_columns(frame, level_num=-1, dropna=True):
    def _convert_level_number(level_num, columns):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel:

        We generally want to convert the level number into a level name,
        except when columns do not have names, in which case we must leave
        as a level number
        """
        if level_num in columns.names:
            return columns.names[level_num]
        else:
            if columns.names[level_num] is None:
                return level_num
            else:
                return columns.names[level_num]

    this = frame.copy()

    # this makes life much simpler
    if level_num != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level_num, frame.columns.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, this.columns)
        this = this.sort_index(level=level_to_sort, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = list(zip(*[lev.take(lab)
                            for lev, lab in zip(this.columns.levels[:-1],
                                                this.columns.labels[:-1])]))
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    level_labels = sorted(set(this.columns.labels[-1]))
    level_vals_used = level_vals[level_labels]
    levsize = len(level_labels)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean
        # indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_mixed_type:
                value_slice = this.loc[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_labels.append(np.tile(level_labels, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(levels=new_levels, labels=new_labels,
                           names=new_names, verify_integrity=False)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
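A small sketch of the behavior _stack_multi_columns implements, driven through the public DataFrame.stack; the toy frame is made up.

import numpy as np
import pandas as pd

cols = pd.MultiIndex.from_product([["A", "B"], ["x", "y"]])
df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=cols)
# Stacking the innermost column level moves it into the row index,
# which is the multi-column path handled above
print(df.stack())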
def parse_csv(self):
    # Read the csv file, and skip the first row as it's a long string label name
    survey_data = pd.read_csv(self.in_filename)[1:]

    bb_survey_flags = [
        '2',
        '<strong>B. I want to record my experiences during the day today (please complete before going to bed).</strong>'
    ]
    # Before sleep survey data
    bb_survey = survey_data.loc[
        survey_data['QID20'] !=
        '<strong>A. I want to record my sleep last night (please complete upon awakening).</strong>']
    # Upon awakening survey data
    ab_survey = survey_data.loc[
        survey_data['QID20'] ==
        '<strong>A. I want to record my sleep last night (please complete upon awakening).</strong>']

    # Define a before sleep DataFrame
    bb_df = DataFrame()
    bb_df['User'] = bb_survey['V3']
    bb_df['Date'] = bb_survey['V8'].apply(to_ymdstr)
    bb_df['Day'] = bb_survey['V8'].apply(find_weekday_ymdhms)
    # Create empty submission times first, fill it later
    bb_df['MULT'] = ''
    bb_df['NAPN'] = bb_survey['QID27'].fillna(BLANK_E)
    bb_df['NAPT'] = bb_survey['QID11#2_1'].fillna(0).apply(
        hour_to_mins) + bb_survey['QID11#1_1'].fillna(0).apply(str_to_int)
    bb_df['ALN'] = bb_survey['QID15#3_1_1_TEXT'].fillna(BLANK_E)
    # ALT
    alt_series = bb_survey['QID15#2_1'].fillna(
        BLANK_NA) + ":" + bb_survey['QID15#1_1'].fillna(MM_ZERO)
    bb_df['ALT'] = alt_series.apply(fill_for_hhmm)
    bb_df['CAFN'] = bb_survey['QID23#3_1_1_TEXT'].fillna(BLANK_E)
    # CAFT
    caft_series = bb_survey['QID23#2_1'].fillna(
        BLANK_NA) + ":" + bb_survey['QID23#1_1'].fillna(MM_ZERO)
    bb_df['CAFT'] = caft_series.apply(fill_for_hhmm)

    # Parse SMED
    smed_df = DataFrame()
    smed_df['SMED'] = bb_survey['QID18']
    smed_df['SMED1'] = bb_survey['QID17#3_1_1_TEXT'].fillna(BLANK_E)
    smed_df['SMED1T_HH'] = bb_survey['QID17#2_1'].fillna(BLANK_NA)
    smed_df['SMED1T_MM'] = bb_survey['QID17#1_1'].fillna(MM_ZERO)
    smed_df['SMED2'] = bb_survey['QID17#3_2_1_TEXT'].fillna(BLANK_E)
    smed_df['SMED2T_HH'] = bb_survey['QID17#2_2'].fillna(BLANK_NA)
    smed_df['SMED2T_MM'] = bb_survey['QID17#1_2'].fillna(MM_ZERO)
    smed_df['SMED3'] = bb_survey['QID17#3_3_1_TEXT'].fillna(BLANK_E)
    smed_df['SMED3T_HH'] = bb_survey['QID17#2_3'].fillna(BLANK_NA)
    smed_df['SMED3T_MM'] = bb_survey['QID17#1_3'].fillna(MM_ZERO)
    smed_df = smed_df.apply(process_smed, axis=1)
    bb_df['SMED'] = smed_df['SMED']
    bb_df['SMED1'] = smed_df['SMED1']
    bb_df['SMED1T'] = smed_df['SMED1T']
    bb_df['SMED2'] = smed_df['SMED2']
    bb_df['SMED2T'] = smed_df['SMED2T']
    bb_df['SMED3'] = smed_df['SMED3']
    bb_df['SMED3T'] = smed_df['SMED3T']

    bb_df['NOTEBB'] = bb_survey['QID19'].fillna(BLANK_E)
    bb_df['ATTEMPT'] = ''
    bb_df['BT'] = ''
    bb_df['LO'] = ''
    bb_df['WT'] = ''
    bb_df['RT'] = ''
    bb_df['SOL'] = ''
    bb_df['SNZ'] = ''
    bb_df['TST'] = ''
    bb_df['WASON'] = ''
    bb_df['WASOT'] = ''
    bb_df['EA'] = ''
    bb_df['EAT'] = ''
    bb_df['SQ'] = ''
    bb_df['REST'] = ''
    bb_df['NOTEWU'] = ''
    bb_df['TIB'] = ''
    bb_df['SE1'] = ''
    bb_df['SE2'] = ''

    # process MULT
    bb_df['MULT'] = self.process_mult(bb_df)

    # test code
    # bb_df.to_csv('before_bed_survey.csv', index=False)
    # End of before sleep

    # Start for Upon awakening
    ab_df = DataFrame()
    ab_df['User'] = ab_survey['V3']
    ab_df['Date'] = ab_survey['V8'].apply(reduce_one_day_ymdstr)
    ab_df['Day'] = ab_df['Date'].apply(find_weekday_ymd)
    # submission times
    ab_df['MULT'] = ''
    ab_df['NAPN'] = ''
    ab_df['NAPT'] = ''
    ab_df['ALN'] = ''
    ab_df['ALT'] = ''
    ab_df['CAFN'] = ''
    ab_df['CAFT'] = ''
    ab_df['SMED'] = ''
    ab_df['SMED1'] = ''
    ab_df['SMED1T'] = ''
    ab_df['SMED2'] = ''
    ab_df['SMED2T'] = ''
    ab_df['SMED3'] = ''
    ab_df['SMED3T'] = ''
    ab_df['NOTEBB'] = ''

    tmp_ab_df = DataFrame()
    tmp_ab_df['Date'] = ab_df['Date']
    tmp_ab_df['ATTEMPT'] = ab_survey['QID24'].fillna('Yes').apply(check_for_attempt)
    tmp_ab_df['BT'] = ab_survey['QID2#2_1'].fillna(
        BLANK_NA) + ":" + ab_survey['QID2#1_1'].fillna(MM_ZERO)
    tmp_ab_df['LO'] = ab_survey['QID2#2_2'].fillna(
        BLANK_NA) + ":" + ab_survey['QID2#1_2'].fillna(MM_ZERO)
    tmp_ab_df['WT'] = ab_survey['QID2#2_3'].fillna(
        BLANK_NA) + ":" + ab_survey['QID2#1_3'].fillna(MM_ZERO)
    tmp_ab_df['RT'] = ab_survey['QID2#2_4'].fillna(
        BLANK_NA) + ":" + ab_survey['QID2#1_4'].fillna(MM_ZERO)
    tmp_ab_df['SOL'] = ab_survey['QID3#2_1'].fillna(0).apply(
        hour_to_mins) + ab_survey['QID3#1_1'].fillna(0).apply(str_to_int)
    tmp_ab_df['SNZ'] = ab_survey['QID3#2_2'].fillna(0).apply(
        hour_to_mins) + ab_survey['QID3#1_2'].fillna(0).apply(str_to_int)
    tmp_ab_df['TST'] = ab_survey['QID3#2_3'].fillna(0).apply(
        hour_to_mins) + ab_survey['QID3#1_3'].fillna(0).apply(str_to_int)
    tmp_ab_df['WASON'] = ab_survey['QID6#3_1_1_TEXT'].fillna(BLANK_E)
    tmp_ab_df['WASOT'] = ab_survey['QID6#2_1'].fillna(0).apply(
        hour_to_mins) + ab_survey['QID6#1_1'].fillna(0).apply(str_to_int)
    tmp_ab_df['EA'] = ab_survey['QID26'].fillna(BLANK_E)
    tmp_ab_df['EAT'] = ab_survey['QID7#2_1'].fillna(0).apply(
        hour_to_mins) + ab_survey['QID7#1_1'].fillna(0).apply(str_to_int)
    tmp_ab_df['SQ'] = ab_survey['QID5'].apply(fill_for_rank)
    tmp_ab_df['REST'] = ab_survey['QID8'].apply(fill_for_rank)
    tmp_ab_df = tmp_ab_df.apply(process_awaken, axis=1)

    ab_df['ATTEMPT'] = tmp_ab_df['ATTEMPT']
    ab_df['BT'] = tmp_ab_df['BT']
    ab_df['LO'] = tmp_ab_df['LO']
    ab_df['WT'] = tmp_ab_df['WT']
    ab_df['RT'] = tmp_ab_df['RT']
    ab_df['SOL'] = tmp_ab_df['SOL']
    ab_df['SNZ'] = tmp_ab_df['SNZ']
    ab_df['TST'] = tmp_ab_df['TST']
    ab_df['WASON'] = tmp_ab_df['WASON']
    ab_df['WASOT'] = tmp_ab_df['WASOT']
    ab_df['EA'] = tmp_ab_df['EA']
    ab_df['EAT'] = tmp_ab_df['EAT']
    ab_df['SQ'] = tmp_ab_df['SQ']
    ab_df['REST'] = tmp_ab_df['REST']
    ab_df['NOTEWU'] = ab_survey['QID28'].fillna(BLANK_E)
    ab_df['TIB'] = tmp_ab_df['TIB']
    ab_df['SE1'] = tmp_ab_df['SE1']
    ab_df['SE2'] = tmp_ab_df['SE2']

    # test code
    # ab_df.to_csv('after_bed_survey.csv', index=False)

    # Process MULT
    ab_df['MULT'] = self.process_mult(ab_df)

    # Merge the two survey types together (DataFrame.append was removed in
    # pandas 2.0; pd.concat is the supported equivalent)
    self.survey_new_csv = pd.concat([bb_df, ab_df], ignore_index=True)
    # sort it first (DataFrame.sort was removed; sort_values replaces it)
    self.survey_new_csv = self.survey_new_csv.sort_values(
        ['User', 'Date'], ascending=[True, True])

    # combined_duplicated_dfs will hold the combined duplicated records
    combined_duplicated_dfs = []
    # get all unique patient ids
    self.patient_ids = self.survey_new_csv.User.unique().tolist()

    for index, row in self.survey_new_csv.iterrows():
        user_id = row['User']
        date = row['Date']
        mult = row['MULT']
        key = '{}'.format(user_id) + '{}'.format(date) + '{}'.format(mult)
        found_index = self.get_survey_data_from_dict(key)
        # TODO: remove this temporary solution
        # if user_id == '1504':
        #     print('-- Removed the USER ID 1504 Record temporarily due to generate pdf error in R')
        #     self.survey_new_csv.drop([index], inplace=True)
        if found_index is None:
            self.set_survey_data_in_dict(key, index)
        else:
            duplicated_df = DataFrame(self.survey_new_csv,
                                      index=[found_index, index])
            # print('------ duplicated df: {}'.format(duplicated_df))
            # drop these duplicated records
            self.survey_new_csv.drop([found_index, index], inplace=True)
            # combine the two duplicated rows into one
            combined_df = self.combine_rows(duplicated_df)
            # print('------ combined df: {}'.format(combined_df))
            # append it to the combined_duplicated_dfs list
            combined_duplicated_dfs.append(combined_df)

    # concat the combined duplicated dfs (guard against an empty list,
    # which would make pd.concat raise)
    if combined_duplicated_dfs:
        all_duplicated = pd.concat(combined_duplicated_dfs)
        # append them back onto the new survey csv frame
        self.survey_new_csv = pd.concat([self.survey_new_csv, all_duplicated],
                                        ignore_index=True)
rp_nat_ = []
rps_nat_ = []
tp_nat_ = []
tps_nat_ = []
dps_nat_ = []
alt_nat_ = []
for i in range(len(flow_num_str)):
    savename_for = "/Users/user/Desktop/plot/latency_for_" + flow_num_str[i] + ".csv"
    savename_nat = "/Users/user/Desktop/plot/latency_nat_" + flow_num_str[i] + ".csv"
    savename_for_rp = "/Users/user/Desktop/plot/throughput_for_" + flow_num_str[i] + ".csv"
    savename_nat_rp = "/Users/user/Desktop/plot/throughput_nat_" + flow_num_str[i] + ".csv"

    rp_for, tp_for, tps_for, dps_for, alt_for = read_files(
        fileName_for[i], flow_num[i])
    alt_for.to_csv(savename_for)

    rp_nat, tp_nat, tps_nat, dps_nat, alt_nat = read_files(
        fileName_nat[i], flow_num[i])

    rps_for, rps_nat = get_throughput_speed(fileName_for[i], fileName_nat[i])
    a = {'0': rps_for}
    b = {'0': rps_nat}
    rp_df_for = DataFrame(a)
    rp_df_nat = DataFrame(b)
    rp_df_for.to_csv(savename_for_rp)
    rp_df_nat.to_csv(savename_nat_rp)

    alt_nat.to_csv(savename_nat)
# It is important to check the description of the dataset we access,
# using the following code
Datatoread = 'F-F_Research_Data_Factors_daily'
sdate = '2017-07-01'
edate = '2018-06-30'
ds_factors = web.DataReader(Datatoread, 'famafrench', start=sdate, end=edate)
# Taking [0] extracts the first table of the dataset
print('\nKEYS\n{}'.format(ds_factors.keys()))
print('DATASET DESCRIPTION \n {}'.format(ds_factors['DESCR']))
# ds_factors[0].head()

# Copy the right dict for later examination
dfFactor = ds_factors[0].copy() / 100
# dfFirm = ds_factors[4].copy()
# dfFirm['Offical_total'] = dfFirm.apply(lambda x: x.sum(), axis=1)

# Data processing ### Not necessary in your case
_ff = DataFrame(dfFactor)
_ff = _ff.reset_index()
factor = 'SMB'
wfactor = 'WSMB'
_ff = _ff[['Date', factor]]
_ff.rename(columns={'Date': 'date'}, inplace=True)

# Suppose I'm reading from Excel: read_excel returns a DataFrame
# (sheet_name can be a plain int). You need to change the file name and path.
infile = 'F:\\RA_Fama_French_Factor\\five_factor_model\\SIZE_HML\\Daily_SIZE_HML_TEST072018.xlsx'
strlist = infile.split('.')
stitle = strlist[0]
def test_stata_doc_examples(self):
    with tm.ensure_clean() as path:
        df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
        df.to_stata(path)
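A self-contained sketch of the round trip this test exercises; read_stata is the public counterpart of to_stata, and the file name is a placeholder.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 2), columns=list("AB"))
df.to_stata("example.dta")
# Reading the file back recovers the frame (plus Stata's index column)
print(pd.read_stata("example.dta").head())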
# model
starttime = datetime.datetime.now()  # timing
sample_model = KMeans(n_clusters=10).fit(images_train_sample)  # K-Means
endtime = datetime.datetime.now()  # timing
scikit_learn_execution_time = (endtime - starttime).seconds
print('scikit-learn execution time:', scikit_learn_execution_time)  # ~429s

# objective function value
cluster = sample_model.labels_
objective_function_value = sample_model.inertia_  # 394810072745.4526
print('scikit-learn objective function value:', objective_function_value)

# accuracy
crosstable_data = {'label': labels_train_sample, 'cluster': list(cluster)}
df = DataFrame(crosstable_data)
crosstable = pd.crosstab(index=df['label'], columns=df['cluster'])
scikit_accuracy = sum(crosstable.max(axis=0)) / sum(crosstable.sum())  # 0.22124
print('scikit-learn accuracy:', scikit_accuracy)

# PART 3: my kmeans
# model
starttime_2 = datetime.datetime.now()  # start time
cluster_center, cluster_assign = Kmeans(array(images_train_sample), 10)
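A toy illustration of the crosstab "majority label" accuracy used above, with made-up labels and cluster assignments.

import pandas as pd

df = pd.DataFrame({"label": [0, 0, 1, 1, 2], "cluster": [1, 1, 0, 0, 0]})
ct = pd.crosstab(index=df["label"], columns=df["cluster"])
# For each cluster, count the points of its most common label, then divide
# by the total number of points
accuracy = ct.max(axis=0).sum() / ct.sum().sum()
print(accuracy)  # 0.8 for this toy data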
model.fit(x=X_train,
          y=y_train,
          epochs=3,
          batch_size=128,
          verbose=2,
          validation_split=0.1)

# Predict
y_predict = model.predict(X_test)
# Convert predictions back to tags
y_predict_label = label2tag(predictions=y_predict, y=y)
# Compute accuracy
Y_test = label2tag(predictions=y_test, y=y)
print(
    sum([y_predict_label[i] == Y_test[i] for i in range(len(y_predict))]) /
    len(y_predict))

# Load another test set, predict on it, and export the results
filename = 'xiaomi5a.csv'
test_data = pd.read_csv(filename)
x = test_data['comment']
X_cut = cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
X_seq = text2seq(texts_cut=X_cut, maxlen=maxlen, tokenizer=tokenizer)
X_seq = np.array(X_seq)
y_predict = model.predict(X_seq)
y_predict_label = label2tag(predictions=y_predict, y=y)

# Convert the Series to a DataFrame
out_x = x.to_frame(name=None)
out_y = DataFrame(y_predict_label)
out_x.to_csv('x.csv')
out_y.to_csv('y.csv')
if lista_nombres[i][-2] == lista_nombres[i][1]:
    n2 = ""
medico = clases.Medico(n1, n2, ap1, ap2, lista_ruts[i], lista_edad[i],
                       lista_emails[i], lista_numero[i],
                       lista_especialidad[i])
lista_medicos.append(medico)

clinica_objeto = clases.Clinica("Clinica de la Salud", "Público",
                                "Avenida Verdadera #123, Rancagua", "",
                                lista_medicos, lista_pacientes)

lista_citas = []
cita_vacia = clases.Cita("", "", "", "")
cita_csv = pd.read_csv('./datos/Citas.csv')
cita_csv = DataFrame(cita_csv)
codigo = cita_csv["codigo"].values
rut_paciente = cita_csv["rut paciente"].values
rut_medico = cita_csv["rut medico"].values
fecha_citada = cita_csv["fecha citada"].values
fecha_creacion = cita_csv["fecha de creacion"].values
modalidad = cita_csv["modalidad"].values
prestacion = cita_csv["prestacion"].values
confirmada = cita_csv["confirmada"].values
tiempo_restante = cita_csv["tiempo restante"].values

for i in range(len(codigo)):
    cita_vacia.setCodigo(codigo[i])
    cita_vacia.setPaciente(clinica_objeto.buscarPaciente(rut_paciente[i])[0])
    cita_vacia.setMedico(clinica_objeto.buscarMedico(rut_medico[i])[0])
    cita_vacia.setFechaCitada(parser.parse(fecha_citada[i]))
def _parse_excel(self,
                 sheetname=0,
                 header=0,
                 skiprows=None,
                 names=None,
                 skip_footer=0,
                 index_col=None,
                 has_index_names=None,
                 parse_cols=None,
                 parse_dates=False,
                 date_parser=None,
                 na_values=None,
                 thousands=None,
                 convert_float=True,
                 true_values=None,
                 false_values=None,
                 verbose=False,
                 dtype=None,
                 squeeze=False,
                 **kwds):

    skipfooter = kwds.pop('skipfooter', None)
    if skipfooter is not None:
        skip_footer = skipfooter

    _validate_header_arg(header)
    if has_index_names is not None:
        warn("\nThe has_index_names argument is deprecated; index names "
             "will be automatically inferred based on index_col.\n"
             "This argument is still necessary if reading Excel output "
             "from 0.16.2 or prior with index names.", FutureWarning,
             stacklevel=3)

    if 'chunksize' in kwds:
        raise NotImplementedError("chunksize keyword of read_excel "
                                  "is not implemented")

    if parse_dates is True and index_col is None:
        warn("The 'parse_dates=True' keyword of read_excel was provided"
             " without an 'index_col' keyword value.")

    def _parse_cell(cell_contents, cell_typ):
        """converts the contents of the cell into a pandas
           appropriate object"""

        if cell_typ == XL_CELL_DATE:

            if xlrd_0_9_3:
                # Use the newer xlrd datetime handling.
                try:
                    cell_contents = \
                        xldate.xldate_as_datetime(cell_contents, epoch1904)
                except OverflowError:
                    return cell_contents

                # Excel doesn't distinguish between dates and time,
                # so we treat dates on the epoch as times only.
                # Also, Excel supports 1900 and 1904 epochs.
                year = (cell_contents.timetuple())[0:3]
                if ((not epoch1904 and year == (1899, 12, 31)) or
                        (epoch1904 and year == (1904, 1, 1))):
                    cell_contents = time(cell_contents.hour,
                                         cell_contents.minute,
                                         cell_contents.second,
                                         cell_contents.microsecond)
            else:
                # Use the xlrd <= 0.9.2 date handling.
                try:
                    dt = xldate.xldate_as_tuple(cell_contents, epoch1904)
                except xldate.XLDateTooLarge:
                    return cell_contents

                if dt[0] < MINYEAR:
                    cell_contents = time(*dt[3:])
                else:
                    cell_contents = datetime(*dt)

        elif cell_typ == XL_CELL_ERROR:
            cell_contents = np.nan
        elif cell_typ == XL_CELL_BOOLEAN:
            cell_contents = bool(cell_contents)
        elif convert_float and cell_typ == XL_CELL_NUMBER:
            # GH5394 - Excel 'numbers' are always floats
            # it's a minimal perf hit and less surprising
            val = int(cell_contents)
            if val == cell_contents:
                cell_contents = val
        return cell_contents

    ret_dict = False

    if isinstance(sheetname, list):
        sheets = sheetname
        ret_dict = True
    elif sheetname is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheetname]

    # handle same-type duplicates.
    sheets = list(OrderedDict.fromkeys(sheets).keys())

    output = OrderedDict()

    import xlrd
    from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN,
                      XL_CELL_NUMBER)

    epoch1904 = self.book.datemode

    # xlrd >= 0.9.3 can return datetime objects directly.
    if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
        xlrd_0_9_3 = True
    else:
        xlrd_0_9_3 = False

    # Keep sheetname to maintain backwards compatibility.
    for asheetname in sheets:
        if verbose:
            print("Reading sheet %s" % asheetname)

        if isinstance(asheetname, compat.string_types):
            sheet = self.book.sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.book.sheet_by_index(asheetname)

        data = []
        should_parse = {}

        if sheet.nrows > 5000:
            raise Exception(
                "The raw file contains more than 5000 rows. "
                "Please check if it is correct or split the files "
                "(max: 5000 rows) for upload")
        elif kwds.get('MaxTest'):
            continue

        for i in range(sheet.nrows):
            row = []
            for j, (value, typ) in enumerate(
                    zip(sheet.row_values(i), sheet.row_types(i))):
                if parse_cols is not None and j not in should_parse:
                    should_parse[j] = self._should_parse(j, parse_cols)

                if parse_cols is None or should_parse[j]:
                    row.append(_parse_cell(value, typ))
            data.append(row)

        # output[asheetname] = data
        if sheet.nrows == 0:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None:
            if is_list_like(header):
                header_names = []
                control_row = [True for x in data[0]]
                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(data[row],
                                                             control_row)
                    header_name, data[row] = _pop_header_name(data[row],
                                                              index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # forward fill values for MultiIndex index
            if not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            for col in index_col:
                last = data[offset][col]
                for row in range(offset + 1, len(data)):
                    if data[row][col] == '' or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

        if is_list_like(header) and len(header) > 1:
            has_index_names = True

        if kwds.get('parsed'):
            try:
                parser = TextParser(data,
                                    header=header,
                                    index_col=index_col,
                                    has_index_names=has_index_names,
                                    na_values=na_values,
                                    thousands=thousands,
                                    parse_dates=parse_dates,
                                    date_parser=date_parser,
                                    true_values=true_values,
                                    false_values=false_values,
                                    skiprows=skiprows,
                                    skipfooter=skip_footer,
                                    squeeze=squeeze,
                                    dtype=dtype,
                                    **kwds)
                output[asheetname] = parser.read()
                if names is not None:
                    output[asheetname].columns = names
                if not squeeze or isinstance(output[asheetname], DataFrame):
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)
            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()
        else:
            output[asheetname] = data

    if ret_dict or kwds.get('MaxTest'):
        return output
    else:
        return output[asheetname]
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels]

    index: Optional[Index]
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: Union[bool, float, int]
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: List[List] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out
    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def makeNewsDataCsv(cls, cur=None, start_date=None, end_date=None,
                    basic_path=None, word_trend_file=None, news_file=None,
                    output_file=None, stock_id=None):
    if (cur is None or start_date is None or end_date is None
            or word_trend_file is None or news_file is None
            or output_file is None or stock_id is None):
        return None
    if basic_path is None:
        basic_path = os.path.dirname(os.path.abspath(__file__))
    news_path = os.path.join(basic_path, news_file)
    word_trend_path = os.path.join(basic_path, word_trend_file)
    output_path = os.path.join(basic_path, output_file)
    VTool.makeDirs(files=[output_path])

    # Clear the output file and initialize the column names
    columns = [
        "stock_id", "date", "opening", "closing", "difference",
        "percentage_difference", "lowest", "highest", "volume", "amount",
        "rate"
    ] + ["news_pos_num", "news_neg_num"]
    data = {k: [] for k in columns}
    pd.DataFrame(data).to_csv(output_path, index=False, columns=columns)

    word_trend = {}
    word_trend_temp = pd.read_csv(word_trend_path)
    for k in word_trend_temp["0"].keys():
        word_trend[word_trend_temp["0"][k]] = [
            word_trend_temp["1"][k], word_trend_temp["2"][k]
        ]
    p_up = word_trend['total_words'][0] / (word_trend['total_words'][0] +
                                           word_trend['total_words'][1])
    p_down = word_trend['total_words'][1] / (word_trend['total_words'][0] +
                                             word_trend['total_words'][1])

    cur.execute(
        "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' "
        % (stock_id, start_date, end_date))
    count = cur.fetchall()
    count = count[0][0]

    skip = 100
    slimit = 0
    while slimit < count:
        cur.execute(
            "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d "
            % (stock_id, start_date, end_date,
               0 if slimit - 1 < 0 else slimit - 1,
               skip if slimit - 1 < 0 else skip + 1))
        slimit += skip
        history_tt = cur.fetchall()
        history_t = []
        for h in history_tt:
            history_t.append([
                int(h[0]), float(h[1]), float(h[2]), float(h[3]),
                float(h[4]), float(h[5]), float(h[6]), float(h[7]),
                float(h[8]), str(h[9])
            ])
        del history_tt

        history_temp = list(zip(*history_t))
        history = {
            'stock_id': history_temp[0],
            'opening': history_temp[1],
            'closing': history_temp[2],
            'difference': history_temp[3],
            'percentage_difference': history_temp[4],
            'lowest': history_temp[5],
            'highest': history_temp[6],
            'volume': history_temp[7],
            'amount': history_temp[8],
            'date': history_temp[9]
        }
        del history_t, history_temp

        history = DataFrame(history)
        g_history = history.groupby(by=['stock_id'])
        # 0.01 -> 1%; keep 2 decimal places
        history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                 g_history.shift(1)["closing"] - 1)
        history.dropna(axis=0, how='any', thresh=None, subset=None,
                       inplace=True)

        sdate = str(history['date'][history['date'].keys()[0]])
        edate = str(history['date'][history['date'].keys()[-1]])
        # sdate = datetime.datetime.strptime(sdate, '%Y-%m-%d')
        # sdate = (sdate - datetime.timedelta(days=0)).strftime('%Y-%m-%d')
        cur.execute(
            "SELECT GROUP_CONCAT(id SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' group by time"
            % (sdate, edate))
        news_temp = cur.fetchall()

        news_by_date = {}
        news_by_id = {}
        for n in news_temp:
            news_by_date[str(n[1])] = n[0].split(",")
            for nid in news_by_date[str(n[1])]:
                news_by_id[nid] = None
        del news_temp

        nid_len = len(news_by_id)
        reader = pd.read_csv(news_path, chunksize=1000)
        for sentences in reader:
            if nid_len > 0:
                for k in sentences['1'].keys():
                    nid = str(sentences['0'][k])
                    if nid in news_by_id and news_by_id[nid] is None:
                        news_by_id[nid] = str(sentences['1'][k]).split(" ")
                        # Naive-Bayes style scoring against the word-trend table
                        wp_up = p_up
                        wp_down = p_down
                        for w in news_by_id[nid]:
                            if w not in word_trend:
                                wp_up *= (1 / word_trend['total_words'][0])
                                wp_down *= (1 / word_trend['total_words'][1])
                            else:
                                if word_trend[w][0] > 0:
                                    wp_up *= word_trend[w][0]
                                else:
                                    wp_up *= (1 / word_trend['total_words'][0])
                                if word_trend[w][1] > 0:
                                    wp_down *= word_trend[w][1]
                                else:
                                    wp_down *= (1 / word_trend['total_words'][1])
                        # rescale to avoid floating-point underflow
                        while True:
                            if wp_up < 1 and wp_down < 1:
                                wp_up *= 10
                                wp_down *= 10
                            else:
                                break
                        news_by_id[nid] = [
                            wp_up / (wp_up + wp_down),
                            -1 * wp_down / (wp_up + wp_down)
                        ]
                        nid_len -= 1
                        if nid_len <= 0:
                            break
            else:
                break
        reader.close()
        del reader, sentences

        for d in news_by_date:
            sumn = [0, 0]
            for nid in news_by_date[d]:
                sumn[0] += news_by_id[nid][0]
                sumn[1] += news_by_id[nid][1]
            le = len(news_by_date[d])
            if le > 0:
                sumn[0] /= le
                sumn[1] /= le
            news_by_date[d] = sumn
            print(d)

        history['news_pos_num'] = 0
        history['news_neg_num'] = 0
        for i in history.index:
            history.loc[i, 'rate'] = str(np.round(float(history['rate'][i]), 2))
            if str(history['date'][i]) in news_by_date:
                history.loc[i, 'news_pos_num'] = str(
                    np.round(float(news_by_date[str(history['date'][i])][0]), 2))
                history.loc[i, 'news_neg_num'] = str(
                    np.round(float(news_by_date[str(history['date'][i])][1]), 2))
            else:
                history.loc[i, 'news_pos_num'] = "0"
                history.loc[i, 'news_neg_num'] = "0"

        # Shape the normalized data into the form the training and test
        # sets accept
        def func_train_data(data_stock):
            # pandas calls apply() on the first group twice; skip the first call
            if cls.groupby_skip == False:
                cls.groupby_skip = True
                return None
            print("Processing stock code: %06s" % data_stock.name)
            data = {k: [] for k in columns}
            for i in range(len(data_stock) - 1):
                for k in data:
                    data[k].append(data_stock.iloc[i][k])
            pd.DataFrame(data).to_csv(output_path, index=False, header=False,
                                      mode="a", columns=columns)

        g_stock = history.groupby(by=["stock_id"])
        # The output file was cleared and the column names written above
        cls.groupby_skip = False
        g_stock.apply(func_train_data)
def pivot_annual(series, freq=None):
    """
    Deprecated. Use ``pivot_table`` instead.

    Group a series by years, taking leap years into account.

    The output has as many rows as distinct years in the original series,
    and as many columns as the length of a leap year in the units
    corresponding to the original frequency (366 for daily frequency,
    366*24 for hourly...).

    The first column of the output corresponds to Jan. 1st, 00:00:00,
    while the last column corresponds to Dec, 31st, 23:59:59.
    Entries corresponding to Feb. 29th are masked for non-leap years.

    For example, if the initial series has a daily frequency, the 59th column
    of the output always corresponds to Feb. 28th, the 61st column to
    Mar. 1st, and the 60th column is masked for non-leap years.
    With an hourly initial frequency, the (59*24)th column of the output
    always corresponds to Feb. 28th 23:00, the (61*24)th column to
    Mar. 1st, 00:00, and the 24 columns between (59*24) and (61*24) are
    masked.

    If the original frequency is less than daily, the output is equivalent
    to ``series.convert('A', func=None)``.

    Parameters
    ----------
    series : Series
    freq : string or None, default None

    Returns
    -------
    annual : DataFrame
    """
    msg = "pivot_annual is deprecated. Use pivot_table instead"
    warnings.warn(msg, FutureWarning)

    index = series.index
    year = index.year
    years = algorithms.unique1d(year)

    if freq is not None:
        freq = freq.upper()
    else:
        freq = series.index.freq

    if freq == 'D':
        width = 366
        offset = np.asarray(index.dayofyear) - 1

        # adjust for leap year
        offset[(~isleapyear(year)) & (offset >= 59)] += 1

        columns = lrange(1, 367)
        # todo: strings like 1/1, 1/25, etc.?
    elif freq in ('M', 'BM'):
        width = 12
        offset = np.asarray(index.month) - 1
        columns = lrange(1, 13)
    elif freq == 'H':
        width = 8784
        grouped = series.groupby(series.index.year)
        defaulted = grouped.apply(lambda x: x.reset_index(drop=True))
        defaulted.index = defaulted.index.droplevel(0)
        offset = np.asarray(defaulted.index)
        offset[~isleapyear(year) & (offset >= 1416)] += 24
        columns = lrange(1, 8785)
    else:
        raise NotImplementedError(freq)

    flat_index = (year - years.min()) * width + offset
    flat_index = _ensure_platform_int(flat_index)

    values = np.empty((len(years), width))
    values.fill(np.nan)
    values.put(flat_index, series.values)

    return DataFrame(values, index=years, columns=columns)
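Since the docstring points to pivot_table, a rough equivalent for daily data might look like the sketch below. It is illustrative only and does not reproduce the leap-year masking.

import pandas as pd

s = pd.Series(range(730), index=pd.date_range("2016-01-01", periods=730, freq="D"))
annual = pd.pivot_table(
    pd.DataFrame({"value": s.values, "year": s.index.year, "doy": s.index.dayofyear}),
    index="year", columns="doy", values="value",
)
print(annual.shape)  # one row per year, one column per day of year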
target = cl.astype('int')
print(target)

# Split into training and test data
train_X, test_X, train_y, test_y = train_test_split(data, target,
                                                    train_size=0.9,
                                                    random_state=42)
print(train_y)

# Build the classifier
clf = neighbors.KNeighborsClassifier(n_neighbors=25)
data_clf = clf.fit(train_X, train_y)

# Predict
test_y_predicted = data_clf.predict(test_X)
"""print(test_y_predicted)
# ground truth
print(test_y)"""

# Performance metrics
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print("accuracy : ", accuracy)

precision = metrics.precision_score(test_y, test_y_predicted, average='macro')
print("precision : ", precision)

recall = metrics.recall_score(test_y, test_y_predicted, average='macro')
print("recall : ", recall)

f_measure = 2 * (precision * recall / (precision + recall))
print("f_measure : ", f_measure)

output = {'click': test_y_predicted}
output = DataFrame(output)
output.to_csv('output.csv', sep=',', index=False)
def on_data(context: Context):
    if datetime.datetime.strftime(context.now, '%Y-%m-%d') not in context.month_begin:
        return
    # Fetch CSI 300 index data
    price = get_reg_kdata(reg_idx=context.reg_kdata[0], length=1,
                          fill_up=True, df=True)
    index = get_reg_kdata(reg_idx=context.reg_kdata[0], target_indices=300,
                          length=context.long + context.Len - 1,
                          fill_up=False, df=True)
    factor = get_reg_factor(reg_idx=context.reg_factor[0], target_indices=(),
                            length=5, df=True)
    if price['close'].isna().any():
        return
    """
    Compute the long- and short-term volatility of the CSI 300 index.
    Using the long-term volatility as the threshold: if the short-term
    volatility breaks above it, cut the stock-pool position to 50%.
    """
    index['ret'] = index.groupby('target_idx')['close'].apply(
        lambda x: (x - x.shift()) / x.shift())
    index = index.fillna(0)  # replace NaN with 0
    ret = index.ret.values.astype(float)
    StdDev = talib.STDDEV(ret, timeperiod=context.Len, nbdev=1)
    StdDev = DataFrame({"a": StdDev})
    StdDev = StdDev.dropna()
    std = StdDev['a'].tolist()
    std_short = np.mean(std[-14:])
    bound = np.mean(std)

    # Factors are registered at daily frequency by default
    factor = factor.dropna(subset=['date'])  # drop invalid dates
    factor['code'] = factor['target_idx'].apply(
        lambda x: context.target_list[x])  # map 0,1,2,... indices to stock codes
    factor['month'] = factor['date'].apply(lambda x: int(
        str(x)[0:4] + str(x)[5:7]))  # add a month column (e.g. 201701): year and month only
    factor_name = factor['factor'].drop_duplicates().tolist()  # factor names as a list

    # Group factor by ['target_idx', 'month', 'factor'] and take the last row
    # of each group, i.e. each stock's month-end value for every factor
    factor_month = factor.groupby(
        ['target_idx', 'month',
         'factor']).apply(lambda x: x.iloc[-1])[['date', 'value']].reset_index()
    # Add every factor name as a new column
    factor_month1 = factor_month.groupby(['target_idx',
                                          'month']).apply(deal).reset_index()
    """
    Take the most recent month (the current time)
    """
    test = factor_month1.groupby('target_idx').apply(lambda x: x.iloc[-1])
    scaler = StandardScaler()  # standardization
    X_test = test[factor_name]
    X_test = X_test.fillna(0).values
    X_test = scaler.fit_transform(X_test)  # standardize the factors

    # Predict
    model = pickle.load(open("XGboost_ret0.06_5factor.pickle.dat", "rb"))
    y_pred = model.predict(X_test)
    y_pred1 = pd.DataFrame(y_pred, columns=['label'])
    idx_list = list(y_pred1[y_pred1['label'] == 1].index)
    print(idx_list)

    positions = context.account().positions

    # Risk control driven by volatility
    if std_short > bound:
        for target_idx in positions.loc[positions['volume_long'] > 0,
                                        'target_idx'].astype(int):
            if target_idx == 300:
                pass
            else:
                volume = positions['volume_long'].iloc[target_idx]
                order_volume(account_idx=0, target_idx=target_idx,
                             volume=int(volume * 0.5), side=2,
                             position_effect=2, order_type=2, price=0)

    if len(idx_list) == 0:
        # No stock is in the target pool: sell everything
        for target_idx in positions.loc[positions['volume_long'] > 0,
                                        'target_idx'].astype(int):
            if target_idx == 300:
                pass
            else:
                volume = positions['volume_long'].iloc[target_idx]
                order_volume(account_idx=0, target_idx=target_idx,
                             volume=int(volume), side=2, position_effect=2,
                             order_type=2, price=0)
    else:
        # Close positions that are not in the target pool
        for target_idx in positions.target_idx.astype(int):
            if target_idx not in idx_list:
                if positions['volume_long'].iloc[target_idx] > 0:
                    volume = positions['volume_long'].iloc[target_idx]
                    order_volume(account_idx=0, target_idx=target_idx,
                                 volume=int(volume), side=2,
                                 position_effect=2, order_type=2, price=0)
        # Per-stock weight
        percent_b = context.ratio / len(idx_list)
        # print(percent_b)
        # Buy the stocks in the target pool
        for target_idx in idx_list:
            if target_idx == 300:
                pass
            else:
                order_target_percent(account_idx=0, target_idx=target_idx,
                                     target_percent=percent_b, side=1,
                                     order_type=2)
        print(positions.loc[positions['volume_long'] > 0, 'code'].tolist())
def predict_role(ps):
    fd = pd.read_csv('player_label.csv')
    df_obj = fd.label
    fd.label = df_obj.apply(lambda x: str(x).strip())
    print(fd.label)
    test_set = fd[['label']]
    train_set = fd[[
        'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing',
        'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve',
        'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration',
        'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power',
        'jumping', 'stamina', 'strength', 'long_shots', 'aggression',
        'interceptions', 'positioning', 'vision', 'penalties', 'marking',
        'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling',
        'gk_kicking', 'gk_positioning', 'gk_reflexes'
    ]]
    train_set = train_set[1:]
    test_set = test_set[1:]

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(train_set, test_set,
                                                        test_size=0.33,
                                                        random_state=12)

    from sklearn.naive_bayes import MultinomialNB
    clf_NB = MultinomialNB().fit(x_train, y_train)
    predicted = clf_NB.predict(x_test)

    import numpy as np
    from sklearn import metrics
    print("#################### NB ######################")
    confusion_matrix_NB = metrics.confusion_matrix(y_test, predicted)
    print(confusion_matrix_NB)
    accuracy_NB = metrics.accuracy_score(y_test, predicted)
    print(accuracy_NB)
    # print(metrics.classification_report(y_test, predicted))
    print("##############################################")

    from sklearn import tree
    clf_tree = tree.DecisionTreeClassifier().fit(x_train, y_train)
    predicted = clf_tree.predict(x_test)
    print("#################### Decision Tree ######################")
    print(metrics.confusion_matrix(y_test, predicted))
    accuracy_DT = metrics.accuracy_score(y_test, predicted)
    print(accuracy_DT)
    # print(metrics.classification_report(y_test, predicted))
    print("##############################################")

    from sklearn.linear_model import SGDClassifier
    clf_SGD = SGDClassifier().fit(x_train, y_train)
    predicted = clf_SGD.predict(x_test)
    print("#################### SGD Classifier ######################")
    print(metrics.confusion_matrix(y_test, predicted))
    accuracy_SGD = metrics.accuracy_score(y_test, predicted)
    print(accuracy_SGD)
    print("##############################################")

    from pandas.core.frame import DataFrame
    predict_data = DataFrame(ps)
    print(predict_data)
    # Drop the first 7 columns so only the 38 feature columns remain
    predict_data = predict_data.iloc[:, 7:]
    print(predict_data)

    # Pick whichever classifier scored the highest test accuracy
    accuracy_list = [accuracy_NB, accuracy_DT, accuracy_SGD]
    if max(accuracy_list) == accuracy_NB:
        clf_model = clf_NB
    elif max(accuracy_list) == accuracy_DT:
        clf_model = clf_tree
    elif max(accuracy_list) == accuracy_SGD:
        clf_model = clf_SGD

    predicted = clf_model.predict(predict_data)
    print("************* model selection ****************")
    print(clf_model)
    pd.value_counts(predicted)
    print(predicted)
    print(type(predicted))
    return predicted.tolist()
def recommend(self, userID: int, portFolioModel: DataFrame,
              argumentsDict: Dict[str, object]):
    if type(userID) is not int and type(userID) is not np.int64:
        raise ValueError("Argument userID isn't type int.")
    if type(portFolioModel) is not DataFrame:
        raise ValueError("Argument portFolioModel isn't type DataFrame.")
    if type(argumentsDict) is not dict:
        raise ValueError("Argument argumentsDict isn't type dict.")

    numberOfItems: int = argumentsDict[self.ARG_NUMBER_OF_AGGR_ITEMS]

    recomItemIDsWithRspR1Ser: Series = self._recommender.recommend(
        userID, numberOfItems=numberOfItems, argumentsDict=argumentsDict)

    recomItemIDsAggr1: List[int]
    recomItemIDsWithRspAggr1: Series
    recomItemIDsAggr1, recomItemIDsWithRspAggr1 = self._portfolio1Aggr.recommend(
        userID, portFolioModel, argumentsDict=argumentsDict)
    print(recomItemIDsWithRspAggr1)

    aggrBanditsResp = countAggrBanditsResponsibility(
        recomItemIDsWithRspAggr1, portFolioModel)
    # aggrBanditsResp = countAggrDHondtResponsibility(dict(recomItemIDsWithRspAggr1), portFolioModel)
    aggrBanditsRespSer: Series = Series(dict(aggrBanditsResp))

    recomItemIDsNegativeSer: Series = Series(
        self._penaltyTool.getPenaltiesOfItemIDs(userID, self._history))
    if len(recomItemIDsNegativeSer) > 0:
        finalNegScores = normalize(
            np.expand_dims(recomItemIDsNegativeSer.values, axis=0))[0, :]
        recomItemIDsNegativeSer = Series(finalNegScores.tolist(),
                                         index=recomItemIDsNegativeSer.index)
    # print(a)
    # recomItemIDsNegative = normalize(np.expand_dims(recomItemIDsNegative, axis=0))[0, :]

    inputItemIDsDict: dict = {
        "input1": recomItemIDsWithRspR1Ser,
        "input2": aggrBanditsRespSer,
        "negative": recomItemIDsNegativeSer
    }

    aggItemIDsWithRelevanceSer: Series = self._aggrHier.runWithResponsibility(
        inputItemIDsDict, DataFrame(), userID, numberOfItems, argumentsDict)
    aggItemIDs: List[int] = list(aggItemIDsWithRelevanceSer.index)

    aggItemIDsWithRelevance: List = [
        (itemI, dict(recomItemIDsWithRspAggr1).get(itemI, {}))
        for itemI in aggItemIDs
    ]

    return (aggItemIDs, aggItemIDsWithRelevance)
def __init__(self, data=None, index=None, columns=None, default_kind=None,
             default_fill_value=None, dtype=None, copy=False):

    # pick up the defaults from the Sparse structures
    if isinstance(data, SparseDataFrame):
        if index is None:
            index = data.index
        if columns is None:
            columns = data.columns
        if default_fill_value is None:
            default_fill_value = data.default_fill_value
        if default_kind is None:
            default_kind = data.default_kind
    elif isinstance(data, (SparseSeries, SparseArray)):
        if index is None:
            index = data.index
        if default_fill_value is None:
            default_fill_value = data.fill_value
        if columns is None and hasattr(data, 'name'):
            columns = [data.name]
        if columns is None:
            raise Exception("cannot pass a series w/o a name or columns")
        data = {columns[0]: data}

    if default_fill_value is None:
        default_fill_value = np.nan
    if default_kind is None:
        default_kind = 'block'

    self._default_kind = default_kind
    self._default_fill_value = default_fill_value

    if is_scipy_sparse(data):
        mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
                                  fill_value=default_fill_value)
    elif isinstance(data, dict):
        mgr = self._init_dict(data, index, columns, dtype=dtype)
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, index, columns, dtype=dtype)
    elif isinstance(data, SparseDataFrame):
        mgr = self._init_mgr(data._data,
                             dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif isinstance(data, DataFrame):
        mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
    elif isinstance(data, Series):
        mgr = self._init_dict(data.to_frame(), data.index,
                              columns=None, dtype=dtype)
    elif isinstance(data, BlockManager):
        mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif data is None:
        data = DataFrame()

        if index is None:
            index = Index([])
        else:
            index = ensure_index(index)

        if columns is None:
            columns = Index([])
        else:
            for c in columns:
                data[c] = SparseArray(np.nan, index=index,
                                      kind=self._default_kind,
                                      fill_value=self._default_fill_value)
        mgr = to_manager(data, columns, index)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    else:
        msg = ('SparseDataFrame called with unknown type "{data_type}" '
               'for data argument')
        raise TypeError(msg.format(data_type=type(data).__name__))

    generic.NDFrame.__init__(self, mgr)
def get_chunk(self, rows=None):
    if rows is not None and self.skip_footer:
        raise ValueError('skip_footer not supported for iteration')

    try:
        content = self._get_lines(rows)
    except StopIteration:
        if self._first_chunk:
            content = []
        else:
            raise

    # done with first read, next time raise StopIteration
    self._first_chunk = False

    if len(content) == 0:  # pragma: no cover
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = Index([], name=self.index_name)
            else:
                index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                               names=self.index_name)
        else:
            index = Index([])

        return DataFrame(index=index, columns=self.columns)

    zipped_content = list(lib.to_object_array(content).T)

    # no index column specified, so infer that's what is wanted
    if self.index_col is not None:
        if np.isscalar(self.index_col):
            index = zipped_content.pop(self.index_col)
        else:  # given a list of index
            index = []
            for idx in self.index_col:
                index.append(zipped_content[idx])

            # remove index items from content and columns, don't pop in loop
            for i in reversed(sorted(self.index_col)):
                zipped_content.pop(i)

        if np.isscalar(self.index_col):
            if self.parse_dates:
                index = lib.try_parse_dates(index, parser=self.date_parser)
            index = Index(_convert_types(index, self.na_values),
                          name=self.index_name)
        else:
            arrays = []
            for arr in index:
                if self.parse_dates:
                    arr = lib.try_parse_dates(arr, parser=self.date_parser)
                arrays.append(_convert_types(arr, self.na_values))
            index = MultiIndex.from_arrays(arrays, names=self.index_name)
    else:
        index = Index(np.arange(len(content)))

    if not index._verify_integrity():
        dups = index.get_duplicates()
        raise Exception('Index has duplicates: %s' % str(dups))

    if len(self.columns) != len(zipped_content):
        raise Exception('wrong number of columns')

    data = dict((k, v) for k, v in zip(self.columns, zipped_content))

    # apply converters
    for col, f in self.converters.iteritems():
        if isinstance(col, int) and col not in self.columns:
            col = self.columns[col]
        data[col] = np.vectorize(f)(data[col])

    data = _convert_to_ndarrays(data, self.na_values)

    return DataFrame(data=data, columns=self.columns, index=index)
def granger_causality(self):
    """Returns the f-stats and p-values from the Granger Causality Test.

    If the data consists of columns x1, x2, x3, then we perform the
    following regressions:

    x1 ~ L(x2, x3)
    x1 ~ L(x1, x3)
    x1 ~ L(x1, x2)

    The f-stats of these results are placed in the 'x1' column of the
    returned DataFrame.  We then repeat for x2, x3.

    Returns
    -------
    Dict, where 'f-stat' returns the DataFrame containing the f-stats,
    and 'p-value' returns the DataFrame containing the corresponding
    p-values of the f-stats.
    """
    from pandas.stats.api import ols
    from scipy.stats import f

    d = {}
    for col in self._columns:
        d[col] = {}
        for i in xrange(1, 1 + self._p):
            lagged_data = self._lagged_data[i].filter(self._columns - [col])

            for key, value in lagged_data.iteritems():
                d[col][_make_param_name(i, key)] = value

    f_stat_dict = {}
    p_value_dict = {}

    for col, y in self._data.iteritems():
        ssr_full = (self.resid[col] ** 2).sum()

        f_stats = []
        p_values = []

        for col2 in self._columns:
            result = ols(y=y, x=d[col2])

            resid = result.resid
            ssr_reduced = (resid ** 2).sum()

            M = self._p
            N = self._nobs
            K = self._k * self._p + 1
            f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
            f_stats.append(f_stat)

            p_value = f.sf(f_stat, M, N - K)
            p_values.append(p_value)

        f_stat_dict[col] = Series(f_stats, self._columns)
        p_value_dict[col] = Series(p_values, self._columns)

    f_stat_mat = DataFrame(f_stat_dict)
    p_value_mat = DataFrame(p_value_dict)

    return {
        'f-stat': f_stat_mat,
        'p-value': p_value_mat,
    }
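The helper above predates statsmodels' built-in test; a comparable check today could use grangercausalitytests. This is a sketch with synthetic data, and the maxlag choice is illustrative.

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import grangercausalitytests

rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
x2 = np.roll(x1, 1) + rng.normal(scale=0.5, size=200)
df = pd.DataFrame({"x2": x2, "x1": x1})
# Tests whether lags of x1 help predict x2; reports F-stats and p-values
grangercausalitytests(df[["x2", "x1"]], maxlag=2)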
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type)
                      else '{prefix}{sep}{level}' for v in levels]
        dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
                      for dummy_str, v in zip(dummy_strs, levels)]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
                               sparse_index=IntIndex(N, ixs), fill_value=0,
                               dtype=np.uint8)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
                              default_fill_value=0, dtype=np.uint8)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def __init__(self, data=None, index=None, columns=None, default_kind=None,
             default_fill_value=None, dtype=None, copy=False):

    # pick up the defaults from the Sparse structures
    if isinstance(data, SparseDataFrame):
        if index is None:
            index = data.index
        if columns is None:
            columns = data.columns
        if default_fill_value is None:
            default_fill_value = data.default_fill_value
        if default_kind is None:
            default_kind = data.default_kind
    elif isinstance(data, (SparseSeries, SparseArray)):
        if index is None:
            index = data.index
        if default_fill_value is None:
            default_fill_value = data.fill_value
        if columns is None and hasattr(data, 'name'):
            columns = [data.name]
        if columns is None:
            raise Exception("cannot pass a series w/o a name or columns")
        data = {columns[0]: data}

    if default_fill_value is None:
        default_fill_value = np.nan
    if default_kind is None:
        default_kind = 'block'

    self._default_kind = default_kind
    self._default_fill_value = default_fill_value

    if isinstance(data, dict):
        mgr = self._init_dict(data, index, columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, index, columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, SparseDataFrame):
        mgr = self._init_mgr(data._data,
                             dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif isinstance(data, DataFrame):
        mgr = self._init_dict(data, data.index, data.columns)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    elif isinstance(data, BlockManager):
        mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif data is None:
        data = DataFrame()

        if index is None:
            index = Index([])
        else:
            index = _ensure_index(index)

        if columns is None:
            columns = Index([])
        else:
            for c in columns:
                data[c] = SparseArray(np.nan, index=index,
                                      kind=self._default_kind,
                                      fill_value=self._default_fill_value)

        mgr = to_manager(data, columns, index)

        if dtype is not None:
            mgr = mgr.astype(dtype)

    NDFrame.__init__(self, mgr)
def combine_rows(self, to_combined_df):
    """Merge two rows for the same user/date into one row, preferring the
    first row's value and falling back to the second when it is empty."""
    tmp_df = DataFrame()
    # Identifier columns are taken from the first row as-is.
    for col in ['User', 'Date', 'Day', 'MULT']:
        tmp_df[col] = to_combined_df[col].values[:1]
    # Every remaining field is coalesced: keep the first row's value unless
    # it is blank, in which case use the second row's value.
    coalesce_cols = [
        'NAPN', 'NAPT', 'ALN', 'ALT', 'CAFN', 'CAFT',
        'SMED', 'SMED1', 'SMED1T', 'SMED2', 'SMED2T', 'SMED3', 'SMED3T',
        'NOTEBB', 'ATTEMPT', 'BT', 'LO', 'WT', 'RT', 'SOL', 'SNZ',
        'TST', 'WASON', 'WASOT', 'EA', 'EAT', 'SQ', 'REST',
        'NOTEWU', 'TIB', 'SE1', 'SE2',
    ]
    for col in coalesce_cols:
        values = to_combined_df[col].fillna('').values
        tmp_df[col] = values[0] if values[0] != '' else values[1]
    return tmp_df
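# A vectorized equivalent for the two-row inputs this method targets (a
# sketch, assuming pandas >= 1.0 for pd.NA): blanks become missing, row 0
# inherits row 1's values via backfill, and only row 0 is kept.
import pandas as pd

def combine_rows_vectorized(to_combined_df):
    filled = to_combined_df.fillna('').replace('', pd.NA)
    return filled.bfill().iloc[[0]].fillna('')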
def _unstack_vector(self, vec, index=None):
    if index is None:
        index = self._y_trans.index
    panel = DataFrame(vec, index=index, columns=['dummy'])
    # Relies on the legacy Panel API (removed in pandas 0.25).
    return panel.to_panel()['dummy']
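# DataFrame.to_panel and Panel were removed in pandas 0.25; on modern
# pandas the same reshape is an unstack of a MultiIndexed Series (a sketch
# with hypothetical level names, assuming the index has two levels, as
# to_panel required).
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([[0, 1], ['x', 'y']], names=['major', 'minor'])
vec = np.arange(4.0)
print(pd.Series(vec, index=idx).unstack('minor'))  # minor level -> columns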
def get_empty_frame(data) -> DataFrame:
    if isinstance(data, Series):
        index = data.index
    else:
        index = np.arange(len(data))
    return DataFrame(index=index)
def __init__(self, records, columns):
    self.dataframe = DataFrame(records, columns=columns)
from pandas.core.frame import DataFrame
from printheader import print_header

cols = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
index = ['a', 'b', 'c', 'd', 'e', 'f']
values = [
    [100, 110, 120, 130, 140],
    [200, 210, 220, 230, 240],
    [300, 310, 320, 330, 340],
    [400, 410, 420, 430, 440],
    [500, 510, 520, 530, 540],
    [600, 610, 620, 630, 640],
]
print_header('values:')
print(values, '\n\n')

df = DataFrame(values, index=index, columns=cols)
print_header('DataFrame df')
print(df, '\n')

df2 = df.drop(['beta', 'delta'], axis=1)
print_header("After dropping beta and delta:")
print(df2, '\n')

print_header("After dropping rows b, c, and e")
# drop(..., inplace=True) mutates df and returns None, so its result must
# not be captured into a new name.
df.drop(['b', 'c', 'e'], inplace=True)
print(df)
print(df['a':'d'])  # label-based row slice over the remaining rows: a and d
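# The same drops written with the columns=/index= keywords added in pandas
# 0.21, which read more clearly than axis numbers; this reuses the names
# defined above.
df3 = DataFrame(values, index=index, columns=cols)
print(df3.drop(columns=['beta', 'delta']))
print(df3.drop(index=['b', 'c', 'e']))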
def makeBindexDataCsv(cls, cur=None, start_date=None, end_date=None,
                      basic_path=None, output_file=None, word_count=20,
                      stock_id=None, ranking_type='tfidf'):
    if cur is None or start_date is None or end_date is None \
            or output_file is None or stock_id is None:
        return None
    if basic_path is None:
        basic_path = os.path.dirname(os.path.abspath(__file__))
    if word_count < 0:
        word_count = 20
    if ranking_type not in ["tfidf", "textrank"]:
        ranking_type = "tfidf"

    output_path = os.path.join(basic_path, output_file)
    VTool.makeDirs(files=[output_path])

    # Fetch the top-ranked vocabulary words. The original hard-coded
    # count=20 here; passing word_count makes the parameter take effect.
    words = cls.getImportVocab(cur, count=word_count, ranking_type=ranking_type)
    word_count = len(words)
    # Quote each word for the SQL IN (...) clause below.
    for i in range(len(words)):
        words[i] = "'" + words[i] + "'"
    words_str = ",".join(words)
    del words

    word_key_list = []
    for i in range(1, word_count + 1):
        word_key_list.append("word%s" % i)
    columns = ["stock_id", "date", "opening", "closing", "difference",
               "percentage_difference", "lowest", "highest", "volume",
               "amount", "rate"] + word_key_list

    # Clear the output file and write the header row once; batches below
    # append with mode="a".
    data = {}
    for k in columns:
        data[k] = []
    pd.DataFrame(data).to_csv(output_path, index=False, columns=columns)

    cur.execute(
        "select count(*) as count from history where stock_id = '%s' and date between '%s' and '%s' "
        % (stock_id, start_date, end_date))
    count = cur.fetchall()
    count = count[0][0]

    skip = 50
    slimit = 0
    while slimit < count:
        # Fetch one extra row before the batch (limit slimit-1, skip+1) so
        # the first row's day-over-day rate can be computed; that seed row
        # is dropped later by dropna.
        cur.execute(
            "select stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date from history where stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d "
            % (stock_id, start_date, end_date,
               0 if slimit - 1 < 0 else slimit - 1,
               skip if slimit - 1 < 0 else skip + 1))
        slimit += skip

        history_tt = cur.fetchall()
        history_t = []
        for h in history_tt:
            history_t.append([
                int(h[0]), float(h[1]), float(h[2]), float(h[3]),
                float(h[4]), float(h[5]), float(h[6]), float(h[7]),
                float(h[8]), str(h[9])
            ])
        del history_tt

        # Extend the index window one day back so rates exist for the
        # batch's first trading day.
        sdate = str(history_t[0][9])
        edate = str(history_t[-1][9])
        sdate = datetime.datetime.strptime(sdate, '%Y-%m-%d')
        sdate = (sdate - datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        cur.execute(
            "select b.vocab_id, b.bindex, b.date from vocab v left join baidu_index b on v.id = b.vocab_id where v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc"
            % (words_str, sdate, edate))
        bindex = cur.fetchall()

        # Regroup the flat (vocab_id, bindex, date) rows into one list per
        # date: [date, word1_index, word2_index, ...].
        cur_date = None
        if len(bindex) > 0:
            cur_date = str(bindex[0][2])
        bix = []
        bix_item = [cur_date]
        if len(bindex) > 0:
            for bi in bindex:
                if str(bi[2]) != cur_date:
                    cur_date = str(bi[2])
                    bix.append(bix_item)
                    bix_item = [cur_date]
                bix_temp = json.loads(bi[1])
                bix_item.append(bix_temp['all']['0'])
            bix.append(bix_item)
        del bindex

        # Day-over-day percentage change of each word's Baidu index; fall
        # back to 0.01 when either day's value is missing.
        bindex = {}
        for k in range(1, len(bix)):
            b_t = []
            for kk in range(1, len(bix[k])):
                if int(bix[k][kk]) != 0 and int(bix[k - 1][kk]) != 0:
                    b_t.append(str(np.round(float(
                        100 * (int(bix[k][kk]) / int(bix[k - 1][kk]) - 1)), 2)))
                else:
                    b_t.append(str(0.01))
            bindex[bix[k][0]] = b_t
        del bix

        # Attach the word-index rates to each trading day by date.
        for i in range(len(history_t)):
            history_t[i] += bindex[history_t[i][9]]

        history_temp = []
        for h in zip(*history_t):
            history_temp.append(h)
        history = {
            'stock_id': history_temp[0],
            'opening': history_temp[1],
            'closing': history_temp[2],
            'difference': history_temp[3],
            'percentage_difference': history_temp[4],
            'lowest': history_temp[5],
            'highest': history_temp[6],
            'volume': history_temp[7],
            'amount': history_temp[8],
            'date': history_temp[9]
        }
        for i in range(10, 10 + word_count):
            history["word%s" % (i - 9)] = history_temp[i]
        del history_t, history_temp

        history = DataFrame(history)
        g_history = history.groupby(by=['stock_id'])
        # rate: 0.01 -> 1%, rounded to 2 decimal places
        history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                 g_history.shift(1)["closing"] - 1)
        history.dropna(axis=0, how='any', thresh=None, subset=None,
                       inplace=True)
        for i in history.index:
            history.loc[i, 'rate'] = str(np.round(float(history['rate'][i]), 2))

        # Append the normalized rows to the CSV in a form the training and
        # test pipelines can consume.
        def func_train_data(data_stock):
            if cls.groupby_skip == False:
                cls.groupby_skip = True
                return None
            print("Processing stock code: %06s" % data_stock.name)
            data = {}
            for k in columns:
                data[k] = []
            for i in range(len(data_stock) - 1):
                for k in data:
                    data[k].append(data_stock.iloc[i][k])
            pd.DataFrame(data).to_csv(output_path, index=False, header=False,
                                      mode="a", columns=columns)

        g_stock = history.groupby(by=["stock_id"])
        # groupby.apply may call the function on the first group twice;
        # reset the flag so the duplicate first call is skipped.
        cls.groupby_skip = False
        g_stock.apply(func_train_data)
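# A hypothetical invocation sketch: the connection settings, owning class
# name, and file name below are assumptions, not part of the code above.
# import pymysql
# conn = pymysql.connect(host='localhost', user='root', db='stocks')
# cur = conn.cursor()
# StockData.makeBindexDataCsv(cur=cur, start_date='2018-01-01',
#                             end_date='2018-12-31',
#                             output_file='bindex_train.csv',
#                             stock_id='000001', ranking_type='tfidf')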