def get_data_frame(self):
    data = DataFrame(self.data)
    # Column headers and the output filename are Chinese domain labels
    # (graduate admissions data); the empty placeholder column is dropped.
    data.columns = ['学校', '考试方式', '院系所', '', '专业', '学习方式',
                    '研究方向', '指导教师', '拟招生人数', '备注']
    data.drop(labels='', axis=1, inplace=True)
    data.to_csv(self.provinceName + "研究生招生信息.csv",
                encoding="utf_8_sig", index=False)
def process_data(data):
    drop_list = []  # indices of the rows we need to drop
    new_zip = []
    new_AMT = []
    new_DT = []
    # Make a dataframe with the full input header
    header = ['CMTE_ID', 'AMNDT_IND', 'RPT_TP', 'TRANSACTION_PGI', 'IMAGE_NUM',
              'TRANSACTION_TP', 'ENTITY_TP', 'NAME', 'CITY', 'STATE',
              'ZIP_CODE', 'EMPLOYER', 'OCCUPATION', 'TRANSACTION_DT',
              'TRANSACTION_AMT', 'OTHER_ID', 'TRAN_ID', 'FILE_NUM',
              'MEMO_CD', 'MEMO_TEXT', 'SUB_ID']
    df = DataFrame(data, columns=header)
    # Make a new dataframe with only the columns we need
    df = df[['CMTE_ID', 'NAME', 'ZIP_CODE', 'TRANSACTION_DT',
             'TRANSACTION_AMT', 'OTHER_ID']]
    # Convert the TRANSACTION_AMT strings to int
    for i in df['TRANSACTION_AMT']:
        new_AMT.append(int(i))
    df['TRANSACTION_AMT'] = new_AMT
    # Flag invalid rows (note: TRANSACTION_AMT was already cast to int
    # above, so the '' check below can never fire)
    for i in list(df.index.values):
        if (df.loc[i]['CMTE_ID'] == ''
                or validate_name(df.loc[i]['NAME']) == False
                or (len(df.loc[i]['ZIP_CODE']) >= 5) == False
                or validate_date(df.loc[i]['TRANSACTION_DT']) != True
                or df.loc[i]['TRANSACTION_AMT'] == ''
                or df.loc[i]['OTHER_ID'] != ''):
            drop_list.append(i)
        else:
            new_zip.append(df.loc[i]['ZIP_CODE'][0:5])
            new_DT.append(int(df.loc[i]['TRANSACTION_DT'][4:]))
    # Remove the invalid rows collected in drop_list
    df.drop(drop_list, inplace=True)
    # Replace with the truncated ZIP and the trailing digits of the date
    df['ZIP_CODE'] = new_zip
    df['TRANSACTION_DT'] = new_DT
    return df
def condense_heatmap(df_input: DataFrame, new_order: List[str]) -> DataFrame:
    """
    Convert the dataframe of stored enrichment scores into the condensed
    heatmap layout.
    """
    df_input = df_input.copy()
    df_input.drop(['Position'], axis=1, inplace=True)

    # Group by sequence and amino acid, then pivot
    df_grouped = df_input.groupby(['Sequence', 'Aminoacid'], sort=False).mean()
    df_pivoted = df_grouped.pivot_table(values='Score',
                                        index='Aminoacid',
                                        columns='Sequence')
    df_pivoted.reset_index(drop=False, inplace=True)

    # Sort the y axis into the desired order
    df_pivoted['Aminoacid'] = Categorical(df_pivoted['Aminoacid'], new_order)
    df_pivoted = df_pivoted.sort_values(by=['Aminoacid'])

    # Sort the x axis into the desired order
    x_order = return_common_elements(new_order, list(df_pivoted.columns))

    # Drop the amino acid column
    data_dropped = df_pivoted.drop(['Aminoacid'], axis=1)

    return data_dropped[x_order]
def _drop_single_point(move_data: DataFrame, label_new_tid: str, label_id: str):
    """
    Remove trajectories consisting of a single point.

    Parameters
    ----------
    move_data : dataframe
        dataframe with trajectories
    label_new_tid : str
        The label of the column containing the ids of the formed segments.
        This is the new split id.
    label_id : str
        Indicates the label of the id column in the user dataframe,
        by default TRAJ_ID
    """
    shape_before_drop = move_data.shape
    idx = move_data[move_data[label_new_tid] == -1].index
    if idx.shape[0] > 0:
        logger.debug('...Drop trajectories with a single GPS point\n')
        ids_before_drop = move_data[label_id].unique().shape[0]
        move_data.drop(index=idx, inplace=True)
        logger.debug('...Object - before drop: {} - after drop: {}'.format(
            ids_before_drop, move_data[label_id].unique().shape[0]))
        logger.debug('...Shape - before drop: {} - after drop: {}'.format(
            shape_before_drop, move_data.shape))
    else:
        logger.debug('...No trajectories with only one point.')
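# Hedged usage sketch for _drop_single_point (added for illustration): the
# frame, the 'tid'/'new_tid' column names and the coordinates are made up,
# and the module's `logger` is assumed to be configured.
import pandas as pd

trajectories = pd.DataFrame({
    'tid': [1, 1, 2, 3, 3],
    'new_tid': [10, 10, -1, 11, 11],  # -1 marks a single-point segment
    'lat': [0.0, 0.1, 0.2, 0.3, 0.4],
})
_drop_single_point(trajectories, label_new_tid='new_tid', label_id='tid')
# trajectories is modified in place; the single-point row (tid == 2) is gone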
def deleteHeadToTrain(data: DataFrame, rows: int) -> DataFrame:
    data = data.drop(range(0, rows), axis=0)
    # Vectorised filter; the original per-row drop discarded its result,
    # so no rows were ever removed.
    mask = ((data['HM'] < 3) | (data['P1'] < 4)
            | (data['AM'] < 3) | (data['P2'] < 4))
    return data[~mask]
def ListoDF(data):
    if isinstance(data, list):
        Df = DataFrame(data)                # convert the list to a dataframe
        Df.columns = Df.iloc[0, :]          # use the first row as column names
        Df.drop(0, axis=0, inplace=True)    # drop that header row
    else:
        Df = data
    return Df
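# Illustration of ListoDF's header promotion (the sample data is made up):
# the first row of the list becomes the column names.
rows = [['name', 'score'], ['ann', 90], ['bob', 85]]
df = ListoDF(rows)
# df now has columns ['name', 'score'] and the two data rows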
def _sort_yaxis_aminoacids(df_input: DataFrame, neworder_aminoacids: list,
                           old_order: list) -> DataFrame:
    # Sort the y axis into the desired order
    df_input['Aminoacid_new'] = old_order
    df_input['Aminoacid_new'] = Categorical(df_input['Aminoacid_new'],
                                            neworder_aminoacids)
    df_input.sort_values(by=['Aminoacid_new'], inplace=True)
    df_input.drop(['Aminoacid_new'], inplace=True, axis=1)

    return df_input
def find_matches(self, df: DataFrame, currentIdx: int, indexes: list,
                 currentPattern):
    # Ensure a count column exists for this pattern
    if not (str(currentPattern) in df.columns):
        df[str(currentPattern)] = 0
    # Fold each matching row's counts into the current row, then drop it
    for index in indexes:
        amount = df.at[index, 'totalcount']
        df.at[currentIdx, 'totalcount'] += amount
        df.at[currentIdx, str(currentPattern)] += amount
        df.drop(index, inplace=True)
    return df
def process_dataframe(df: DataFrame) -> DataFrame:
    df = df.drop(index=[0, 1, 4, len(df) - 1])

    df_member_df = df.loc[3]
    df_member_df = df_member_df.dropna()
    df_member_list = list()
    for i in list(df_member_df):
        if '\n' in i:
            temp_text = i.split('\n')
            if temp_text[0]:
                df_member_list.append(temp_text[0])

    df_column_pre = df.loc[2]
    df_column_pre = df_column_pre.drop(df_column_pre.tail(2).index)
    df_column_pre = df_column_pre.dropna()
    df_column = pd.concat([df_column_pre, pd.DataFrame(df_member_list)])
    df_column = df_column[~df_column[0].str.contains("후보자별 득표수")]

    df = df.drop(index=[2, 3])
    df_left = df[df.columns[:len(df_column)]]
    df_left = df_left.reset_index(drop=True)
    df_right = df[df.columns[-2:]]
    df_right = df_right.reset_index(drop=True)
    df_dataset = pd.merge(df_left, df_right, how='outer',
                          left_index=True, right_index=True)

    df_column = pd.concat([df_column, pd.DataFrame(["무효 투표수", "기권수"])])
    df_dataset.columns = [
        *list(df_column.reset_index().drop(columns=['index'])[0])
    ]
    # df_dataset['취소표'] = df_dataset['무효 투표수'].add(df_dataset['기권수'])
    df_dataset = df_dataset.drop(columns=['무효 투표수', '기권수'])
    df_dataset = df_dataset.drop(index=range(5, len(df_dataset)),
                                 columns=['투표구명'])
    return df_dataset
def _onehotItemsGenresML(self, items: DataFrame):
    one_hot_encoding = items["Genres"].str.get_dummies(sep='|')
    one_hot_encoding.drop(one_hot_encoding.columns[0], axis=1, inplace=True)
    tmp = items.drop(['Genres'], axis=1, inplace=False)
    return pd.concat([tmp, one_hot_encoding], axis=1)
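# Stand-alone illustration of the '|'-separated one-hot step used above
# (the titles and genres are invented for the example).
import pandas as pd

items = pd.DataFrame({
    'Title': ['A', 'B'],
    'Genres': ['Action|Comedy', 'Comedy|Drama'],
})
print(items['Genres'].str.get_dummies(sep='|'))
#    Action  Comedy  Drama
# 0       1       1      0
# 1       0       1      1
# The method above then drops the first dummy column before concatenating.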
def analysis(results: DataFrame):
    df = results.drop(['trial_iteration', 'rsi_idx'], axis=1)
    uniq_names = df['trial_name'].nunique()
    tests_per_name = len(df) / uniq_names

    cor_df = df[df['correct'] == True].groupby('trial_name')['correct']
    pct_correct_by_type = cor_df.value_counts() / tests_per_name * 100
    std_dev_by_type = df.groupby('trial_name')['correct'].std()

    adf = pct_correct_by_type.to_frame().join(
        std_dev_by_type, on='trial_name',
        lsuffix="_percent", rsuffix="_stddev"
    ).reset_index().drop('correct', axis=1).set_index('trial_name')
    adf['p-value'] = pandas.NA

    # f-test against the Unweighted_Disalike baseline
    unw_d = df[df['trial_name'] == "Unweighted_Disalike"]['correct']
    p1 = unw_d.to_numpy()
    for name in df['trial_name'].unique():
        if name == "Unweighted_Disalike":
            continue
        other_df = df[df['trial_name'] == name]['correct']
        p2 = other_df.to_numpy()
        f, p = ftest(p1, p2)
        adf.at[name, 'p-value'] = p
    return adf
def create_paths_models(df):
    # Models will be indexed at 0, 64, 128, ...
    lines = df.line.unique()
    jump = 64
    new_df = DataFrame()
    for line in lines:
        print('line:', line)
        current_df = df[df.line == line]
        # DataFrame.append was removed in pandas 2.0; concat is equivalent
        new_df = pd.concat([new_df, current_df.iloc[[0]]])
        possible_index_paths = current_df.index_path.unique()
        total = possible_index_paths[-1] - possible_index_paths[0]
        count = 0
        for index_path in range(possible_index_paths[0],
                                possible_index_paths[-1], jump):
            count += (1 * jump)
            current_path = current_df[current_df.index_path == index_path]
            print(count / total * 100)
            for row in current_path.iterrows():
                row = row[1]
                if not has_distance_from_coordinate(
                        new_df[new_df.line == line], row,
                        distance=minimum_distance):
                    new_df = pd.concat([new_df, row.to_frame().T])
    return new_df.drop(['index_path', 'order'], axis=1)
def extract_parnoise():
    # Extract the data
    path = 'D:/yansixing/tmp'
    source = 'parnoise_data.csv'
    poslist, neglist = [], []
    with open(os.path.join(path, source), 'r', encoding='gbk',
              errors='ignore', newline='') as f:
        for line in f.readlines():
            pos = None
            if ',' in line:
                neg, pos = tuple(line.split(','))
            else:
                neg = line
            if pos is not None and len(pos.strip()) > 1:
                poslist.append(pos.strip())
            if len(neg.strip()) > 1:
                neglist.append(neg.strip())
    targets = [1] * len(poslist) + [0] * len(neglist)
    sent = poslist + neglist
    df = DataFrame({'target': targets, 'sent': sent})
    print('Data shape : ', df.shape)
    print(df.head())

    # pattern = re.compile(r'([\u4e00-\u9fa5])')
    df['sent'] = df['sent'].apply(
        lambda x: ''.join(w.strip()
                          for w in re.findall(r'[\u4e00-\u9fa5]', x)
                          if len(w.strip()) > 0))

    # Feature-extraction mode: use a single feature
    print('Get features')

    def getFeature(X):
        m = getPikcle(os.path.join(const.PKPATH, 'lm_3_paopao_jieba.pk'))
        X['sent'] = X['sent'].apply(lambda x: str(x))
        X['sent'] = X['sent'].apply(
            lambda x: ' '.join(w for w in jieba.cut(x)))
        X['3n_etp_n_jieba'] = X['sent'].apply(
            lambda x: m.entropy(ngrams(x, 3, True, True, '<s>', '</s>'))
            if m.entropy(ngrams(x, 3, True, True, '<s>', '</s>')) != float('inf')
            else -1)
        return X

    df = getFeature(df)
    print(df.head())
    df.drop(['sent'], axis=1).to_csv(
        os.path.join(const.DATAPATH, 'parnoise_feats.csv'))

    # Write out in fastText format
    print('Get fasttext')
    with open(os.path.join(const.DATAPATH, 'parnoise_fasttext.txt'), 'a',
              encoding='utf-8', errors='ignore') as f:
        for x, y in zip(df['target'].values.tolist(),
                        df['sent'].values.tolist()):
            line = '{0}\t__label__{1}\n'.format(y, x)
            f.write(line)
def splitDataRandom(data: DataFrame, split_ratio: float,
                    drop_second: bool = False) -> Tuple[DataFrame, DataFrame]:
    rows = data.shape[0]
    pool = [x[0] for x in data.iterrows()]
    chosen = []
    for _ in range(int(rows * split_ratio)):
        i = randint(0, len(pool) - 1)
        chosen.append(pool[i])
        pool.pop(i)
    data1 = data.drop(pool)
    if drop_second:
        data2 = None
    else:
        data2 = data.drop(chosen)
    return (data1, data2)
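# Hedged usage sketch for splitDataRandom (assumes the function above and
# its imports are in scope; the sample frame is made up).
import pandas as pd

frame = pd.DataFrame({'a': range(10)})
train, test = splitDataRandom(frame, split_ratio=0.7)
# train holds 7 randomly chosen rows, test the remaining 3;
# pass drop_second=True to get (train, None) and skip building test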
def chooseNAttributes(data: DataFrame, n: int, class_label: str) -> DataFrame:
    attributes = [x for x in data.columns]
    attributes.remove(class_label)
    # Pop n random attribute names; the popped ones survive, the rest are
    # dropped below
    for i in range(n):
        attributes.pop(randint(0, len(attributes) - 1))
    return data.drop(attributes, axis=1)
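# Sketch clarifying the pop-then-drop logic above (the sample frame is
# made up): the result keeps n random feature columns plus the class label.
import pandas as pd

frame = pd.DataFrame({'f1': [1], 'f2': [2], 'f3': [3], 'label': [0]})
reduced = chooseNAttributes(frame, n=2, class_label='label')
# reduced has 'label' plus 2 of the 3 feature columns, chosen at random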
def _prepare_dataset(self, df: DataFrame) -> DataFrame:
    print('Preparing dataset...')
    df = df[['nomenclature', 'description', 'turnover']]
    df = df.drop(df[df.turnover.isnull()].index)
    df = df.fillna('')
    df = self._extract_unique_dataset(df)
    df = self._remove_rare_targets(df)
    print('├── Complete')
    return df
def pre_processing(df: DataFrame):
    """
    input : a data frame
    outputs:
        clean data frame
        dtypes.txt : a file listing the type of each column
        database: information.sqlite, with tables:
            information : the clean, scaled data frame
            before_process : the data before processing
            missing_information : missing-data summary from missing_data()
            outliers : the outlier rows
            describe : describe() of the clean data
    Description:
        drop unused columns (status_id, status_published, Column1..Column4)
        drop rows with null values
        drop outlier rows with the IQR method (drop_numerical_outliers)
        label-encode and robust-scale the result
        save the intermediate results in the database
    """
    sql_manager = SqlManager("information.sqlite")
    df.to_sql(name="before_process", con=sql_manager.conn, if_exists="replace")
    missing_data_df = missing_data(df)
    missing_data_df.to_sql(name="missing_information", con=sql_manager.conn,
                           if_exists="replace")
    df = df.drop(columns=[
        "status_id", "status_published", 'Column1', "Column2", "Column3",
        "Column4"
    ])
    main_df = df.dropna()
    print(main_df.shape)
    outliers_df, main_df = drop_numerical_outliers(main_df)
    main_df = main_df[main_columns]
    outliers_df.to_sql(name="outliers",
                       con=SqlManager("information.sqlite").conn,
                       if_exists="replace", index=False)
    main_df.to_sql(name="after_clear",
                   con=SqlManager("information.sqlite").conn,
                   if_exists="replace", index=False)
    label_encode(main_df)
    scaled_df = DataFrame(preprocessing.robust_scale(main_df),
                          columns=main_columns)
    scaled_df.to_sql(name="information",
                     con=SqlManager("information.sqlite").conn,
                     if_exists="replace", index=False)
    print(main_df.shape)
    main_df.describe().to_sql(name="describe", con=sql_manager.conn,
                              if_exists='replace')
    create_folder("outs")
    with open("outs\\dtypes.txt", "w") as file:
        file.write(str(main_df.dtypes))
    return main_df
def testMyMetric(real, pred):
    metrics = eval_my_metric(real, pred, debug=1)
    df = DataFrame(metrics)
    print(df)
    df = df.drop(['tp', 'fp', 'fn'])
    spiderchart.plot(df, [0.25, .5, .75])
    # plotSpiderChart(df[df.index=='precision'], [0.25, .5, .75])
    # plotSpiderChart(df[df.index=='f1'], [0.25, .5, .75])
    print(metrics)
def _prepare_dataset(self, df: DataFrame) -> DataFrame:
    print('Preparing dataset...')
    df = df[['object', 'financing', 'project', 'budget']]
    df = df.drop(df[df.budget.isnull()].index)
    df = df.fillna('')
    df = self._replace_year_specific_targets(df)
    df = self._extract_unique_dataset(df)
    df = self._remove_rare_targets(df)
    print('├── Complete')
    return df
def to_X(self, dataframe: DataFrame) -> DataFrame:
    dataframe = dataframe.drop(columns=self.params['Y_field'],
                               errors='ignore')
    dataframe = self.X_features(dataframe)
    # noinspection PyProtectedMember
    dataframe = dataframe._get_numeric_data()  # BUGFIX: linear regression crashes if given non-numeric inputs
    dataframe = dataframe.fillna(0)  # allow X_features() to do a custom fillna() first
    return dataframe
def drop_cols(df: DataFrame,
              drop_cols: List[str] = [
                  "Id", "Credit Score", "Purpose", "Home Ownership", "Term"
              ],
              extra_drop_cols: List[str] = []) -> DataFrame:
    """
    Drop the specified columns (by default Id, Credit Score, Purpose,
    Home Ownership and Term, but these can be given manually if required).
    We also drop columns manipulated by the other functions because,
    regardless of whether we add in the altered version, we want to drop
    the unclean version.
    extra_drop_cols is an easy way to append columns without changing the
    default list.
    """
    if extra_drop_cols:
        # Build a new list rather than extend(), so the mutable default
        # argument does not accumulate extras across calls
        drop_cols = drop_cols + extra_drop_cols
    df.drop(drop_cols, axis=1, inplace=True)
    return df
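# Hedged usage sketch for drop_cols (the sample frame and its extra
# columns are invented): the defaults plus extra_drop_cols are removed.
import pandas as pd

frame = pd.DataFrame({'Id': [1], 'Credit Score': [700], 'Purpose': ['x'],
                      'Home Ownership': ['own'], 'Term': ['short'],
                      'Loan Amount': [1000], 'Notes': ['n/a']})
cleaned = drop_cols(frame, extra_drop_cols=['Notes'])
# only 'Loan Amount' remains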
def __init__(self, data: DataFrame, k: int, random: bool = False):
    self.k = k
    self.packs = []
    test_packs = []
    if random:
        test_packs = splitDataToPacksRandom(data, k)
    else:
        test_packs = splitDataToPacksSequencial(data, k)
    # Each fold's training set is everything outside its test pack
    for test_pack in test_packs:
        drop_rows = [x[0] for x in test_pack.iterrows()]
        train_pack = data.drop(drop_rows)
        self.packs.append((train_pack, test_pack))
def print_cluster_scatterplot(df: DataFrame, centroids: list):
    """Only works for the TwoDimHard dataset"""
    # Cluster palette
    colors = [
        'green', 'orange', 'blue', 'purple', 'tan', 'yellowgreen',
        'royalblue', 'mediumvioletred', 'pink', 'salmon'
    ]

    # x = []
    # y = []
    # for i, centroid in enumerate(centroids):
    #     # Centroid position
    #     x.append(centroid.position[0])
    #     y.append(centroid.position[1])
    #     # Plot points in the centroid
    #     subset = df.loc[df['_cluster'] == i]
    #     plt.scatter(subset['X.1'], subset['X.2'], c=colors[i], s=5)
    # # add + markers for all centroids
    # plt.scatter(x, y, c='red', marker='+', s=50)

    y = df['_cluster']
    x = df.drop(['ID', '_cluster', '_distance'], axis=1)
    x_norm = (x - x.min()) / (x.max() - x.min())

    pca = PCA(n_components=2)
    transformed = DataFrame(pca.fit_transform(x_norm))

    # lda = LDA(n_components=2)
    # transformed = DataFrame(lda.fit_transform(x_norm, y))

    for i, centroid in enumerate(centroids):
        plt.scatter(transformed[y == i][0], transformed[y == i][1],
                    label='Class ' + str(i), c=colors[i], s=5)

    plt.legend()
    plt.show()
def get_xy(df: DataFrame, target_name: str, standardise_data: bool = False,
           train_size: float = 1.0, seed: int = 123) -> Results:
    X = df.drop(target_name, axis=1).values
    y = df[target_name].values.reshape(-1, 1)
    if train_size < 1.:
        Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=seed,
                                              train_size=train_size,
                                              shuffle=False)
    else:
        Xtr, ytr = X, y
        Xte, yte = None, None
    # Default the scaling stats so the return below is always defined
    Xmean, Xstd = None, None
    if standardise_data:
        Xtr, Xmean, Xstd = standardise(Xtr)
        if Xte is not None:
            Xte = standardise(Xte, Xmean, Xstd)
    return Results((Xtr, Xte, ytr, yte), (Xmean, Xstd))
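# Sketch of a typical call (assumes Results, train_test_split and
# standardise from the surrounding module are in scope; the frame is
# made up).
import pandas as pd

frame = pd.DataFrame({'x1': range(10), 'x2': range(10), 'y': range(10)})
res = get_xy(frame, target_name='y', train_size=0.8)
# res wraps (Xtr, Xte, ytr, yte) and (Xmean, Xstd); the latter pair is
# (None, None) when standardise_data is left off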
def preprocess(df: DataFrame) -> DataFrame:
    """Add extra attributes for tracking clusters and distances"""
    zeroes = np.zeros(len(df))
    df = df.assign(_cluster=Series(zeroes))
    df = df.assign(_distance=Series(zeroes))

    # Remove columns that we don't cluster on.
    # This is built for the wine dataset and TwoDimHard.
    # It'd be nice if this were more intelligent, but alas.
    for column in IGNORED_COLUMNS:
        if column in df:
            df = df.drop(column, axis=1)

    # Normalize non-ID columns
    ids = []
    for column in df.columns:
        if column != 'ID':
            ids.append(column)
    df[ids] = df[ids].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    return df
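# Minimal sketch (assumes IGNORED_COLUMNS and the function's imports are
# in scope): preprocess() returns a new frame rather than mutating its
# argument, so the result must be rebound.
import pandas as pd

raw = pd.DataFrame({'ID': [1, 2], 'f1': [0.0, 10.0], 'f2': [5.0, 15.0]})
clean = preprocess(raw)  # raw itself is left unchanged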
def _kernel_data_preparation(data: DataFrame,
                             cutoff: float) -> Tuple[npt.NDArray, npt.NDArray]:
    """
    Copy the data, eliminate the stop codon, eliminate values outside
    [-cutoff, cutoff], flatten, and eliminate np.nan. Returns the data in
    that format together with the fitted kernel.
    """
    # Eliminate the stop codon
    data_corrected: npt.NDArray = np.array(
        data.drop('*', errors='ignore').copy())

    # Eliminate values outside the cutoff
    data_corrected = data_corrected[(data_corrected >= -cutoff)
                                    & (data_corrected <= cutoff)]

    # Get rid of np.nan values (the mask above already flattened to 1-D)
    data_corrected = data_corrected[np.invert(np.isnan(data_corrected))]

    # Fit the gaussian kernel
    kernel_processed_data = gaussian_kde(data_corrected)

    return data_corrected, kernel_processed_data
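# Hedged sketch of a call (assumes a scores frame indexed by amino acid
# with a '*' stop-codon row, as the docstring implies; the values are
# random).
import numpy as np
import pandas as pd

scores = pd.DataFrame(np.random.uniform(-2, 2, size=(21, 5)),
                      index=list('ACDEFGHIKLMNPQRSTVWY*'))
flat, kde = _kernel_data_preparation(scores, cutoff=1.0)
# flat is a 1-D array of in-range scores; kde is the fitted gaussian_kde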
class PandasBackend(DataBackend):
    _data: DataFrame
    _index: PandasIndex
    _loc: _LocIndexer
    _iloc: _ILocIndexer

    def __init__(
        self,
        data: Optional[Union[Series, DataFrame, dict[str, list]]] = None,
        index: Optional[PandasIndex] = None,
    ) -> None:
        if data is None:
            self._data = DataFrame(dtype="object")
        elif type(data) is Series:
            self._data = cast(Series, data).to_frame().transpose()
        elif type(data) is DataFrame:
            self._data = DataFrame(data)
        elif type(data) is dict:
            sample_value = next(iter(data.values()))
            if not isinstance(sample_value, Iterable) or isinstance(
                    sample_value, str):
                self._data = Series(data).to_frame().transpose()
            else:
                self._data = DataFrame(data)
        else:
            raise ValueError(
                f"Received unexpected value type {type(data)}: {data}")
        if index is None:
            self._data.index.name = "index"
            self._index = PandasIndex(self._data.index, [])
        else:
            if not isinstance(index, PandasIndex):
                index = PandasIndex(index)
            self._data.index = index._data
            self._index = index
        self._loc = _LocIndexer(self)
        self._iloc = _ILocIndexer(self)

    def is_link(self) -> bool:
        return False

    def link_token(self) -> Optional[DataToken]:
        return None

    def to_pandas(self) -> DataFrame:
        return self._data

    @property
    def columns(self) -> list[str]:
        return self._data.columns.tolist()

    @property
    def values(self) -> np.ndarray:
        data_values = self._data.values
        shape = data_values.shape
        if shape[1] == 1:
            return np.squeeze(data_values, axis=1)
        elif shape[0] == 1:
            return np.squeeze(data_values, axis=0)
        else:
            return data_values

    @property
    def dtypes(self) -> dict[str, DataType]:
        return {
            col: DataType(dtype)
            for col, dtype in self._data.dtypes.items()
        }

    def cast_columns(self, column_dtypes: dict[str, type]) -> PandasBackend:
        return PandasBackend(self._data.astype(column_dtypes, errors="ignore"))

    def to_dict(self) -> dict[str, any]:
        return self._data.to_dict("list")

    @property
    def index(self) -> Index:
        return self._index

    @property
    def index_name(self) -> Union[str, list[str]]:
        return self._data.index.name

    @property
    def loc(self: PandasBackend) -> LocIndexer[PandasBackend]:
        return self._loc

    @property
    def iloc(self: PandasBackend) -> ILocIndexer[PandasBackend]:
        return self._iloc

    def equals(self, other: PandasBackend) -> bool:
        if type(other) is not PandasBackend:
            return False
        return np.array_equal(self._data.values,
                              other._data.values) and self._index.equals(
                                  other._index)

    def __eq__(self, other) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data == other

    def __ne__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data != other

    def __gt__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data > other

    def __ge__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data >= other

    def __lt__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data < other

    def __le__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data <= other

    def __len__(self) -> int:
        return len(self._data)

    def __iter__(self) -> Generator[str, None, None]:
        return iter(self._data)

    def iterrows(self) -> Generator[tuple[int, PandasBackend], None, None]:
        for i, row in self._data.iterrows():
            yield (i, PandasBackend(row.to_frame().transpose()))

    def itertuples(self, ignore_index: bool = False):
        for values in self._data.itertuples(index=not ignore_index):
            yield values
    def __getitem__(self, item: str) -> Any:
        return PandasBackend(self._data[item].to_frame())

    def getitems(self, items: list[str]) -> PandasBackend:
        return PandasBackend(self._data[items])

    def getmask(self, mask: list[bool]) -> PandasBackend:
        return PandasBackend(self._data[mask])

    def query(self, query: "Query") -> PandasBackend:
        from tanuki.database.adapter.query.pandas_query_compiler import PandasQueryCompiler
        query_compiler = PandasQueryCompiler(self._data)
        query = query_compiler.compile(query)
        return PandasBackend(self._data[query])

    def __setitem__(self, items: str, value: Any) -> None:
        if isinstance(value, PandasBackend):
            value = value._data
        self._data[items] = value

    def get_index(self, index_alias: IndexAlias) -> Index:
        cols = [str(col) for col in index_alias.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index_alias.name
        return PandasIndex(new_data.index, cols)

    def set_index(self, index: Union[Index, IndexAlias]) -> PandasBackend:
        cols = [str(col) for col in index.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index.name
        new_index = PandasIndex(new_data.index, cols)
        return PandasBackend(new_data, new_index)

    def reset_index(self: PandasBackend) -> PandasBackend:
        new_data = self._data.reset_index(drop=True)
        new_data.index.name = "index"
        new_index = PandasIndex(new_data.index, [])
        return PandasBackend(new_data, new_index)

    def append(
        self: PandasBackend,
        new_backend: PandasBackend,
        ignore_index: bool = False,
    ) -> PandasBackend:
        # DataFrame.append was removed in pandas 2.0; concat is equivalent
        return PandasBackend(
            pd.concat([self._data, new_backend._data],
                      ignore_index=ignore_index))

    def drop_indices(self: PandasBackend, indices: list[int]) -> PandasBackend:
        return PandasBackend(self._data.drop(indices))

    @classmethod
    def concat(
        cls: type[PandasBackend],
        all_backends: list[PandasBackend],
        ignore_index: bool = False,
    ) -> PandasBackend:
        all_data = [backend._data for backend in all_backends]
        return PandasBackend(pd.concat(all_data, ignore_index=ignore_index))

    def nunique(self) -> int:
        return self._data.nunique()

    def __str__(self) -> str:
        return str(self._data)

    def __repr__(self) -> str:
        return str(self)
        continue
    tokens = line.split(",")
    domain = tokens[0]
    label = tokens[1]
    DomainLen.append(len(domain))
    Numbers.append(NumCollect(domain))
    Entropy.append(LettersEntropy(domain))
    if label == 'notdga':
        Type.append(0)
    else:
        Type.append(1)

traindata = {'Length': DomainLen, 'NumInDomain': Numbers,
             'Entropy': Entropy, 'Type': Type}
traindata = DataFrame(traindata)

# Build the training set
y = traindata.Type
x = traindata.drop('Type', axis=1)
xtrain = x
ytrain = y

# Read the test.txt file
testDomainLen = []
testNumbers = []
testEntropy = []
testType = []
testDomainName = []
TestFile = open(r'test.txt')
for line in TestFile:
    line = line.strip()
    if line == "":
        continue
    testDomainName.append(line)
    testDomainLen.append(len(line))
'''
Created on Sun May 19 20:42:32 2013
'''
from pandas.core.frame import DataFrame
from printheader import print_header

cols = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
index = ['a', 'b', 'c', 'd', 'e', 'f']
values = [
    [100, 110, 120, 130, 140],
    [200, 210, 220, 230, 240],
    [300, 310, 320, 330, 340],
    [400, 410, 420, 430, 440],
    [500, 510, 520, 530, 540],
    [600, 610, 620, 630, 640],
]

print_header('values:')
print(values, '\n\n')

df = DataFrame(values, index=index, columns=cols)
print_header('DataFrame df')
print(df, '\n')

df2 = df.drop(['beta', 'delta'], axis=1)
print_header("After dropping beta and delta:")
print(df2, '\n')

print_header("After dropping rows b, c, and e")
df3 = df.drop(['b', 'c', 'e'], axis=0)
print(df3)
def lundong(t, temp2, temp3, temp4, mairude, maichude, zhengchangde):
    '''
    Rotation strategy.
    Takes seven arguments: all trading dates, the buy dates, the sell
    dates, the no-action dates, and the number of stocks on each of those
    last three kinds of date.
    On a buy date, split the available cash evenly across the stocks and
    buy at the open price. On a sell date, sell all holdings at the close
    price. On a no-action date, just revalue the holdings at that day's
    close.
    '''
    keyongzijin = cash  # available funds start at `cash`
    chigujiazhi = 0  # holdings value starts at 0
    keyongzijin_1 = []  # histories, appended to after every update (same below)
    chigujiazhi_1 = []
    chigushuliang = []  # number of shares held per stock
    pp = 0
    kk = 0
    gg = 0
    tt = 0  # cursor into mairude
    cc = 0  # cursor into maichude
    zz = 0  # cursor into zhengchangde
    print('-------- rotating, initial funds: {0} ---------'.format(keyongzijin))
    # The original used .ix, which was removed from pandas; .iloc is the
    # positional equivalent.
    for i in range(len(t)):
        if (t.iloc[i, 1] == 'T' and t.iloc[i, 2] == 'F'):
            print('Buy on trading day {0}'.format(i))
            goumaizijin = keyongzijin / mairude[tt]  # funds per stock, split evenly
            for l in range(mairude[tt]):
                chigushuliang_1 = (goumaizijin / temp3.iloc[l + pp, 2])
                chigushuliang.append(chigushuliang_1)
            keyongzijin = 0  # all cash spent after buying
            chigujiazhi = goumaizijin * mairude[tt]
            pp = pp + mairude[tt]
            tt = tt + 1
            keyongzijin_1.append(keyongzijin)  # record this step's result
            chigujiazhi_1.append(chigujiazhi)
        elif (t.iloc[i, 1] == 'F' and t.iloc[i, 2] == 'T'):
            print('Sell on trading day {0}'.format(i))
            chigujiazhi = 0
            chucunzijin = []
            for y in range(maichude[cc]):
                keyongzijin_2 = chigushuliang[y] * temp2.iloc[y + kk, 3]
                chucunzijin.append(keyongzijin_2)
            kk = kk + maichude[cc]
            cc = cc + 1
            keyongzijin = sum(chucunzijin)
            keyongzijin_1.append(keyongzijin)
            chigujiazhi_1.append(chigujiazhi)
        elif (t.iloc[i, 1] == 0 and t.iloc[i, 2] == 0 and t.iloc[i, 3] == 0):
            print('No action on trading day {0}'.format(i))
            xianyoujiazhi = []
            for z in range(zhengchangde[zz]):
                xianyoujiazhi_1 = chigushuliang[z] * temp4.iloc[z + gg, 3]
                xianyoujiazhi.append(xianyoujiazhi_1)
            gg = gg + zhengchangde[zz]
            chigujiazhi = sum(xianyoujiazhi)
            keyongzijin_1.append(keyongzijin)
            chigujiazhi_1.append(chigujiazhi)
        # else:
        #     print('ok')
        #     keyongzijin_1.append(keyongzijin)
        #     chigujiazhi_1.append(chigujiazhi)
    c = {"可用资金": keyongzijin_1, "持股价值": chigujiazhi_1}
    data1 = DataFrame(c)
    data1.insert(0, '日期', t['日期'])
    data1['总资产'] = data1['可用资金'] + data1['持股价值']  # total assets = cash + holdings
    data1 = data1.drop(len(data1) - 1)  # drop the last row
    return data1