class Ops2:

    def setup(self):
        N = 10**3
        self.df = DataFrame(np.random.randn(N, N))
        self.df2 = DataFrame(np.random.randn(N, N))

        self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
                                                  np.iinfo(np.int16).max,
                                                  size=(N, N)))
        self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
                                                   np.iinfo(np.int16).max,
                                                   size=(N, N)))

        self.s = Series(np.random.randn(N))

    # Division

    def time_frame_float_div(self):
        self.df // self.df2

    def time_frame_float_div_by_zero(self):
        self.df / 0

    def time_frame_float_floor_by_zero(self):
        self.df // 0

    def time_frame_int_div_by_zero(self):
        self.df_int / 0

    # Modulo

    def time_frame_int_mod(self):
        self.df_int % self.df2_int

    def time_frame_float_mod(self):
        self.df % self.df2

    # Dot product

    def time_frame_dot(self):
        self.df.dot(self.df2)

    def time_series_dot(self):
        self.s.dot(self.s)

    def time_frame_series_dot(self):
        self.df.dot(self.s)
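# A minimal sketch (not part of the benchmark class above) of the semantics the
# dot-product benchmarks exercise: DataFrame.dot aligns the columns of the left
# operand with the index of the right operand, so with default integer labels it
# matches the plain NumPy product.
import numpy as np
from pandas import DataFrame, Series

df = DataFrame(np.random.randn(4, 4))
s = Series(np.random.randn(4))
assert np.allclose(df.dot(s).values, df.values.dot(s.values))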
def numpy_dot():
    '''
    Imagine a point system in which each country is awarded 4 points for each
    gold medal, 2 points for each silver medal, and one point for each bronze
    medal.

    Using the numpy.dot function, create a new dataframe called
    'olympic_points_df' that includes:
        a) a column called 'country_name' with the country name
        b) a column called 'points' with the total number of points the
           country earned at the Sochi olympics.
    '''
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    # YOUR CODE HERE
    olympic_medal_counts_df = {'gold': Series(gold),
                               'silver': Series(silver),
                               'bronze': Series(bronze)}
    vector = [4, 2, 1]
    df = DataFrame(olympic_medal_counts_df)
    df = df[['gold', 'silver', 'bronze']]
    print(df)
    points = df.dot(vector)
    olympic_points_df = DataFrame({'country_name': Series(countries),
                                   'points': Series(points)})
    return olympic_points_df
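# For comparison, a sketch (not part of the exercise answer above) of the same
# scoring done directly with numpy.dot on a medal matrix; it assumes the gold,
# silver and bronze lists defined inside numpy_dot are in scope.
import numpy as np

medals = np.array([gold, silver, bronze]).T   # shape (26, 3): one row per country
points = medals.dot([4, 2, 1])                # 4*gold + 2*silver + 1*bronze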
def metabolite_distance(model, model_biomass=None, drop_metabolites=None):
    # Calculate metabolite-reaction distance matrix
    V = model.get_stoichiometric_matrix()

    # Remove exchange and biomass reactions
    if model_biomass:
        V = V.drop(np.append(model.get_exchanges(check_matrix=True), model_biomass), axis=1)
    else:
        V = V.drop(model.get_exchanges(check_matrix=True), axis=1)

    # Convert floating stoichiometric values to 1
    V = DataFrame([[i if i == 0 else 1 for i in j] for j in V.values],
                  index=V.index, columns=V.columns)

    # Remove highly connected metabolites
    if drop_metabolites is not None:
        V = V.drop(drop_metabolites)

    # Multiply stoichiometric matrix by its transpose
    M = V.dot(V.T).abs()

    # Get shortest path lengths for all metabolites (dict() also covers
    # networkx >= 2.0, where all_pairs_dijkstra_path_length returns a generator)
    G = DataFrame(dict(nx.all_pairs_dijkstra_path_length(
        nx.from_numpy_matrix(M.values, create_using=nx.DiGraph()))))
    G = G.set_index(M.index)
    G.columns = M.index

    return G
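# Usage sketch: the `model` object and its helpers (get_stoichiometric_matrix,
# get_exchanges) come from the surrounding framework; the reaction and
# metabolite identifiers below are purely illustrative.
# dist = metabolite_distance(model,
#                            model_biomass='biomass_reaction',
#                            drop_metabolites=['h_c', 'h2o_c', 'atp_c'])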
def _measure_cos_sim(columns_set: pd.DataFrame, rows_set: pd.DataFrame):
    # Pairwise dot products between the rows of rows_set and the rows of columns_set
    similarity_matrix = rows_set.dot(columns_set.transpose())
    return similarity_matrix
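# Note: the dot product above equals cosine similarity only when every row of
# both frames has unit L2 norm. A minimal normalization sketch (the helper name
# is hypothetical, not part of the original module):
import numpy as np
import pandas as pd

def _l2_normalize(df: pd.DataFrame) -> pd.DataFrame:
    # Divide each row by its Euclidean norm so that row dot products become cosines
    return df.div(np.sqrt((df ** 2).sum(axis=1)), axis=0)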
class GradeBook(object):
    """A class encapsulating a pandas DataFrame, meant to store the grades
    for a whole class. It provides the method compute_total_grades, which
    computes the total grade for each student according to weights provided
    by the caller.
    """

    def __init__(self, grade_arr, student_ids, item_list, max_scores):
        """
        Constructor of the GradeBook class. It sets the following attributes:

        (1) self.raw_grades, which is a DataFrame with
            - row labels given by student_ids
            - column labels given by item_list
            - values given by grade_arr
        (2) self.total_grades, set to None
        (3) self.letter_grades, set to None
        (4) self.max_scores, set to max_scores

        Parameters
        ----------
        grade_arr : numpy array of grades as returned by simulate_grades
        student_ids: a list of student ids
        item_list: a list of grade items (e.g. ['HW', 'M', 'F'])
        max_scores: a list of the maximum possible score for each grade item

        Returns
        -------
        nothing

        Examples
        --------
        >>> a = GradeBook(array([[1,2],[3,4]]),['22','34'],['F','M'],[30, 50])
        >>> a.letter_grades == None
        True
        >>> a.total_grades == None
        True
        >>> a.raw_grades.shape == (2,2)
        True
        >>> a.raw_grades.iloc[0,0] == 1
        True
        >>> a.max_scores[0] == 30
        True
        """
        self.raw_grades = DataFrame(grade_arr, index=student_ids, columns=item_list)
        self.total_grades = None
        self.letter_grades = None
        self.max_scores = max_scores

    def compute_total_grades(self, item_weights=None, max_score=100):
        """
        Compute student total class grades as a weighted average of the
        columns in self.raw_grades, according to the weights passed to
        item_weights for each of the columns.

        The student total class grades are then stored in the Series
        attribute self.total_grades.

        The return value is a Series containing a numerical summary (as
        returned by the Series method describe) of the total class grade
        distribution.

        Parameters
        ----------
        item_weights: list of floats summing up to one
            List of weights to be applied to each grade item (e.g. [0.3, 0.4, 0.3])
        max_score: float
            Maximal possible score for the total class grade

        Returns
        -------
        out : Series
            A Series containing a numerical summary of the total grade
            distribution previously stored by the function in the attribute
            self.total_grades; this Series is the output of the Series
            method describe.

        Examples
        --------
        >>> a = GradeBook(array([[5,5],[1,1]]),['22','34'],['F','M'],[10, 10])
        >>> b = a.compute_total_grades([0.5, 0.5], 100)
        >>> len(b) == 5
        False
        >>> a.total_grades['22'] == 50
        True
        >>> a.total_grades['34'] == 10
        True
        """
        weights = pd.Series(item_weights, index=self.raw_grades.columns)
        # Weighted raw total per student, rescaled by the weighted maximum
        # possible total so that a perfect score maps to max_score
        # (max_scores is per grade item, so it must be indexed by the columns)
        max_total = pd.Series(self.max_scores, index=self.raw_grades.columns).dot(weights)
        self.total_grades = self.raw_grades.dot(weights) * (max_score / max_total)
        return self.total_grades.describe()
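# Usage sketch based on the doctests above (assumes numpy's array and pandas
# are imported as in the surrounding module):
# book = GradeBook(array([[5, 5], [1, 1]]), ['22', '34'], ['F', 'M'], [10, 10])
# summary = book.compute_total_grades([0.5, 0.5], 100)
# book.total_grades   # 22 -> 50.0, 34 -> 10.0
# summary['mean']     # 30.0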
def test_dot(self):
    a = DataFrame(np.random.randn(3, 4), index=["a", "b", "c"],
                  columns=["p", "q", "r", "s"])
    b = DataFrame(np.random.randn(4, 2), index=["p", "q", "r", "s"],
                  columns=["one", "two"])

    result = a.dot(b)
    expected = DataFrame(np.dot(a.values, b.values),
                         index=["a", "b", "c"], columns=["one", "two"])

    # Check alignment
    b1 = b.reindex(index=reversed(b.index))
    result = a.dot(b1)
    tm.assert_frame_equal(result, expected)

    # Check series argument
    result = a.dot(b["one"])
    tm.assert_series_equal(result, expected["one"], check_names=False)
    assert result.name is None

    result = a.dot(b1["one"])
    tm.assert_series_equal(result, expected["one"], check_names=False)
    assert result.name is None

    # can pass correct-length arrays
    row = a.iloc[0].values

    result = a.dot(row)
    expected = a.dot(a.iloc[0])
    tm.assert_series_equal(result, expected)

    with pytest.raises(ValueError, match="Dot product shape mismatch"):
        a.dot(row[:-1])

    a = np.random.rand(1, 5)
    b = np.random.rand(5, 1)
    A = DataFrame(a)

    # TODO(wesm): unused
    B = DataFrame(b)  # noqa

    # it works
    result = A.dot(b)

    # unaligned
    df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4))
    df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3])

    with pytest.raises(ValueError, match="aligned"):
        df.dot(df2)
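# pandas also exposes the matrix-multiplication operator, which delegates to
# DataFrame.dot; a minimal sketch (not part of the test above) of the
# equivalence:
import numpy as np
from pandas import DataFrame

a = DataFrame(np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"])
b = DataFrame(np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"])
assert (a @ b).equals(a.dot(b))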
def plot_eff(exp_rets: pd.DataFrame,
             cov: pd.DataFrame,
             n_points: int = 25,
             risk_free_rate: float = .1,
             show_cml: bool = False,
             show_ew: bool = False,
             show_gmv: bool = False,
             style: str = '.-',
             size: tuple = (12, 6),
             is_return: bool = False,
             save: bool = False):
    """Plots the efficient frontier, based on the expected returns and the
    covariance matrix.

    Args:
        exp_rets (pd.DataFrame): expected returns of the assets.
        cov (pd.DataFrame): covariance matrix of the assets.
        n_points (int, optional): number of points shown on the frontier.
            Default: 25.
        risk_free_rate (float, optional): risk-free rate. Default: 0.1.
        show_cml (bool, optional): if True, plots the line connecting the
            risk-free asset to the maximum Sharpe ratio portfolio.
            Default: False.
        show_ew (bool, optional): if True, plots the equal-weight portfolio.
            Default: False.
        show_gmv (bool, optional): if True, plots the GMV portfolio.
            Default: False.
        style (str, optional): line style. Default: '.-'.
        size (tuple, optional): plot size. Default: (12, 6).
        is_return (bool, optional): if True, returns the plot instead of just
            displaying it. Default: False.
        save (bool, optional): if True, saves the plot to save_path under the
            name gen_portfolios.png. Default: False.

    Returns:
        if is_return = True, returns a matplotlib ax.
    """
    weights = optimal_weights(exp_rets, cov, n_points)
    rets = [(1 + exp_rets.dot(w))**.5 - 1 for w in weights]
    vols = [vol(w, cov, False) for w in weights]
    ef = pd.DataFrame({'Retornos': rets, 'Volatilidade': vols})
    ax = ef.plot.line(x='Volatilidade', y='Retornos', style=style,
                      figsize=size, legend=False)
    plt.ylabel('Retorno')

    if show_ew:
        n = exp_rets.shape[0]
        w_ew = np.repeat(1 / n, n)
        r_ew = (1 + exp_rets.dot(w_ew))**.5 - 1
        v_ew = vol(w_ew, cov, False)
        ax.plot([v_ew], [r_ew], color='goldenrod', marker='o',
                markersize=10, label='EW')

    if show_gmv:
        w_gmv = gmv(cov)
        r_gmv = (1 + exp_rets.dot(w_gmv))**.5 - 1
        v_gmv = vol(w_gmv, cov, False)
        ax.plot([v_gmv], [r_gmv], color='midnightblue', marker='o',
                markersize=10, label='GMV')

    if show_cml:
        ax.set_xlim(left=0)
        w_msr = maximize_sr(exp_rets, cov, risk_free_rate)
        r_msr = (1 + exp_rets.dot(w_msr))**.5 - 1
        v_msr = vol(w_msr, cov, False)
        # add capital market line
        cml_x = [0, v_msr]
        cml_y = [risk_free_rate, r_msr]
        ax.plot(cml_x, cml_y, color='green', marker='o', linestyle='dashed',
                markersize=10, linewidth=2, label='Cap. Market Line')

    plt.legend()
    if save:
        plt.savefig(save_path + 'gen_portfolios.png', dpi=200)
    return ax if is_return else plt.show()
def _measure_cos_sim(train_set: pd.DataFrame, test_set: pd.DataFrame):
    # Pairwise dot products between test rows and train rows; rows must be
    # L2-normalized for this to be a true cosine similarity (see the
    # normalization sketch above)
    similarity_matrix = test_set.dot(train_set.transpose())
    return similarity_matrix
class VSVM(object):
    data = []          # Raw sentences
    __labels__ = []    # Labels for sentences
    vect_data = []     # Vectorized sentences
    x_test = []        # FOR
    y_test = []        # TESTING
    x_test_text = []   # Raw test sentences

    # Loading of pre-trained model and some necessary things
    def __init__(self):
        with open('TFIDFVectModel.pkl', 'rb') as input_file:
            self.__vect = load(input_file)    # Vectorizer model
        with open('TFIDFMatrix.pkl', 'rb') as input_file:
            self.__tf_idf = load(input_file)  # TF-IDF matrix
        with open('Dictionary.pkl', 'rb') as input_file:
            self.__dict = load(input_file)    # Dictionary of all words in matrix

    def __repr__(self):
        return (f'VSvm,\n'
                f'{self.vect()},\n'
                f'{self.model()}.')

    # Returns the fitted vectorizer model
    def vect(self):
        return self.__vect

    # Returns the filled TF-IDF matrix
    def matr(self):
        return self.__tf_idf

    # Loads a fitted SVM model
    def upload_model(self, path='GermanSVMModel2-3.pkl'):
        try:
            with open(path, 'rb') as input_file:
                self.__model = load(input_file)
        except Exception as ex:
            raise ex

    # Returns the fitted SVM model (use upload_model before this)
    def model(self):
        try:
            return self.__model
        except Exception as ex:
            raise ex

    # Builds the TF-IDF matrix on given labeled data; also creates the
    # vectorizer model and dictionary.
    # Takes two lists of strings (one with sentences, another with their labels)
    # and a list of stop words suitable for the language of the sentences
    def transform(self, sentences_list, labels, stop_words):
        # Converts all labels to string type
        try:
            labels = [str(x) for x in labels]
        except TypeError as te:
            te.args = [
                'Type error exception occurred.\nPlease check your \'labels\' variable. It should be list type, not numeral'
            ]
            raise te
        # Refreshes class field with new labels
        self.__labels__ = labels
        # Creates a list of unique labels
        tags = [el for el, _ in groupby(sorted([tag for tag in labels]))]
        # Creates a dictionary and fills it with empty lists
        sentences_by_tag = dict()
        for tag in tags:
            sentences_by_tag[tag] = []
        try:
            if len(sentences_list) != len(labels):
                warnings.warn(
                    'For correct results of the algorithm the variables \'sentences_list\' and \'labels\' '
                    'should be the same length')
        except TypeError as te:
            te.args = [
                'Type error exception occurred.\nPlease check your \'sentences_list\' variable. It should be list type, not numeral'
            ]
            raise te
        # Fills the dictionary created above with all sentences for each label
        for i in zip(sentences_list, labels):
            sentences_by_tag[i[1]].append(i[0])
        # Variable for all sentences, grouped by their label
        documents = [' '.join(x) for x in sentences_by_tag.values()]
        # Removes everything except letters, tabs and spaces
        for i in range(len(documents)):
            documents[i] = re.sub(r'[^\w\s]+|[\d]+', "", documents[i], flags=re.UNICODE)
        # Variable for the TF-IDF vectorizer
        self.__vect = TfidfVectorizer(lowercase=True, ngram_range=(2, 3),
                                      stop_words=stop_words)
        # Creates and fills the TF-IDF matrix with all sentences (documents)
        try:
            matrix = self.__vect.fit_transform(documents)
        except ValueError as ve:
            ve.args = [
                'Value error exception occurred.\nPlease check your variables \'sentences_list\' and \'labels\'. '
                'They should be not empty.\nAlso they should be list type.\n'
                '\'sentences_list\' should contain something that contains not only numerals'
            ]
            raise ve
        except TypeError as te:
            te.args = [
                'Type error exception occurred.\nPlease make sure that your stop-words variable is a list'
            ]
            raise te
        except MemoryError as me:
            me.args = [
                'Memory error exception occurred.\nPlease check your pagefile size and increase it'
            ]
            raise me
        # Dictionary-like variable that contains the position of each word in the TF-IDF matrix
        positions = {}
        j = 0
        for word in self.__vect.get_feature_names():
            positions[word] = j
            j = j + 1
        # Refreshes class fields with new data
        self.__dict = positions
        self.__tf_idf = DataFrame(matrix.toarray(),
                                  columns=positions.keys(),
                                  index=sentences_by_tag.keys())

    # Fits an SVC model from sklearn using the existing TF-IDF matrix.
    # Takes two lists of strings (one (x) with sentences, another (y) with their labels)
    def fit(self, x, y):
        # Refreshes class field with new data
        self.data = x
        # Runs the function that transforms sentences to vectors with length
        # equal to the number of labels
        self.__create_vectors()
        # Variable for the SVM model
        svm = SVC(C=10, kernel='linear', probability=True)
        # Fits the model
        try:
            svm.fit(self.vect_data, y)
        except TypeError as te:
            te.args = [
                'Type error exception occurred.\n'
                'Probably your answers (y variable) contain objects of different types.\n'
                'Please check it out and if the suspicions are confirmed - lead objects to the same type.\n'
                'Also the error can occur if the \'y\' variable is empty'
            ]
            raise te
        # Refreshes class field with new data
        self.__model = svm

    # Builds the TF-IDF matrix on given labeled data, then fits an SVC model using it.
    # Takes two lists of strings (one with sentences, another with their labels)
    # and a list of stop words suitable for the language of the sentences
    def fit_transform(self, sentences_list, labels, stop_words):
        # Refreshes class field with new data and runs the transformation function
        try:
            self.__labels__ = labels
            # Runs the function that fits the TF-IDF vectorizer and creates
            # the TF-IDF matrix from input data
            self.transform(sentences_list, self.__labels__, stop_words)
        except Exception as ex:
            raise ex
        # Splits data into test and train parts
        try:
            x_train, x_test, y_train, y_test = train_test_split(
                sentences_list, self.__labels__, test_size=0.125, random_state=228)
        except ValueError as ve:
            ve.args = [
                f'\'sentences_list\' and \'labels\' should be the same size\n'
                f'Got {len(sentences_list)} and {len(self.__labels__)}'
            ]
            raise ve
        self.y_test = y_test
        self.data = x_test
        self.x_test_text = self.data.copy()
        self.x_test_text = self.x_test_text.reset_index(drop=True)
        # Runs the function that transforms sentences to vectors with length
        # equal to the number of labels
        try:
            self.__create_vectors()
        except Exception as ex:
            return logging.error(ex, exc_info=True)
        self.x_test = self.vect_data
        # Runs the function that fits the SVM model with vectorized data and their labels
        try:
            self.fit(x_train, y_train)
        except Exception as ex:
            raise ex

    # Labels given data (list of sentences) based on the TF-IDF matrix.
    # Takes one list of strings (with sentences)
    def predict(self, sentences_list):
        self.data = sentences_list
        self.__create_vectors()
        return (DataFrame(self.data, columns=["sentence"]).join(
            DataFrame(self.__model.predict(self.vect_data), columns=["prediction"])))

    # Gives the list of vectors of label probabilities
    # (each vector is a set of numbers from 0 to 1).
    # Takes one list of strings (with sentences)
    def predict_proba(self, sentences_list):
        self.data = sentences_list
        self.__create_vectors()
        return self.__model.predict_proba(self.vect_data)

    # Labels given data (list of sentences) based on the probability list
    # (use the same list of sentences as in the prediction of the probabilities).
    # Takes a list of strings (with sentences), a list of vectors (with label
    # probabilities) and, optionally, the probability threshold and the name
    # of the trash label (tag)
    def interpretate_proba(self, sentences_list, proba, threshold=0.0969, trash_tag='000'):
        # Creates a list of interpreted labels
        interp = []
        # Fills the list with the labels with the maximum probability passing the threshold
        try:
            i = 0
            for i in range(len(proba)):
                maximum = max(proba[i])
                if (maximum > threshold):
                    interp.append(self.__model.classes_[list(proba[i]).index(maximum)])
                else:
                    interp.append(trash_tag)
        except LookupError as le:
            le.args = [
                f'Vectors in \'proba\' and model classes should be the same size\n'
                f'Got {len(proba[i])} and {len(self.__model.classes_)}'
            ]
            raise le
        interpretated = DataFrame(sentences_list)
        interpretated.columns = ['sentence']
        return interpretated.join(DataFrame(interp, columns=['label']))

    # Prints information about the fitted model, such as accuracy, precision,
    # recall and information about the multiclass classification
    def class_rep(self):
        print(classification_report(self.y_test,
                                    self.model().predict(self.x_test),
                                    target_names=self.__labels__))

    # Draws graphics that visualize the correlation between human-labeled and
    # machine-labeled texts.
    # Takes a pandas DataFrame that contains corpuses in the Manifesto standard
    # view (with columns: document name (contains date, country, party) and code)
    def visualize_pearson(self, data):
        # Splits data into test and train parts (train part dropped)
        _, x_test, _, y_test = train_test_split(data['doc_name'], data['code'],
                                                test_size=0.125, random_state=228)
        y_test.reset_index(drop=True, inplace=True)
        # Prepares data as annotated by human or computer
        hum_annot = DataFrame(y_test)
        hum_annot['doc_name'] = x_test.reset_index(drop=True)
        hum_annot.columns = ['code', 'doc_name']
        comp_annot = DataFrame(self.model().predict(self.x_test))
        comp_annot['doc_name'] = x_test.reset_index(drop=True)
        comp_annot.columns = ['code', 'doc_name']
        # Draws the Pearson correlation
        self.__pears(data[['doc_name', 'code']], comp_annot, hum_annot)

    # Downloads some manifesto texts in a specific format: csv file, header:
    # manifesto_name, content, label.
    # You can find them in C:\Users\User\ManifestoDetails\
    # Takes a dictionary with some of these params:
    # params = {
    #     'language': 'german',
    #     'election_date': '2017'
    # }
    def get_manifesto_texts(self, params):
        # Checks current versions of cores and loads them from their folder
        cores = self.__get_current_cores()
        # Checks the existence of the meta data file
        try:
            open(r'ManifestoDetails\meta.csv')
        except Exception as ex:
            ex.args = [
                'No meta file found! Please use function get_manifesto_meta or check the ManifestoDetails folder'
            ]
            raise ex
        # If cores and meta were found, downloads texts from the Manifesto
        # database to the ManifestoDetails folder
        if cores != []:
            ManifestoAPI.get_texts(params, r'ManifestoDetails\meta.csv',
                                   r'ManifestoDetails\annot.csv',
                                   r'ManifestoDetails\not_annot.csv')
            print('Texts were successfully downloaded. You can find them in your '
                  'user folder, ManifestoDetails, annot.csv and not_annot.csv')
        else:
            print('No cores found! Please download some of them using the method \'get_manifesto_cores\'')

    # Downloads some manifesto meta data that is used to download texts.
    # You can find it in C:\Users\User\ManifestoDetails\meta.csv
    # Takes a dictionary with some of these params:
    # params = {
    #     "countryname": "Germany"
    # }
    def get_manifesto_meta(self, params):
        # Checks current versions of cores and loads them from their folder
        cores = self.__get_current_cores()
        # If cores were found, downloads meta data from the Manifesto database
        # to the ManifestoDetails folder
        if cores != []:
            ManifestoAPI.get_manifesto_metadata(params, cores, r'ManifestoDetails\meta.csv')
            print('Meta data was successfully downloaded. '
                  'You can find it in your user folder, ManifestoDetails, meta.csv')
        else:
            print('No cores found! Please download some of them using the method \'get_manifesto_cores\'')

    # Downloads some manifesto cores of specific versions.
    # Example of versions: ['MPDS2018b', 'MPDSSA2018b']
    # You can get the full list of current versions by using "get_core_versions"
    def get_manifesto_cores(self, versions):
        try:
            for i in versions:
                ManifestoAPI.api_get_core(i, kind='xlsx')
        except Exception as ex:
            ex.args = [
                'Please make sure that you gave correct versions of manifesto cores. To make sure, use the \'get_core_versions\' method'
            ]
            raise ex

    # Prints all current manifesto core versions
    def get_core_versions(self):
        print(ManifestoAPI.api_list_core_versions())

    # Checks the folder of cores and returns all that were found
    def __get_current_cores(self):
        # Directory for Manifesto cores
        pdir = r'ManifestoDetails\cores'
        # List for all files in the folder
        contdir = []
        # "Walking" through the content of the directory and saving all file names
        for i in walk(pdir):
            contdir.append(i)
        # Grabbing useful information (core versions) from the walking result
        cores = []
        for i in contdir[0][2]:
            cores.append(re.split('\.', i)[0])
        return cores

    # Function that draws a Pearson correlation chart for computer- and
    # human-annotated texts.
    # Takes a pandas DataFrame that contains corpuses in the Manifesto standard
    # view (with columns that contain such info as document name and code) and
    # two other pandas DataFrames that contain pairs of document and label
    # (code): one with the algorithm's predictions and the other with human labels
    def __pears(self, data, comp_annot, hum_annot):
        # Creates and fills lists with label frequency per document
        comp = []
        hum = []
        for i in data['doc_name'].unique():
            for j in data['code'].unique():
                comp.append(comp_annot[(comp_annot.doc_name == i)
                                       & (comp_annot.code == j)].shape[0])
                hum.append(hum_annot[(hum_annot.doc_name == i)
                                     & (hum_annot.code == j)].shape[0])
        # Draws the Pearson correlation
        pear = sns.jointplot(x=array(comp), y=array(hum), kind='reg')
        pear = pear.set_axis_labels("computer-annotated", "human-annotated")
        pear = pear.annotate(stats.pearsonr)
        print(pear)

    # Changes the vector's length to 1.
    # Takes a list with n floats, where n is the number of labels in your data
    def __normalize(self, vector):
        normed = linalg.norm(vector)
        if normed == 0:
            return vector
        return vector / normed

    # Converts a sentence to an array of float numbers using the TF-IDF matrix.
    # Takes one sentence, which must be string type.
    # Also you need to run the transform method before using this one
    def __to_vector(self, sentence):
        # Variable for computing the likelihood of the sentence's entry into each label
        vector = zeros(len(self.__dict))
        # Checks the type of the input data
        if isinstance(sentence, str):
            # Splits the sentence into words
            splitted_sentence = simple_preprocess(str(sentence), deacc=True)
            # Removes empty spaces
            splitted_sentence = list(filter(None, splitted_sentence))
            # If the sentence has two or more words, tries to find each two-word
            # phrase in the TF-IDF vocabulary and increments the appropriate position
            if len(splitted_sentence) > 1:
                for c in range(len(splitted_sentence) - 1):
                    pair = splitted_sentence[c] + ' ' + splitted_sentence[c + 1]
                    try:
                        position = self.__dict[pair]
                        vector[position] += 1
                    except KeyError:
                        continue
            # If the sentence has three or more words, tries to find each
            # three-word phrase in the TF-IDF vocabulary and increments the
            # appropriate position
            if len(splitted_sentence) > 2:
                for c in range(len(splitted_sentence) - 2):
                    tripl = (splitted_sentence[c] + ' ' + splitted_sentence[c + 1]
                             + ' ' + splitted_sentence[c + 2])
                    try:
                        position = self.__dict[tripl]
                        vector[position] += 1
                    except KeyError:
                        continue
        return self.__normalize(array(self.__tf_idf.dot(vector)))

    # Runs __to_vector for a list of sentences (which is supposed to be in self.data).
    # Takes nothing, but you need to put some data into an instance of this class
    # or just run transform or fit_transform with correct args before using this method
    def __create_vectors(self):
        # Variable for vectorized sentences
        vectors = []
        # Variable for the active progress bar
        j = 1
        # Variable for the amount of sentences
        x = len(self.data)
        # Construction for the active progress bar
        print("Vectorization in progress:")
        # Vectorizing each sentence
        for i in self.data:
            vectors.append(self.__to_vector(i))
            line = str(j) + '/' + str(x)
            print(line, end="\r")
            j += 1
        # Refreshes class field with new data
        self.vect_data = vectors
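# Usage sketch (purely illustrative; the pickled files named in the class
# defaults must exist, and the sentence/label/stop-word lists are assumptions):
# clf = VSVM()                                   # loads vectorizer, TF-IDF matrix, dictionary
# clf.fit_transform(sentences, labels, german_stop_words)
# clf.class_rep()                                # quality report on the held-out split
# proba = clf.predict_proba(['Ein neuer Satz'])
# labeled = clf.interpretate_proba(['Ein neuer Satz'], proba)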
def epochs_sim_agg_returns_pair_data(dataframe: pd.DataFrame,
                                     normalized: bool = False) -> List[float]:
    """Uses local normalization to compute the aggregated distribution of
    returns for a pair of simulated stocks.

    :param dataframe: dataframe with the simulated returns.
    :type dataframe: pd.DataFrame
    :param normalized: normalize the returns within the epochs, defaults to False
    :type normalized: bool, optional
    :return: simulated rotated returns.
    :rtype: List[float]
    """

    if normalized:
        dataframe = (dataframe - dataframe.mean()) / dataframe.std()

    cov_two_col: pd.DataFrame = dataframe.cov()
    # eig_vec: eigenvectors, eig_val: eigenvalues
    eig_val_corr: np.ndarray
    eig_vec_corr: np.ndarray
    eig_val_corr, eig_vec_corr = np.linalg.eigh(cov_two_col)

    # rot: rotation, scale: scaling
    rot: np.ndarray
    scale: np.ndarray
    rot, scale = eig_vec_corr, np.diag(1 / np.sqrt(eig_val_corr))

    # trans: transformation matrix
    # trans = rot . scale
    trans: np.ndarray = rot.dot(scale)

    try:
        # Transform the returns
        trans_col: pd.DataFrame = dataframe.dot(trans)
        # Length DataFrame
        col_length: int = len(trans_col.columns)
        # Name the columns with the used stocks
        trans_col.columns = [f"Stock_{i}" for i in range(col_length)]

        one_col: List[Any] = []
        for idx in range(col_length):
            one_col.append(trans_col[f"Stock_{idx}"])

        agg_ret_mkt_series: pd.Series = pd.concat(one_col, ignore_index=True)

        del one_col
        del trans_col

    except np.linalg.LinAlgError as error:
        print(error)
        print()
        # Fall back to an empty series so the filtering below still works
        agg_ret_mkt_series = pd.Series(dtype=float)

    # remove NaN and Inf
    agg_ret_mkt_list: List[float] = [
        x for x in agg_ret_mkt_series if not math.isnan(x) and not math.isinf(x)
    ]
    # filter out values greater than 10 or smaller than -10
    agg_ret_mkt_list = [x for x in agg_ret_mkt_list if -10 <= x <= 10]

    return agg_ret_mkt_list
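# A minimal check (not in the original module) that the rotation/scaling above
# whitens the pair: after applying trans = rot . scale, the sample covariance
# of the transformed columns is approximately the identity.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
pair = pd.DataFrame(rng.multivariate_normal([0, 0], [[1, .8], [.8, 1]], size=10_000))
eig_val, eig_vec = np.linalg.eigh(pair.cov())
whitened = pair.dot(eig_vec.dot(np.diag(1 / np.sqrt(eig_val))))
assert np.allclose(whitened.cov(), np.eye(2), atol=0.1)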
def plot_eff(exp_rets: pd.DataFrame,
             cov: pd.DataFrame,
             n_points: int,
             risk_free_rate: float = .1,
             show_cml: bool = False,
             show_ew: bool = False,
             show_gmv: bool = False,
             style: str = '.-',
             size: tuple = (12, 6),
             is_return: bool = False,
             save: bool = False):
    """
    Plots the N-asset efficient frontier.
    """
    weights = optimal_weights(exp_rets, cov, n_points)
    rets = [(1 + exp_rets.dot(w)) ** .5 - 1 for w in weights]
    vols = [vol(w, cov, True) for w in weights]
    ef = pd.DataFrame({'Retornos': rets, 'Volatilidade': vols})
    ax = ef.plot.line(x='Volatilidade', y='Retornos', style=style, figsize=size)

    if show_ew:
        n = exp_rets.shape[0]
        w_ew = np.repeat(1 / n, n)
        r_ew = (1 + exp_rets.dot(w_ew)) ** .5 - 1
        v_ew = vol(w_ew, cov, True)
        ax.plot([v_ew], [r_ew], color='goldenrod', marker='o', markersize=10)

    if show_gmv:
        w_gmv = gmv(cov)
        r_gmv = (1 + exp_rets.dot(w_gmv)) ** .5 - 1
        v_gmv = vol(w_gmv, cov, True)
        ax.plot([v_gmv], [r_gmv], color='midnightblue', marker='o', markersize=10)

    if show_cml:
        ax.set_xlim(left=0)
        w_msr = maximize_sr(exp_rets, cov, risk_free_rate)
        r_msr = (1 + exp_rets.dot(w_msr)) ** .5 - 1
        v_msr = vol(w_msr, cov, True)
        # add capital market line
        cml_x = [0, v_msr]
        cml_y = [risk_free_rate, r_msr]
        ax.plot(cml_x, cml_y, color='green', marker='o', linestyle='dashed',
                markersize=10, linewidth=2)

    if save:
        plt.savefig(save_path + 'gen_portfolios_hd.png', dpi=200)

    if not is_return:
        plt.show()
    else:
        return ax
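# Usage sketch (optimal_weights, gmv, maximize_sr, vol and save_path come from
# the surrounding module; the returns frame here is hypothetical):
# exp_rets = returns.mean()
# cov = returns.cov()
# ax = plot_eff(exp_rets, cov, n_points=50, show_cml=True, show_gmv=True, is_return=True)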