class DataFrameStringIndexing(object):
    """ASV benchmarks: scalar and boolean indexing on a string-labelled frame."""

    goal_time = 0.2

    def setup(self):
        # 1000 x 30 frame of normals with random string row/column labels.
        row_labels = tm.makeStringIndex(1000)
        col_labels = tm.makeStringIndex(30)
        self.df = DataFrame(
            np.random.randn(1000, 30), index=row_labels, columns=col_labels
        )
        self.idx_scalar = row_labels[100]
        self.col_scalar = col_labels[10]
        self.bool_indexer = self.df[self.col_scalar] > 0
        self.bool_obj_indexer = self.bool_indexer.astype(object)

    def time_get_value(self):
        # get_value emits a deprecation warning; suppress it while timing.
        with warnings.catch_warnings(record=True):
            self.df.get_value(self.idx_scalar, self.col_scalar)

    def time_ix(self):
        self.df.ix[self.idx_scalar, self.col_scalar]

    def time_loc(self):
        self.df.loc[self.idx_scalar, self.col_scalar]

    def time_getitem_scalar(self):
        self.df[self.col_scalar][self.idx_scalar]

    def time_boolean_rows(self):
        self.df[self.bool_indexer]

    def time_boolean_rows_object(self):
        self.df[self.bool_obj_indexer]
def calculate_deltas(snps: pd.DataFrame) -> list:
    """Call a genotype for every SNP row and append the results to the
    module-level ``delta_rsIDs`` list.

    Parameters
    ----------
    snps : pd.DataFrame
        One row per rsID.  The first six columns are numeric intensities,
        ordered (11, 12, 22) for two sources; a 'chr' column names the
        chromosome.

    Returns
    -------
    list
        The global ``delta_rsIDs`` list, extended with one single-column
        DataFrame per SNP holding [rsID, genotype, delta, chr] as strings.
    """
    global delta_rsIDs
    # Be robust to the module-level list not having been created yet.
    try:
        delta_rsIDs
    except NameError:
        delta_rsIDs = []

    # .at replaces DataFrame.get_value, which was removed in pandas 1.0.
    print("Analyzing chromosome {}".format(snps.at[snps.index.values[0], "chr"]))
    for idx, row in snps.iterrows():
        # Per-candidate difference between the two sources; positional
        # access via .iloc (bare row[0] is deprecated on labelled Series).
        deltas = [row.iloc[0] - row.iloc[3],
                  row.iloc[1] - row.iloc[4],
                  row.iloc[2] - row.iloc[5]]
        max_delta = np.argmax(deltas)
        if max_delta == 0:
            genotype = "11"
        elif max_delta == 1:
            genotype = "12"
        else:
            genotype = "22"
        # np.array of mixed values coerces everything to strings.
        temp_array = np.array([idx, genotype, deltas[max_delta], row["chr"]])
        print(temp_array)
        delta_rsIDs.append(pd.DataFrame(data=temp_array))
    print("Finished analyzing {}".format(snps.at[snps.index.values[0], "chr"]))
    # Original printed delta_rsIDs.columns here, which crashed: the
    # accumulator is a list, not a DataFrame.
    return delta_rsIDs
class DataFrameStringIndexing:
    """Benchmark suite for indexing a DataFrame by string labels."""

    def setup(self):
        # Build the fixture frame: 1000 rows x 30 columns, string labels.
        idx = tm.makeStringIndex(1000)
        cols = tm.makeStringIndex(30)
        values = np.random.randn(1000, 30)
        self.df = DataFrame(values, index=idx, columns=cols)
        # Scalars and boolean masks reused by the timing methods below.
        self.idx_scalar = idx[100]
        self.col_scalar = cols[10]
        self.bool_indexer = self.df[self.col_scalar] > 0
        self.bool_obj_indexer = self.bool_indexer.astype(object)

    def time_get_value(self):
        # Deprecated API; record=True keeps the warning out of the output.
        with warnings.catch_warnings(record=True):
            self.df.get_value(self.idx_scalar, self.col_scalar)

    def time_ix(self):
        self.df.ix[self.idx_scalar, self.col_scalar]

    def time_loc(self):
        self.df.loc[self.idx_scalar, self.col_scalar]

    def time_getitem_scalar(self):
        self.df[self.col_scalar][self.idx_scalar]

    def time_boolean_rows(self):
        self.df[self.bool_indexer]

    def time_boolean_rows_object(self):
        self.df[self.bool_obj_indexer]
def compute_tf_idf_queries(self):
    """Score every document against ``self.queries`` with TF-IDF, then
    aggregate the document scores per (entity type, entity).

    Returns
    -------
    DataFrame
        Columns ['e_type', 'entity', 'tf_idf'], summed per entity.
    """
    # Total number of documents: sqlite_sequence holds the last rowid.
    results = self.cursor.execute(
        "SELECT seq FROM sqlite_sequence WHERE name='{}'".format('documents'))
    total_doc = results.fetchone()[0]

    results = self.cursor.execute(
        'SELECT did, total_word, path FROM documents')
    documents_df = DataFrame(results.fetchall(),
                             columns=['did', 'total_word', 'path'])
    documents_df['tf_idf'] = 0.0

    # Read each document exactly once and cache the text (the original
    # re-read every file for every query).
    texts = {}
    for index, row in documents_df.iterrows():
        with codecs.open(row['path'], 'rt') as f:
            text = f.read()
        # Decode only when needed: on Python 3 read() already returns str,
        # so the original unconditional text.decode('utf-8') crashed.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        texts[index] = text

    # Document frequency of each query term.
    no_docterm = dict.fromkeys(self.queries, 0)
    for query in self.queries:
        for index in texts:
            if query in texts[index].lower():
                no_docterm[query] += 1

    # Accumulate every query's TF-IDF into each document's score.
    for query in self.queries:
        for index, row in documents_df.iterrows():
            tf_idf = self._compute_tf_idf_queries(
                texts[index], row['total_word'], total_doc,
                no_docterm[query])
            # .at replaces the removed get_value/set_value API.
            documents_df.at[index, 'tf_idf'] += tf_idf

    # Propagate document scores onto their entities.
    results = self.cursor.execute('SELECT did, type, entity FROM entities')
    df = DataFrame(results.fetchall(), columns=['did', 'e_type', 'entity'])
    df['tf_idf'] = 0.0
    for index, row in df.iterrows():
        did = row['did']
        df.at[index, 'tf_idf'] = \
            documents_df[documents_df['did'] == did]['tf_idf'].values[0]
    del df['did']
    # Sum scores of duplicate (e_type, entity) pairs.
    return df.groupby(['e_type', 'entity']).sum().reset_index()
def set_targets_from_file(self, filename = None, year = None):
    '''
    Load actualisation targets from an HDF5 store and display them in the
    frame.

    Parameters
    ----------
    filename : str, optional
        Path to the HDF5 store; defaults to "actualisation_groups.h5"
        inside the configured data directory.
    year : str, optional
        Year whose amounts are loaded; defaults to the simulation year
        taken from the global CONF.
    '''
    if year is None:
        year = str(CONF.get('simulation','datesim').year)
    if filename is None:
        fname = "actualisation_groups.h5"
        data_dir = CONF.get('paths', 'data_dir')
        filename = os.path.join(data_dir, fname)
    store = HDFStore(filename)
    # Builds openfisca variables from irpp declaration variables
    df_c = store["corresp"]
    of_vars = dict()
    for col in df_c.columns:
        # NOTE(review): `unique()` normally returns an ndarray, which has
        # no .dropna(); presumably a Series-returning helper is in scope —
        # confirm.
        of_vars[col] = list(unique(df_c[col]).dropna())
    df_a = store['amounts']
    df_b = store['benef']  # read but not used below — kept as-is
    store.close()
    df_a1 = DataFrame( {'amount' : df_a[year]})
    df_a = DataFrame( columns = ['amount'] )
    # Aggregate declaration-level amounts into one row per openfisca var.
    for of_var, declar_vars_list in of_vars.iteritems():
        amount = 0
        for case in declar_vars_list:
            a = df_a1.get_value(case, 'amount')
            # NOTE(review): identity comparison with NaN only works when
            # `a` is the very same nan object; an isnan()-style check is
            # probably intended — confirm.
            if a is not NaN:
                amount += a
        # Remove the declaration rows that were just aggregated.
        df_a1.drop(declar_vars_list, axis = 0, inplace=True)
        row = DataFrame(dict(amount = [amount]), index = [of_var] )
        df_a = df_a.append(row)
    # Keep any declaration variables that were not mapped to an of_var.
    df_a = df_a.append(df_a1)
    self.vars_df = df_a
    self.vars_df.index.names = ['var']
    self.fill_vars()
    self.fill_coeffs()
def _get_markup(self, para, markup_dict=None):
    """Split *para* into sections according to inline markup tokens.

    Parameters
    ----------
    para : str
        Paragraph text to scan.
    markup_dict : dict, optional
        Mapping of markup name -> (enter-token, exit-token); falls back
        to self._MARKUPS when not given.

    Returns
    -------
    list
        [text, open_markups] pairs, where open_markups lists the markup
        names active for that stretch of text.

    Raises
    ------
    ValueError
        If some markup is still open when the paragraph ends.
    """
    if not markup_dict:
        markup_dict = self._MARKUPS
    # One row per markup; 'In' tracks whether we are currently inside it.
    df = DataFrame(markup_dict, index=['Enter', 'Exit']).T
    df['In']=False
    sects=[]
    place=0
    while place > -1:
        place = -1
        markup = None
        estr = None
        # Earliest enter-token among markups we are NOT inside yet.
        for mark, enter in df[df.In==False].Enter.iterkv():
            find = para.find(enter)
            if find > -1 and (find<=place or place==-1):
                # On a positional tie, prefer the longer token.
                if find == place and len(enter) < len(estr):
                    continue
                place = find
                markup = mark
                estr = enter
        # Earliest exit-token among markups we ARE inside.
        for mark, exit in df[df.In==True].Exit.iterkv():
            find = para.find(exit)
            if find > -1 and (find<=place or place==-1):
                if find == place and len(exit) < len(estr):
                    continue
                place = find
                markup = mark
                estr = exit
        if place > -1:
            # Emit the text before the token together with the markups
            # currently open, toggle the matched markup, continue after it.
            sects.append([para[:place], df[df.In==True].index.tolist()])
            df.set_value(markup, 'In', not df.get_value(markup, 'In'))
            para = para[place+len(estr):]
    if df.In.any():
        raise ValueError(
            'the markup does not exit from;\n{}'.format(df[df.In==True]))
    # Trailing text after the last token carries no open markup.
    sects.append([para, []])
    return sects
#%% d['a'] # Series,列 #%% d[['a','c']] # DataFrame,列 #%% d[:5] # DataFrame,行 #%% d.ix[:5] # position-based,行 #%% d1.ix[:5] # label-based,行 #%% d.irow(0) # Series #%% d.icol(0) # Series #%% d.get_value('e','a') # get_value(row_name,col_name) #%% 强制使用位置来访问元素的方法 d.iget_value(0,1) # iget_value(irow,icol) #%% 使用条件过滤 d[d>5] #%% d[d.a>5] #%% d[(d>5)&(d%3==0)] #%% 使用条件过滤的本质 d>5 # DataFrame #%% d.a>5 # Series #%% 可以自己构造一个Series
class BuildLda:
    """Fit an LDA topic model over job-description bodies and expose
    nearest-neighbour lookup of similar jobs.

    NOTE(review): relies on names imported elsewhere in this file
    (Dictionary, JobDescription, DataFrame, CountVectorizer,
    LatentDirichletAllocation, NearestNeighbors).
    """

    def __init__(self, print_list=True):
        # Create dictionary
        self.dictionary = Dictionary()
        # Placeholder topic names; overwritten by create_word_topics().
        self.topics = ['Topic {}'.format(i) for i in range(1,31)]
        self.print_list = print_list  # verbose progress printing

    def build_object(self):
        """Run the full pipeline: fit LDA, project corpus, fit kNN index."""
        self.build_model()
        self.transform_set()
        self.build_nearest_neighbours()

    def build_model(self):
        """Fit the LDA model on deduplicated job-description bodies."""
        if self.print_list:
            print('Building LDA')
        strings = JobDescription.objects.values('url', 'body')
        data_samples = []
        seen_strings = set()
        # Deduplicate on the raw body text before cleaning.
        for string in strings:
            if string['body'] not in seen_strings:
                seen_strings.add(string['body'])
                data_samples.append(
                    {'url': string['url'],
                     'string': self.dictionary.clean_string(string['body'])})
        self.data_samples = DataFrame(data_samples)
        n_features = 10000
        n_topics = 15
        n_top_words = 10
        max_iter = 40
        self.tf_vectorizer = CountVectorizer(max_features=n_features,
                                             stop_words='english')
        tf = self.tf_vectorizer.fit_transform(self.data_samples['string'])
        # NOTE(review): `n_topics` was renamed `n_components` in newer
        # scikit-learn — confirm the installed version accepts it.
        self.lda = LatentDirichletAllocation(n_topics=n_topics,
                                             max_iter=max_iter,
                                             learning_method='online')
        self.lda.fit(tf)
        if self.print_list:
            print()
            print("\nTopics in LDA model:")
        tf_feature_names = self.tf_vectorizer.get_feature_names()
        self.create_word_topics(self.lda, tf_feature_names)
        if self.print_list:
            self.print_top_words(self.lda, tf_feature_names, n_top_words)

    def test_single_doc(self, string):
        """Return topic scores and ranked topic tags for one document."""
        data_samples = DataFrame(
            [{'string': self.dictionary.clean_string(string)}])
        test = self.tf_vectorizer.transform(data_samples['string'])
        lda_result = self.lda.transform(test)
        top_tags = []
        return_value = {'lda_result': lda_result, 'tags': []}
        # Topic indices ordered by decreasing score.
        index_set = sorted(range(len(lda_result[0])),
                           key=lambda i: lda_result[0][i], reverse=True)
        position = 0
        for index in index_set:
            return_value['tags'].append({'tag': self.topics[index],
                                         'position': position,
                                         'score': lda_result[0][index]})
            top_tags.append(self.topics[index])
            position += 1
        return return_value

    def transform_set(self):
        """Project the whole corpus into LDA topic space."""
        if self.print_list:
            print('Getting LDA Transformation')
        vectorizor_data = \
            self.tf_vectorizer.transform(self.data_samples['string'])
        self.results = self.lda.transform(vectorizor_data)

    def build_nearest_neighbours(self):
        """Fit a 10-nearest-neighbour index over the topic-space corpus."""
        if self.print_list:
            print('Build Nearest Neighbours')
        self.nbrs = NearestNeighbors(n_neighbors=10,
                                     algorithm='ball_tree').fit(self.results)

    def get_neighbours(self, string, print=False):
        """Return topic tags plus the 10 most similar jobs for *string*.

        NOTE: the `print` parameter shadows the builtin inside this method.
        """
        return_result = self.test_single_doc(string)
        return_result['distances'], return_result['indices'] = \
            self.nbrs.kneighbors(return_result['lda_result'])
        if print:
            self.print_neighbours(return_result['indices'][0])
        return_result['neighbours'] = self.return_neighbours(
            return_result['indices'][0], return_result['distances'][0])
        return {'tags': return_result['tags'],
                'neighbours': return_result['neighbours']}

    def print_neighbours(self, indices):
        """Print the URLs of the given neighbour row indices."""
        print('Closest 10 jobs:')
        for indice in indices:
            # NOTE(review): DataFrame.get_value was removed in pandas 1.0;
            # .at[indice, 'url'] is the modern equivalent.
            url = self.data_samples.get_value(indice, 'url')
            print('http://www.seek.com.au%s' % url)

    def return_neighbours(self, indices, distances):
        """Return [{'url', 'distance'}, ...] for the given neighbours."""
        return_value = []
        for index in range(len(indices)):
            url = self.data_samples.get_value(indices[index], 'url')
            return_value.append(
                {'url': 'http://www.seek.com.au{}'.format(url),
                 'distance': distances[index]})
        return return_value

    def print_top_words(self, model, feature_names, n_top_words):
        """Print the strongest n_top_words terms of every topic."""
        for topic_idx, topic in enumerate(model.components_):
            print(self.topics[topic_idx]+": "+" ".join(
                [feature_names[i]
                 for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()

    def create_word_topics(self, model, feature_names):
        """Rename each topic to its three strongest terms joined by '_'."""
        for topic_idx, topic in enumerate(model.components_):
            self.topics[topic_idx] = "_".join(
                [feature_names[i] for i in topic.argsort()[:-3 - 1:-1]])
class Reuters():
    """Loader for the Reuters-21578 corpus: parses the SGML files, keeps
    per-topic statistics and a DataFrame of articles with TF-IDF vectors.

    NOTE(review): relies on names imported elsewhere in this file
    (DataFrame, re, BeautifulSoup, saxutils, np, TFIDFClassifier).
    """

    def __init__(self, data_root='./reuters21578/',
                 topic_file_name='all-topics-strings.lc.txt'):
        self._data_root = data_root
        self._topic_file_name = topic_file_name
        self._number_of_files = 22       # reut2-000.sgm .. reut2-021.sgm
        self._file_name_prefix = 'reut2-'
        self._create_stat_template()
        # One row per (article, topic) pair; TFIDF filled by add_tfidf().
        self._data = DataFrame(columns=['Id', 'Topic', 'Set', 'Body',
                                        'TFIDF'])

    def _create_stat_template(self):
        """Build the zeroed (Topic, Set, Count) stats table."""
        topics_stats = []
        with open(self._data_root + self._topic_file_name,
                  'r') as topic_file:
            for topic in topic_file.readlines():
                topic = topic.strip()
                # Four counters per topic, one per article set.
                topics_stats.append([topic, 'TRAIN', 0])
                topics_stats.append([topic, 'TEST', 0])
                topics_stats.append([topic, 'NOT-USED', 0])
                topics_stats.append([topic, 'USABLE', 0])
        self._article_stats = DataFrame(data=topics_stats,
                                        columns=['Topic', 'Set', 'Count'])

    def _remove_tags(self, text: str):
        # Strip anything that looks like an SGML/HTML tag.
        return re.sub('<[^<]+?>', '', text).strip()

    def _update_stats_field(self, topic: str, set_class: str):
        """Increment the counter of one (topic, set) row."""
        # NOTE(review): chained boolean indexing; a combined mask with &
        # would be the idiomatic (and warning-free) form.
        idx = self._article_stats[self._article_stats.Topic == topic][
            self._article_stats.Set == set_class].index[0]
        # NOTE(review): get_value/set_value were removed in pandas 1.0;
        # .at[idx, 'Count'] is the modern replacement.
        f = self._article_stats.get_value(idx, 'Count')
        self._article_stats.set_value(idx, 'Count', f + 1)

    def _update_stats(self, topic: str, set_class: str):
        # TRAIN/TEST articles also count toward the USABLE bucket.
        self._update_stats_field(topic, set_class)
        if set_class in ['TEST', 'TRAIN']:
            self._update_stats_field(topic, 'USABLE')

    def _unescape(self, text: str):
        # Undo SGML entity escaping (&amp; etc.).
        return saxutils.unescape(text)

    def _newslines(self):
        """Yield every <reuters> element from all corpus files."""
        # for i in range(1):
        for i in range(self._number_of_files):
            file_id = '00' + str(i) if i < 10 else '0' + str(i)
            print("processing file {}".format(file_id))
            with open(
                    self._data_root + self._file_name_prefix + file_id +
                    '.sgm', 'r') as file:
                content = BeautifulSoup(file.read().lower())
                for newsline in content('reuters'):
                    yield newsline

    def _matrix_to_list(self, data: list):
        # Densify each sparse row vector into a flat ndarray.
        return [np.squeeze(np.asarray(item.todense())) for item in data]

    def get_news_stats(self, mode='offline') -> DataFrame:
        """
        :param mode: if offline, loads stats from disk
            returns stats of number of available news for each set
        :return datafram of stats with ['Topic', 'Set', 'Count'] as Columns
        """
        if mode == 'offline':
            # NOTE(review): DataFrame.from_csv was removed in pandas 1.0;
            # pd.read_csv(..., sep='\t', index_col=0) is the replacement.
            self._article_stats = DataFrame.from_csv('news_stats.csv',
                                                     sep='\t')
            return self._article_stats
        for newsline in self._newslines():
            set_class = newsline.attrs['lewissplit'].upper()
            topics = newsline.topics.contents
            for topic in topics:
                topic_cleaned = self._remove_tags(str(topic)).strip()
                self._update_stats(topic=topic_cleaned,
                                   set_class=set_class)
        return self._article_stats

    def load_data(self):
        """
        Loads all the data from txt files to dataframe
        :return:
        """
        for newsline in self._newslines():
            document_id = newsline['newid']
            set_class = newsline.attrs['lewissplit'].upper()
            # Only TRAIN/TEST articles are kept.
            if set_class not in ['TRAIN', 'TEST']:
                continue
            # News text
            document_body = self._remove_tags(str(
                newsline('text')[0].text)).replace('reuter\n',
                                                   '').replace('\t', ' ')
            document_body = self._unescape(document_body.strip())
            # News topics — one dataframe row per (article, topic) pair.
            topics = newsline.topics.contents
            for topic in topics:
                topic_cleaned = self._remove_tags(str(topic)).strip()
                self._data = self._data.append(
                    {
                        'Id': document_id,
                        'Topic': topic_cleaned,
                        'Set': set_class,
                        'Body': document_body,
                        'TFIDF': []
                    }, ignore_index=True)

    def get_all_train(self):
        """
        Returen all TRAIN data to calculate TFIDF
        :return:
        """
        return self._data[self._data.Set == 'TRAIN'].Body.values.tolist()

    def add_tfidf(self, tfidf_classifier: TFIDFClassifier):
        """
        Adds TFIDF to the dataframe to avoid multiple calculation of TFIDF
        :param tfidf_classifier: TFIDF classifier that can do
            word-to-tfidf conversion
        """
        document_matrix = tfidf_classifier.to_tfidf(
            self._data.Body.values.tolist())
        for index, row in enumerate(document_matrix):
            # NOTE(review): set_value was removed in pandas 1.0; use .at.
            self._data.set_value(index, 'TFIDF', document_matrix[index])

    def get_data(self, topic: str, set: str):
        """
        getting vectorized TFIDF equivalent of news with belonging to
        specific topic and set
        :param topic: news topic. e.g. acq
        :param set: TRAIN or TEST
        :return: list of TFIDF in csr_matrix form
        """
        # NOTE: the `set` parameter shadows the builtin in this method.
        positive_examples = self._data[self._data.Topic == topic][
            self._data.Set == set].TFIDF.values.tolist()
        negative_examples = self._data[self._data.Topic != topic][
            self._data.Set == set].TFIDF.values.tolist()
        all_examples = self._matrix_to_list(positive_examples +
                                            negative_examples)
        # Positive examples first, labelled 1; the rest 0.
        labels = [0] * (len(positive_examples) + len(negative_examples))
        labels[0:len(positive_examples)] = [1] * len(positive_examples)
        return all_examples, labels
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''

    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()  # the frame being displayed/edited

    def setDataFrame(self, dataFrame):
        # Swap the underlying frame; callers should follow with
        # signalUpdate() so views refresh.
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        # Column labels for horizontal headers, index labels for vertical.
        if role != Qt.DisplayRole:
            return QVariant()
        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                # return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):
        """Return the cell value at *index* as a string QVariant."""
        if role != Qt.DisplayRole:
            return QVariant()
        if not index.isValid():
            return QVariant()
        # gde 2014.02.19 - original implementation only worked
        # if there were no missing indices. Instead use get_value
        #return QVariant(str(self.df.ix[index.row(), index.column()]))
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        # NOTE(review): get_value/set_value were removed in pandas 1.0;
        # .at[row, col] is the modern replacement.
        str_value = str(self.df.get_value(row, col))
        return QVariant(str_value)

    def flags(self, index):
        # Every cell is editable.
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):
        """Write *value* back into the frame, coercing to column dtype."""
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        if hasattr(value, 'toPyObject'):
            # PyQt4 gets a QVariant
            value = value.toPyObject()
        else:
            # PySide gets an unicode
            dtype = self.df[col].dtype
            if dtype != object:
                # Empty string clears the cell; otherwise coerce.
                value = None if value == '' else dtype.type(value)
        self.df.set_value(row, col, value)
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
class Scores(AnnotationMixin, object):
    """(segment, track) x label score matrix.

    Internally stored as a DataFrame with a (segment, track) MultiIndex
    and one column per label.

    Parameters
    ----------
    uri : str, optional
    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------
        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3
    """

    @classmethod
    def from_df(cls, df, uri=None, modality=None, aggfunc=np.mean):
        """
        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate
            (segment, track, label) tuples

        Returns
        -------
        """
        A = cls(uri=uri, modality=modality)
        # NOTE(review): `rows=`/`cols=` are very old pivot_table keywords;
        # modern pandas uses index=/columns= — confirm pinned version.
        A._df = pivot_table(
            df, values=PYANNOTE_SCORE,
            rows=[PYANNOTE_SEGMENT, PYANNOTE_TRACK],
            cols=PYANNOTE_LABEL, aggfunc=aggfunc
        )
        return A

    def __init__(self, uri=None, modality=None):
        super(Scores, self).__init__()
        # Empty (segment, track) MultiIndex; labels become columns later.
        index = MultiIndex(
            levels=[[], []], labels=[[], []],
            names=[PYANNOTE_SEGMENT, PYANNOTE_TRACK]
        )
        self._df = DataFrame(index=index, dtype=np.float64)
        self.modality = modality
        self.uri = uri
        self._timelineHasChanged = True

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):
        # Deleting by segment drops all its tracks; by (segment, track)
        # drops a single row.
        if isinstance(key, Segment):
            segment = key
            self._df = self._df.drop(segment, axis=0)
            self._timelineHasChanged = True
        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self._df = self._df.drop((segment, track), axis=0)
            self._timelineHasChanged = True
        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        segment, track, label = key
        # NOTE(review): get_value/set_value were removed in pandas 1.0;
        # .at[(segment, track), label] is the modern form.
        return self._df.get_value((segment, track), label)

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return {l: self._df.get_value((segment, track), l)
                for l in self._df}

    # scores[segment, track, label] = value
    def __setitem__(self, key, value):
        segment, track, label = key
        self._df = self._df.set_value((segment, track), label, value)
        self._timelineHasChanged = True

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self._df.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def itervalues(self):
        """Iterate over annotation as (segment, track, label, value)
        tuple"""
        # make sure segment/track pairs are sorted
        self._df = self._df.sort_index()
        # yield one (segment, track, label) tuple per loop
        labels = self._df.columns
        for (segment, track), columns in self._df.iterrows():
            for label in labels:
                value = columns[label]
                # NaN cells mean "no score" and are skipped.
                if np.isnan(value):
                    continue
                else:
                    yield segment, track, label, value

    def _rank(self, invert):
        # Rank per row; direction flips the sort order so rank 0 is
        # always the "best" score under the requested convention.
        if invert:
            direction = 1.
        else:
            direction = -1.

        def nan_rank(data):
            # replace NaN by -inf or +inf depending on the requested
            # direction
            finite = np.isfinite(data)
            fixed = np.where(finite, direction*data, -direction*np.inf)
            # do the actual argsort
            indices = np.argsort(fixed)
            # get rank from argsort
            rank = np.argsort(indices)
            # special treatment for inverted NaN scores
            # (we want ranks to start at 0 even in case of NaN)
            if invert:
                rank = np.where(finite,
                                rank-(len(data)-np.sum(finite)),
                                np.nan)
            else:
                rank = np.where(finite, rank, np.nan)
            return rank

        return self._df.apply(nan_rank, axis=1)

    def rank(self, invert=False):
        """
        Parameters
        ----------
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        rank : `Scores`
        """
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._rank(invert)
        return A

    def nbest(self, n, invert=False):
        """
        Parameters
        ----------
        n : int
            Size of n-best list
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.
        """
        df = self._df.copy()
        # Mask out everything ranked n or worse.
        nbest = self._rank(invert) < n
        df[~nbest] = np.nan
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = df
        return A

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """
        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')
        if invert:
            labels = set(self.labels()) - labels
        else:
            # Only keep labels that actually exist.
            labels = labels & set(self.labels())
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._df[list(labels)]
        return A

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """
        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is
            replaced with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with
            an `Unknown` instance.
        """
        annotation = Annotation(uri=self.uri, modality=self.modality)
        if not self:
            return annotation

        best = self.nbest(1, invert=False)

        if posterior:
            # compute unknown posterior
            func = lambda p: 1. - np.nansum(p, axis=1)
            Pu = self.apply(func, new_columns=['_'])
            # threshold best target posterior
            # with unknown posterior and threshold
            for segment, track, label, value in best.itervalues():
                if value < Pu[segment, track, '_'] or value < threshold:
                    label = Unknown()
                annotation[segment, track] = label
        else:
            # threshold best target score with threshold
            for segment, track, label, value in best.itervalues():
                if value < threshold:
                    label = Unknown()
                annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = func(self._df)
        return A

    def apply(self, data_func, new_index=None, new_columns=None):
        """Apply `data_func` on internal numpy array

        Parameters
        ----------
        data_func : func
            Function expecting (index x columns) numpy array as input
        new_index : iterable, optional
            When provided, these will be the index of returned array.
        new_columns : iterable, optional
            When provided, these will be the columns of returned array.
        """
        new_data = data_func(self._df.values)
        if new_index is None:
            new_index = self._df.index
        if new_columns is None:
            new_columns = self._df.columns
        df = DataFrame(
            data=new_data, index=new_index, columns=new_columns)
        new_scores = self.__class__(uri=self.uri, modality=self.modality)
        new_scores._df = df
        return new_scores

    def _repr_png_(self):
        # Notebook-friendly rendering.
        from pyannote.core.notebook import repr_scores
        return repr_scores(self)
class Aggregates(object):
    """Build a table comparing simulated aggregate amounts/beneficiaries
    with administrative ("real") totals.

    NOTE(review): Python 2 code (iteritems, `except Exception, e`); relies
    on names imported elsewhere (DataFrame, HDFStore, nan, datetime,
    ExcelWriter, os, DATA_DIR). Indentation below was reconstructed from a
    collapsed source — confirm against the original file.
    """

    filter_by = None
    # Column labels of the output table, keyed by internal code.
    labels = collections.OrderedDict((
        ('var', u"Mesure"),
        ('entity', u"Entité"),
        ('dep', u"Dépenses\n(millions d'€)"),
        ('benef', u"Bénéficiaires\n(milliers)"),
        ('dep_default', u"Dépenses initiales\n(millions d'€)"),
        ('benef_default', u"Bénéficiaires\ninitiaux\n(milliers)"),
        ('dep_real', u"Dépenses\nréelles\n(millions d'€)"),
        ('benef_real', u"Bénéficiaires\nréels\n(milliers)"),
        ('dep_diff_abs', u"Diff. absolue\nDépenses\n(millions d'€)"),
        ('benef_diff_abs', u"Diff absolue\nBénéficiaires\n(milliers)"),
        ('dep_diff_rel', u"Diff. relative\nDépenses"),
        ('benef_diff_rel', u"Diff. relative\nBénéficiaires"),
    ))  # TODO: localize
    show_default = False
    show_diff = True
    show_real = True
    survey_scenario = None
    totals_df = None   # administrative totals, filled by load_amounts_from_file
    varlist = None

    def __init__(self, survey_scenario=None):
        if survey_scenario is not None:
            self.set_survey_scenario(survey_scenario)

    def clear(self):
        # Drop cached administrative totals.
        self.totals_df = None

    def compute(self):
        """
        Compute the whole table
        """
        self.compute_aggregates(self.filter_by)
        self.load_amounts_from_file()
        self.compute_real()
        self.compute_diff()

    def compute_aggregates(self, filter_by=None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []                              # variable labels
        M = {'data': [], 'default': []}     # amounts per dataset
        B = {'data': [], 'default': []}     # beneficiaries per dataset
        U = []                              # entity of each variable
        M_label = {
            'data': self.labels['dep'],
            'default': self.labels['dep_default']
        }
        B_label = {
            'data': self.labels['benef'],
            'default': self.labels['benef_default']
        }
        for var in self.varlist:
            # amounts and beneficiaries from current data and default
            # data if exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural
            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])
        # build items list
        items = [(self.labels['var'], V)]
        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))
        items.append((self.labels['entity'], U))
        # NOTE(review): DataFrame.from_items was removed in pandas 1.0.
        aggr_frame = DataFrame.from_items(items)
        # Re-order columns following the `labels` ordering; labels with no
        # matching column are silently skipped.
        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col,
                                                           how="outer")
            except:
                pass

    def compute_diff(self):
        '''
        Computes and adds relative differences
        '''
        dep = self.aggr_frame[self.labels['dep']]
        benef = self.aggr_frame[self.labels['benef']]
        # Compare against defaults when requested, else against real data.
        if self.show_default:
            ref_dep_label, ref_benef_label = self.labels[
                'dep_default'], self.labels['benef_default']
            if ref_dep_label not in self.aggr_frame:
                return
        elif self.show_real:
            ref_dep_label, ref_benef_label = self.labels[
                'dep_real'], self.labels['benef_real']
        else:
            return
        ref_dep = self.aggr_frame[ref_dep_label]
        ref_benef = self.aggr_frame[ref_benef_label]
        self.aggr_frame[self.labels['dep_diff_rel']] = \
            (dep - ref_dep) / abs(ref_dep)
        self.aggr_frame[self.labels['benef_diff_rel']] = (
            benef - ref_benef) / abs(ref_benef)
        self.aggr_frame[self.labels['dep_diff_abs']] = dep - ref_dep
        self.aggr_frame[self.labels['benef_diff_abs']] = \
            benef - ref_benef

    def compute_real(self):
        '''
        Adds administrative data to dataframe
        '''
        if self.totals_df is None:
            return
        A, B = [], []
        for var in self.varlist:
            # totals from administrative data
            if var in self.totals_df.index:
                A.append(self.totals_df.get_value(var, "amount"))
                B.append(self.totals_df.get_value(var, "benef"))
            else:
                A.append(nan)
                B.append(nan)
        self.aggr_frame[self.labels['dep_real']] = A
        self.aggr_frame[self.labels['benef_real']] = B

    def create_description(self):
        '''
        Creates a description dataframe
        '''
        now = datetime.now()
        return DataFrame([
            u'OpenFisca',
            u'Calculé le %s à %s' % (now.strftime('%d-%m-%Y'),
                                     now.strftime('%H:%M')),
            u'Système socio-fiscal au %s' % self.simulation.period.start,
            u"Données d'enquêtes de l'année %s" %
            str(self.simulation.input_table.survey_year),
        ])

    def get_aggregate(self, variable, filter_by=None):
        """
        Returns aggregate spending, and number of beneficiaries
        for the relevant entity level

        Parameters
        ----------
        variable : string
                   name of the variable aggregated according to its
                   entity
        """
        simulation = self.simulation
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        column = column_by_name[variable]
        weight_name = self.weight_column_name_by_entity_key_plural[
            column.entity_key_plural]
        filter_by_name = "{}_{}".format(filter_by,
                                        column.entity_key_plural)
        # amounts and beneficiaries from current data and default data if
        # exists
        # Build weights for each entity
        data = DataFrame({
            variable: simulation.calculate(variable),
            weight_name: simulation.calculate(weight_name),
        })
        data_default = None
        datasets = {'data': data}
        if data_default is not None:
            datasets['default'] = data_default
        filter_indicator = True
        if filter_by:
            filtered_data = DataFrame({
                variable: simulation.calculate(variable),
                weight_name: simulation.calculate(weight_name),
                filter_by_name: simulation.calculate(filter_by_name),
            })
            data_default = None
            filter_indicator = filtered_data[filter_by_name]
        m_b = {}
        # Weight each observation, optionally masked by the filter.
        weight = data[weight_name] * filter_indicator
        for name, data in datasets.iteritems():
            amount = data[variable]
            benef = data[variable].values != 0
            try:
                # Amounts in millions of euros.
                total_amount = int(round(sum(amount * weight) / 10**6))
            except:
                total_amount = nan
            try:
                # Beneficiaries in thousands.
                total_benef = int(round(sum(benef * weight) / 10**3))
            except:
                total_benef = nan
            m_b[name] = [total_amount, total_benef]
        return m_b

    def load_amounts_from_file(self, filename=None, year=None):
        '''
        Loads totals from files
        '''
        if year is None:
            year = self.year
        if filename is None:
            data_dir = DATA_DIR
        try:
            # NOTE(review): when `filename` is given, data_dir may be
            # unbound here — confirm intended control flow.
            filename = os.path.join(data_dir, "amounts.h5")
            store = HDFStore(filename)
            df_a = store['amounts']
            df_b = store['benef']
            store.close()
            self.totals_df = DataFrame(data={
                "amount": df_a[year] / 10**6,
                "benef": df_b[year] / 1000,
            })
            row = DataFrame({'amount': nan, 'benef': nan},
                            index=['logt'])
            self.totals_df = self.totals_df.append(row)
            # Add some aditionnals totals
            for col in ['amount', 'benef']:
                # Deals with logt
                logt = 0
                for var in ['apl', 'alf', 'als']:
                    logt += self.totals_df.get_value(var, col)
                self.totals_df.set_value('logt', col, logt)
                # Deals with rsa rmi
                rsa = 0
                for var in ['rmi', 'rsa']:
                    rsa += self.totals_df.get_value(var, col)
                self.totals_df.set_value('rsa', col, rsa)
                # Deals with irpp, csg, crds
                for var in ['irpp', 'csg', 'crds',
                            'cotsoc_noncontrib']:
                    if col in ['amount']:
                        # Taxes are stored negated in the source data.
                        val = -self.totals_df.get_value(var, col)
                        self.totals_df.set_value(var, col, val)
        except:
            # raise Exception(" No administrative data available for year "
            #                 + str(year))
            import warnings
            warnings.warn(
                "No administrative data available for year %s in file %s"
                % (str(year), filename))
            self.totals_df = None
            return

    def save_table(self, directory=None, filename=None,
                   table_format=None):
        '''
        Saves the table to some format
        '''
        now = datetime.now()
        # Infer the output format from the filename extension.
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'
        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'),
                                             table_format)
        fname = os.path.join(directory, filename)
        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index=False,
                            header=True)
                descr = self.create_description()
                descr.to_excel(writer, "description", index=False,
                               header=False)
                writer.save()
            elif table_format == "csv":
                # NOTE(review): to_csv's second positional argument is the
                # separator, not a sheet name — confirm intent.
                df.to_csv(fname, "aggregates", index=False, header=True)
        except Exception, e:
            raise Exception("Aggregates: Error saving file", str(e))
import pandas as pd
from pandas import DataFrame
import numpy as np

# load the datasets
names = ['dealerId', 'gender']
df = DataFrame(pd.read_csv("./GenderPythonTest.csv", sep=',',
                           names=names))

# Probablistic network build-up:
# bayesNetwork[0] holds feature names, bayesNetwork[1]/[2] the male/female
# counts at the matching position.
bayesNetwork = [[], [], []]
counter = -1
for idx in df['dealerId'].index:
    # NOTE(review): get_value was removed in pandas 1.0; .at is the
    # modern replacement.
    featureName = df.get_value(idx, 'dealerId')
    if featureName not in bayesNetwork[0]:
        if counter != -1 and bayesNetwork[1][counter] + bayesNetwork[2][
                counter] == 0:
            #extreme case, the data is fault, hence overwrite the previous feature
            bayesNetwork[0][counter] = featureName
        else:
            bayesNetwork[0].append(featureName)
            bayesNetwork[1].append(0)
            bayesNetwork[2].append(0)
            counter += 1
    # NOTE(review): when featureName was already seen earlier (non-
    # contiguous duplicates), counter still points at the most recently
    # added feature, so this increments the wrong bucket — confirm input
    # is grouped by dealerId.
    gender = df.get_value(idx, 'gender')
    if gender == 'male':
        bayesNetwork[1][counter] += 1
    elif gender == 'female':
        bayesNetwork[2][counter] += 1

# For visualization
viz_data = {
class Instancia:
    '''
    Implements one instance of the scenario in which the vehicle-routing
    optimization problem is solved.

    The instance is loaded from a JSON file containing ``requests`` (one
    entry per pickup/delivery request) and ``static_data`` (fleet and
    problem-wide parameters).
    '''

    def __init__(self, conf, name_file):
        '''
        Receives the instance file name and initializes the class.

        Parameters
        ----------
        conf : dict
            Configuration containing the key ``'instancia_path'``.
        name_file : str
            File name of the JSON instance inside that directory.
        '''
        instancia_path = path.join(conf['instancia_path'], name_file)
        with open(instancia_path, 'r') as file:
            d = loads(file.read())
            # (the explicit file.close() was redundant inside `with`)
        self.instancia_path = instancia_path
        self.__requests = d['requests']
        self.__static_data = d['static_data']
        self.__service_time = d['static_data']['service_time']
        self.__df_total = DataFrame(self.__requests)
        self.n = len(self.__requests)                            # number of requests
        self.m = d['static_data']['number_of_vehicles']
        self.Q = d['static_data']['max_vehicle_capacity']
        self.T = d['static_data']['total_time']
        self.u_mean = d['static_data']['urgency_mean']
        self.u_std = d['static_data']['urgency_std']
        self.dynamism = d['static_data']['dynamism']
        self.dp_ratio = d['static_data']['dp_ratio']
        self.priori_ratio = d['static_data']['priori_ratio']
        # Depot is located at the origin.
        self.deposito_x = 0
        self.deposito_y = 0

    def __get_distance(self, fonte, antro):
        '''
        Generic Euclidean distance between two request points (1-based ids),
        read from the instance DataFrame, rounded to 2 decimals.
        '''
        # DataFrame.get_value() was removed in pandas 1.0; .at is the
        # supported label-based scalar accessor.
        x1 = self.__df_total.at[fonte - 1, "service_point_x"]
        y1 = self.__df_total.at[fonte - 1, "service_point_y"]
        x2 = self.__df_total.at[antro - 1, "service_point_x"]
        y2 = self.__df_total.at[antro - 1, "service_point_y"]
        return round(sqrt((x1 - x2)**2 + (y1 - y2)**2), 2)

    def __get_distance_deposito(self, local):
        '''
        Generic Euclidean distance between one request point (1-based id)
        and the depot location, rounded to 2 decimals.
        '''
        x1 = self.__df_total.at[local - 1, "service_point_x"]
        y1 = self.__df_total.at[local - 1, "service_point_y"]
        x2 = self.deposito_x
        y2 = self.deposito_y
        return round(sqrt((x1 - x2)**2 + (y1 - y2)**2), 2)

    def __get_base(self, initial=None, offset=False, item=None):
        '''
        Generic helper that maps every request to a value in a dictionary.

        Keys are ``i + 1`` (plus ``offset`` when it is a non-zero int);
        values are either the constant ``item`` (when it is an int) or the
        request field named by ``item`` (when it is a str).
        '''
        if not initial:
            initial = {}
        for i, r in enumerate(self.__requests):
            if offset and type(offset) == int:
                index = i + 1 + offset
            else:
                index = i + 1
            if type(item) == int:
                initial[index] = item
            elif type(item) == str:
                initial[index] = r[item]
            else:
                pass
        return initial

    '''
    All 'get_x(self)' functions return the static data of the problem.
    Substituting 'x' with the following options gives:
    q -> passenger pickup/drop-off amount per location
    s -> passenger pickup/drop-off service time at the location
    t -> desired service instant of the request at the origins
    W -> maximum waiting time for service at the origins
    R -> maximum ride time for service at each origin
    O -> all request origins
    D -> all request destinations
    V -> all vertices of the created graph
    K -> all vehicles
    '''

    def get_q(self):
        # Depot nodes (0 and 2n+1) carry no load; origins +1, destinations -1.
        d = {0: 0, 2 * self.n + 1: 0}
        d = self.__get_base(d, offset=False, item=1)
        return self.__get_base(d, offset=self.n, item=-1)

    def get_s(self):
        # Same constant service time at origins and destinations.
        d = {0: 0, 2 * self.n + 1: 0}
        d = self.__get_base(d, offset=False, item=self.__service_time)
        return self.__get_base(d, offset=self.n, item=self.__service_time)

    def get_t(self):
        return self.__get_base(offset=False, item="desired_time")

    def get_W(self):
        return self.__get_base(offset=False, item="max_wait_time")

    def get_R(self):
        return self.__get_base(offset=False, item="max_ride_time")

    def get_O(self):
        return [o + 1 for o in range(self.n)]

    def get_D(self):
        return [o + self.n + 1 for o in range(self.n)]

    def get_V(self):
        return [0] + self.get_O() + self.get_D() + [2 * self.n + 1]

    def get_K(self):
        return range(self.m)

    def get_T(self):
        return self.T

    def get_urgency(self):
        return self.u_mean, self.u_std

    def get_dynamism(self):
        return self.dynamism

    def get_dp_ratio(self):
        return self.dp_ratio

    def get_priori_ratio(self):
        return self.priori_ratio

    def get_req(self):
        return self.__requests

    def get_static_data(self):
        return self.__static_data

    def get_tau(self):
        '''
        Returns a dictionary where:
        Key, a tuple (a, b), represents the arc between 'a' and 'b'
        Value, a float c, represents the travel time on the arc
        '''
        # Travel time of a vehicle that stays parked is null.
        tau = {(0, 2 * self.n + 1): 0}
        # DataFrames used to find requests and to tell drops from picks.
        df_total = DataFrame(self.__requests)
        df_drops = df_total.loc[df_total["service_type"] == "drop"]
        df_picks = df_total.loc[df_total["service_type"] == "pick"]
        id_drops = [x + 1 for x in list(df_drops.index.values)]
        id_picks = [x + 1 for x in list(df_picks.index.values)]
        pedidos = list(df_total.index.values)

        # A graph is a dictionary of adjacency lists.
        graph = defaultdict(list)

        def addEdge(g, u, v):
            g[u].append(v)

        # For every request:
        for pedido in pedidos:
            origem_pedido, destino_pedido = pedido + 1, pedido + 1 + self.n
            # Arcs depot -> origin, destination -> depot,
            # and both directions between origin and destination.
            addEdge(graph, 0, origem_pedido)
            addEdge(graph, destino_pedido, 2 * self.n + 1)
            addEdge(graph, origem_pedido, destino_pedido)
            addEdge(graph, destino_pedido, origem_pedido)
            # For every other request (skipping itself):
            for outro_pedido in pedidos:
                if pedido != outro_pedido:
                    origem_outro_pedido, destino_outro_pedido = \
                        outro_pedido + 1, outro_pedido + 1 + self.n
                    # Arcs between the origins, between the destinations,
                    # origin of the first <-> destination of the second.
                    addEdge(graph, origem_pedido, origem_outro_pedido)
                    addEdge(graph, destino_pedido, destino_outro_pedido)
                    addEdge(graph, origem_pedido, destino_outro_pedido)
                    addEdge(graph, destino_pedido, origem_outro_pedido)

        def genEdge(graph):
            # Flatten the adjacency lists into a list of (u, v) tuples.
            edges = []
            for node in graph:
                for neighbour in graph[node]:
                    edges.append((node, neighbour))
            return edges

        # Arcs are tuples of nodes, named 'fonte' (source) and 'antro' (sink).
        # A node "counts as the depot" when it is the depot itself, the
        # destination of a pick, or the origin of a drop (both coincide
        # with the depot location).
        for arco in genEdge(graph):
            fonte = arco[0]
            antro = arco[1]
            fonte_depo = False
            antro_depo = False
            if (fonte == 0) or (fonte > self.n and fonte - self.n in id_picks) \
                    or (fonte <= self.n and fonte in id_drops):
                fonte_depo = True
            if antro == 2 * self.n + 1 \
                    or (antro > self.n and antro - self.n in id_picks) \
                    or (antro <= self.n and antro in id_drops):
                antro_depo = True
            # Map destination ids back onto their request id for lookups.
            if fonte > self.n:
                fonte = fonte - self.n
            if antro > self.n:
                antro = antro - self.n
            if (fonte_depo, antro_depo) == (True, True):
                tau[arco] = 0
            elif (fonte_depo, antro_depo) == (True, False):
                tau[arco] = self.__get_distance_deposito(antro)
            elif (fonte_depo, antro_depo) == (False, True):
                tau[arco] = self.__get_distance_deposito(fonte)
            else:
                tau[arco] = self.__get_distance(fonte, antro)
        return tau

    def get_pos_requests(self):
        '''
        Returns a list of tuples representing the requests, with the
        following ordered data:
        request id (int)
        request x position (float)
        request y position (float)
        request type (str)
        desired time (int)

        Method used when creating images that present the obtained results.
        '''
        data = []
        columns = [
            "service_point_x", "service_point_y", "service_type",
            "desired_time"
        ]
        for i, r in enumerate(list(self.__df_total[columns].values)):
            id_pedido = i + 1
            x, y, t, d = float(r[0]), float(r[1]), str(r[2]), int(r[3])
            data.append((id_pedido, x, y, t, d))
        return data
def fit(self, annotations): """ Parameters ---------- annotations : (Annotation, Annotation) iterator Returns ------- """ # possible_match[n, m] is the total possible match duration # when there are n A-tracks & m B-tracks possible_match = DataFrame() # actual_match[n, m] is the total actual match duration # when there are n A-tracks & m B-tracks actual_match = DataFrame() # overlap[n, m] is the total duration # when there are n A-tracks & m B-tracks overlap = DataFrame() for n, (A, B) in enumerate(annotations): assert isinstance(A, Annotation), "%r is not an Annotation" % A assert isinstance(B, Annotation), "%r is not an Annotation" % B if n == 0: self.modalityA = A.modality self.modalityB = B.modality else: assert A.modality == self.modalityA, \ "bad modality (%r, %r)" % (self.modalityA, A.modality) assert B.modality == self.modalityB, \ "bad modality (%r, %r)" % (self.modalityB, B.modality) assert A.uri == B.uri, \ "resource mismatch (%r, %r)" % (A.uri, B.uri) timeline, a, b = self._AB2ab(A, B) for segment in timeline: duration = segment.duration # number of tracks atracks = a.tracks(segment) Na = len(atracks) btracks = b.tracks(segment) Nb = len(btracks) if Na == 0 or Nb == 0: continue # number of matching tracks N = len(a.get_labels(segment) & b.get_labels(segment)) # increment possible_match & actual_match try: p_m = possible_match.get_value(Na, Nb) a_m = actual_match.get_value(Na, Nb) ovl = overlap.get_value(Na, Nb) except Exception, e: p_m = 0. a_m = 0. ovl = 0. possible_match = possible_match.set_value(Na, Nb, p_m + min(Na, Nb)*duration) actual_match = actual_match.set_value(Na, Nb, a_m + N*duration) overlap = overlap.set_value(Na, Nb, ovl + duration)
for i in range(row): for j in range(col): myList = matrix[i][j] #print myList list_iter = 0 current = datetime.combine(start_day, start_time) while current < end: no_crimes = 0 day_end = current + delta while list_iter < len(myList) and myList[list_iter] < day_end: no_crimes += 1 list_iter += 1 temp = df.get_value(current.date(), 'freq') + no_crimes df.set_value(current.date(), 'freq', temp) day_matrix[i][j].append(no_crimes) current += delta count_matrix[i][j] = len(myList) total += len(myList) #print count_matrix #print(total) #print day_matrix[2][2] """ for i in range(row): for j in range(col): print str(int(math.floor(count_matrix[i][j]))),
def cal_portfolio_returns(self, cal_period = 1, cal_type = 'momentum',
                          index_type = '300', return_type = 'period',
                          cal_percent = [0.01]):
    # Compute excess returns of momentum / reversal portfolios built from
    # index constituents, storing the result in self.portfolio_returns.
    #
    # Parameters:
    #   cal_period  : rebalancing horizon in rows of self.df_price
    #   cal_type    : 'momentum' (buy past winners) or 'crossovers'
    #                 (buy past losers)
    #   index_type  : '300' / '500' / '800', selects the index universe
    #   return_type : 'period' (per-period excess) or 'cum' (cumulative)
    #   cal_percent : fractions of the universe to hold per portfolio
    #
    # NOTE(review): mutable default argument cal_percent=[0.01]; also uses
    # .ix, get_value, set_value and inplace set_levels, all removed from
    # modern pandas — migration needed before running on pandas >= 1.0.
    # Determine the index type
    if index_type == '300':
        df_index = self.df_index_300
        index_prices = self.index_prices_300
    elif index_type == '500':
        df_index = self.df_index_500
        index_prices = self.index_prices_500
    elif index_type == '800':
        df_index = self.df_index_800
        index_prices = self.index_prices_800
    # Convert the first column level of the index info to datetimes
    index_levels = pd.to_datetime(df_index.columns.get_level_values(0).unique())
    df_index.columns.set_levels(index_levels, level=0, inplace=True)
    # Compute the number of stocks per portfolio
    cal_num = [int(float(index_type) * x) for x in cal_percent]
    # Fetch the constituent codes
    df_code = df_index.xs(key='code', axis=1, level=1)
    # Compute per-stock returns
    stock_returns = self.df_price.pct_change(cal_period)
    # Compute index returns
    index_total_returns = index_prices.pct_change(cal_period)
    period_list = stock_returns.index
    # Compute the strategy portfolio returns
    portfolio_returns = DataFrame()
    index_cum_returns = Series()
    for i in range(cal_period, len(period_list) - cal_period, cal_period):
        # Returns of the index constituents at the formation date
        cal_date = period_list[i]
        index_returns = stock_returns[df_code[cal_date]].ix[cal_date, :]
        # Rank constituents by past return (winners first for momentum,
        # losers first for reversal)
        if cal_type == 'momentum':
            index_returns.sort_values(ascending=False, inplace=True)
        elif cal_type == 'crossovers':
            index_returns.sort_values(ascending=True, inplace=True)
        for num in cal_num:
            # Select the top `num` constituents from the previous period
            portfolio = index_returns[0:num].index
            # Average portfolio return over the next period
            return_date = period_list[i + cal_period]
            portfolio_return = Series.mean(stock_returns[portfolio].ix[return_date, :])
            # Per-period excess return
            if return_type == 'period':
                excess_return = portfolio_return - index_total_returns[return_date]
            # Cumulative excess return
            elif return_type == 'cum':
                if i > cal_period:
                    index_return = index_cum_returns[cal_date] * (1 + index_total_returns[return_date])
                    cum_return = portfolio_returns.get_value(index=cal_date, col=num / float(index_type))
                    portfolio_return = (1 + portfolio_return) * (index_cum_returns[cal_date] + cum_return)
                else:
                    # First iteration: seed the cumulative series with 1+r
                    index_return = 1 + index_total_returns[return_date]
                    portfolio_return = 1 + portfolio_return
                excess_return = portfolio_return - index_return
                index_cum_returns.set_value(label=return_date, value=index_return)
            portfolio_returns.set_value(index=return_date, col=num / float(index_type), value=excess_return)
    # Relabel columns as percentage strings, e.g. '1.0%'
    portfolio_returns.columns = [str(x*100) + '%' for x in cal_percent]
    self.portfolio_returns = portfolio_returns
class Aggregates(object):
    # Builds a comparison table of simulated vs. administrative aggregates
    # (amounts and beneficiary counts) for a list of variables.
    #
    # NOTE(review): Python 2 code throughout (iteritems, `except Exception, e`,
    # bare `except:`) and removed pandas APIs (DataFrame.from_items,
    # get_value/set_value, DataFrame.append) — migration needed for py3.

    # Optional name of a boolean filter variable applied per entity.
    filter_by = None
    # Column labels (French), keyed by internal column code.
    labels = collections.OrderedDict((
        ('var', u"Mesure"),
        ('entity', u"Entité"),
        ('dep', u"Dépenses\n(millions d'€)"),
        ('benef', u"Bénéficiaires\n(milliers)"),
        ('dep_default', u"Dépenses initiales\n(millions d'€)"),
        ('benef_default', u"Bénéficiaires\ninitiaux\n(milliers)"),
        ('dep_real', u"Dépenses\nréelles\n(millions d'€)"),
        ('benef_real', u"Bénéficiaires\nréels\n(milliers)"),
        ('dep_diff_abs', u"Diff. absolue\nDépenses\n(millions d'€)"),
        ('benef_diff_abs', u"Diff absolue\nBénéficiaires\n(milliers)"),
        ('dep_diff_rel', u"Diff. relative\nDépenses"),
        ('benef_diff_rel', u"Diff. relative\nBénéficiaires"),
        ))  # TODO: localize
    # Display switches for default / diff / real columns.
    show_default = False
    show_diff = True
    show_real = True
    survey_scenario = None
    # DataFrame of administrative totals (index: variable names,
    # columns: "amount", "benef"); None until loaded.
    totals_df = None
    # List of variable names to aggregate.
    varlist = None

    def __init__(self, survey_scenario = None):
        if survey_scenario is not None:
            self.set_survey_scenario(survey_scenario)

    def clear(self):
        # Drop previously loaded administrative totals.
        self.totals_df = None

    def compute(self):
        """
        Compute the whole table
        """
        self.compute_aggregates(self.filter_by)
        self.load_amounts_from_file()
        self.compute_real()
        self.compute_diff()

    def compute_aggregates(self, filter_by = None):
        """
        Compute aggregate amounts

        Builds self.aggr_frame with one row per variable in self.varlist:
        label, simulated amount, simulated beneficiaries, entity.
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []                                   # variable labels
        M = {'data': [], 'default': []}          # amounts per dataset
        B = {'data': [], 'default': []}          # beneficiaries per dataset
        U = []                                   # entity names
        M_label = {'data': self.labels['dep'],
                   'default': self.labels['dep_default']}
        B_label = {'data': self.labels['benef'],
                   'default': self.labels['benef_default']}
        for var in self.varlist:
            # amounts and beneficiaries from current data and default data if exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural
            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])
        # build items list
        items = [(self.labels['var'], V)]
        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))
        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)
        # Reorder columns following the order of self.labels; labels that
        # are absent from aggr_frame are silently skipped.
        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except:
                pass

    def compute_diff(self):
        '''
        Computes and adds relative differences
        '''
        dep = self.aggr_frame[self.labels['dep']]
        benef = self.aggr_frame[self.labels['benef']]
        # Reference columns: default data if requested, else real
        # (administrative) data; nothing to compare against otherwise.
        if self.show_default:
            ref_dep_label, ref_benef_label = self.labels['dep_default'], self.labels['benef_default']
            if ref_dep_label not in self.aggr_frame:
                return
        elif self.show_real:
            ref_dep_label, ref_benef_label = self.labels['dep_real'], self.labels['benef_real']
        else:
            return
        ref_dep = self.aggr_frame[ref_dep_label]
        ref_benef = self.aggr_frame[ref_benef_label]
        self.aggr_frame[self.labels['dep_diff_rel']] = (dep - ref_dep) / abs(ref_dep)
        self.aggr_frame[self.labels['benef_diff_rel']] = (benef - ref_benef) / abs(ref_benef)
        self.aggr_frame[self.labels['dep_diff_abs']] = dep - ref_dep
        self.aggr_frame[self.labels['benef_diff_abs']] = benef - ref_benef

    def compute_real(self):
        '''
        Adds administrative data to dataframe
        '''
        if self.totals_df is None:
            return
        A, B = [], []
        for var in self.varlist:
            # totals from administrative data; NaN when the variable has
            # no administrative counterpart.
            if var in self.totals_df.index:
                A.append(self.totals_df.get_value(var, "amount"))
                B.append(self.totals_df.get_value(var, "benef"))
            else:
                A.append(nan)
                B.append(nan)
        self.aggr_frame[self.labels['dep_real']] = A
        self.aggr_frame[self.labels['benef_real']] = B

    def create_description(self):
        '''
        Creates a description dataframe
        '''
        now = datetime.now()
        return DataFrame([
            u'OpenFisca',
            u'Calculé le %s à %s' % (now.strftime('%d-%m-%Y'), now.strftime('%H:%M')),
            u'Système socio-fiscal au %s' % self.simulation.period.start,
            u"Données d'enquêtes de l'année %s" % str(self.simulation.input_table.survey_year),
            ])

    def get_aggregate(self, variable, filter_by = None):
        """
        Returns aggregate spending, and number of beneficiaries
        for the relevant entity level

        Parameters
        ----------
        variable : string
                   name of the variable aggregated according to its entity

        Returns a dict mapping dataset name ('data', optionally 'default')
        to [amount in millions, beneficiaries in thousands].
        """
        simulation = self.simulation
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        column = column_by_name[variable]
        weight_name = self.weight_column_name_by_entity_key_plural[column.entity_key_plural]
        filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
        # amounts and beneficiaries from current data and default data if exists
        # Build weights for each entity
        data = DataFrame(
            {
                variable: simulation.calculate_add(variable),
                weight_name: simulation.calculate(weight_name),
                }
            )
        data_default = None
        datasets = {'data': data}
        if data_default is not None:
            datasets['default'] = data_default
        # Weight each entity row by the (boolean) filter when one is given.
        filter_indicator = True
        if filter_by:
            filtered_data = DataFrame(
                {
                    variable: simulation.calculate(variable),
                    weight_name: simulation.calculate(weight_name),
                    filter_by_name: simulation.calculate(filter_by_name),
                    }
                )
            data_default = None
            filter_indicator = filtered_data[filter_by_name]
        m_b = {}
        weight = data[weight_name] * filter_indicator
        for name, data in datasets.iteritems():
            amount = data[variable]
            benef = data[variable].values != 0
            # Amounts in millions, beneficiaries in thousands; NaN on failure.
            try:
                total_amount = int(round(sum(amount * weight) / 10 ** 6))
            except:
                total_amount = nan
            try:
                total_benef = int(round(sum(benef * weight) / 10 ** 3))
            except:
                total_benef = nan
            m_b[name] = [total_amount, total_benef]
        return m_b

    def load_amounts_from_file(self, filename = None, year = None):
        '''
        Loads totals from files
        '''
        if year is None:
            year = self.year
        if filename is None:
            data_dir = DATA_DIR
        try:
            filename = os.path.join(data_dir, "amounts.h5")
            store = HDFStore(filename)
            df_a = store['amounts']
            df_b = store['benef']
            store.close()
            # Normalize units: amounts to millions, beneficiaries to thousands.
            self.totals_df = DataFrame(data = {
                "amount": df_a[year] / 10 ** 6,
                "benef": df_b[year] / 1000,
                })
            row = DataFrame({'amount': nan, 'benef': nan}, index = ['logt'])
            self.totals_df = self.totals_df.append(row)
            # Add some aditionnals totals
            for col in ['amount', 'benef']:
                # Deals with logt: sum of the housing benefits
                logt = 0
                for var in ['apl', 'alf', 'als']:
                    logt += self.totals_df.get_value(var, col)
                self.totals_df.set_value('logt', col, logt)
                # Deals with rsa rmi: merged into a single 'rsa' total
                rsa = 0
                for var in ['rmi', 'rsa']:
                    rsa += self.totals_df.get_value(var, col)
                self.totals_df.set_value('rsa', col, rsa)
                # Deals with irpp, csg, crds: amounts are stored negated
                for var in ['irpp', 'csg', 'crds', 'cotsoc_noncontrib']:
                    if col in ['amount']:
                        val = - self.totals_df.get_value(var, col)
                        self.totals_df.set_value(var, col, val)
        except:
            # raise Exception(" No administrative data available for year " + str(year))
            # Best-effort: missing administrative data is not fatal.
            import warnings
            warnings.warn("No administrative data available for year %s in file %s" % (str(year), filename))
            self.totals_df = None
        return

    def save_table(self, directory = None, filename = None, table_format = None):
        '''
        Saves the table to some format
        '''
        now = datetime.now()
        # Infer the output format from the filename extension, default xls.
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'
        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'), table_format)
        fname = os.path.join(directory, filename)
        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index = False, header = True)
                # Second sheet with generation metadata.
                descr = self.create_description()
                descr.to_excel(writer, "description", index = False, header = False)
                writer.save()
            elif table_format == "csv":
                df.to_csv(fname, "aggregates", index = False, header = True)
        except Exception, e:
            raise Exception("Aggregates: Error saving file", str(e))