Example #1
class DataFrameStringIndexing(object):

    goal_time = 0.2

    def setup(self):
        index = tm.makeStringIndex(1000)
        columns = tm.makeStringIndex(30)
        self.df = DataFrame(np.random.randn(1000, 30), index=index,
                            columns=columns)
        self.idx_scalar = index[100]
        self.col_scalar = columns[10]
        self.bool_indexer = self.df[self.col_scalar] > 0
        self.bool_obj_indexer = self.bool_indexer.astype(object)

    def time_get_value(self):
        with warnings.catch_warnings(record=True):
            self.df.get_value(self.idx_scalar, self.col_scalar)

    def time_ix(self):
        self.df.ix[self.idx_scalar, self.col_scalar]

    def time_loc(self):
        self.df.loc[self.idx_scalar, self.col_scalar]

    def time_getitem_scalar(self):
        self.df[self.col_scalar][self.idx_scalar]

    def time_boolean_rows(self):
        self.df[self.bool_indexer]

    def time_boolean_rows_object(self):
        self.df[self.bool_obj_indexer]
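For context: the DataFrame.get_value and .ix accessors benchmarked above were deprecated in pandas 0.21/0.20 and removed in 1.0. A sketch of the same scalar lookups against the current API, reusing the fixture built in setup():

self.df.at[self.idx_scalar, self.col_scalar]    # label-based scalar, replaces get_value
self.df.iat[100, 10]                            # position-based scalar
self.df.loc[self.idx_scalar, self.col_scalar]   # replaces .ix for label access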
Example #2
def calculate_deltas(snps: pd.DataFrame) -> pd.DataFrame:
    columns = ["#rsID", "genotype", "delta", "chr"]
    global delta_rsIDs
    print("Analyzing chromosome {}".format(
        snps.get_value(snps.index.values[0], "chr")))
    for idx, row in snps.iterrows():
        # difference between the paired value columns of each candidate genotype
        deltas = [row[0] - row[3], row[1] - row[4], row[2] - row[5]]
        max_delta = np.argmax(deltas)
        if max_delta == 0:
            genotype = "11"
        elif max_delta == 1:
            genotype = "12"
        else:
            genotype = "22"
        temp_df = pd.DataFrame(
            [[idx, genotype, deltas[max_delta], row["chr"]]], columns=columns)
        delta_rsIDs.append(temp_df)
    result = pd.concat(delta_rsIDs, ignore_index=True)
    print("Finished analyzing {}".format(
        snps.get_value(snps.index.values[0], "chr")))
    print(result.columns)
    return result
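The per-row loop above can also be vectorized; a minimal sketch, assuming the first six columns hold the two value triplets in the same order the loop reads them:

def calculate_deltas_vectorized(snps: pd.DataFrame) -> pd.DataFrame:
    # column-wise differences between the first and the second triplet of values
    deltas = snps.iloc[:, 0:3].values - snps.iloc[:, 3:6].values
    best = deltas.argmax(axis=1)  # 0, 1 or 2 per row
    return pd.DataFrame({
        "#rsID": snps.index,
        "genotype": pd.Series(best).map({0: "11", 1: "12", 2: "22"}).values,
        "delta": deltas[np.arange(len(snps)), best],
        "chr": snps["chr"].values,
    })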
Example #3
class DataFrameStringIndexing:
    def setup(self):
        index = tm.makeStringIndex(1000)
        columns = tm.makeStringIndex(30)
        self.df = DataFrame(np.random.randn(1000, 30),
                            index=index,
                            columns=columns)
        self.idx_scalar = index[100]
        self.col_scalar = columns[10]
        self.bool_indexer = self.df[self.col_scalar] > 0
        self.bool_obj_indexer = self.bool_indexer.astype(object)

    def time_get_value(self):
        with warnings.catch_warnings(record=True):
            self.df.get_value(self.idx_scalar, self.col_scalar)

    def time_ix(self):
        self.df.ix[self.idx_scalar, self.col_scalar]

    def time_loc(self):
        self.df.loc[self.idx_scalar, self.col_scalar]

    def time_getitem_scalar(self):
        self.df[self.col_scalar][self.idx_scalar]

    def time_boolean_rows(self):
        self.df[self.bool_indexer]

    def time_boolean_rows_object(self):
        self.df[self.bool_obj_indexer]
Example #4
    def compute_tf_idf_queries(self):
        # Find total number of document
        results = self.cursor.execute('SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents'))
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute('SELECT did, total_word, path FROM documents')
        tmp = results.fetchall()
        documents_df = DataFrame(tmp, columns=['did', 'total_word', 'path'])
        documents_df['tf_idf'] = 0.0

        no_docterm = {}

        for query in self.queries:
            no_docterm[query] = 0

        for index, row in documents_df.iterrows():
            path = row['path']
            with codecs.open(path, 'rt', encoding='utf-8') as f:
                text = f.read()
                for query in self.queries:
                    if query in text.lower():
                        no_docterm[query] += 1

        for query in self.queries:
            for index, row in documents_df.iterrows():
                total_word = row['total_word']
                path = row['path']

                with codecs.open(path, 'rt', encoding='utf-8') as f:
                    text = f.read()

                tf_idf = self._compute_tf_idf_queries(text, total_word, total_doc, no_docterm[query])
                cur_tf_idf = documents_df.get_value(index, 'tf_idf')
                documents_df.set_value(index, 'tf_idf', cur_tf_idf + tf_idf)

        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
        df['tf_idf'] = 0.0

        for index, row in df.iterrows():
            did = row['did']
            tf_idf = documents_df[documents_df['did'] == did]['tf_idf'].values[0]
            df.set_value(index, 'tf_idf', tf_idf)

        del df['did']
        df = df.groupby(['e_type', 'entity']).sum().reset_index()
        return df
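get_value/set_value were deprecated in pandas 0.21 and removed in 1.0; the read-modify-write pair in the scoring loop collapses to a single .at assignment on current pandas:

                documents_df.at[index, 'tf_idf'] += tf_idf    # was get_value + set_value
                df.at[index, 'tf_idf'] = tf_idf               # was df.set_value(index, 'tf_idf', tf_idf)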
Example #5
    def set_targets_from_file(self, filename = None, year = None):
        '''
        Loads targets from file and display them in the frame
        '''

        if year is None:
            year = str(CONF.get('simulation', 'datesim').year)

        if filename is None:
            fname = "actualisation_groups.h5"
            data_dir = CONF.get('paths', 'data_dir')
            filename = os.path.join(data_dir, fname)

        store = HDFStore(filename)

        # Builds openfisca variables from irpp declaration variables
        df_c = store["corresp"]
        of_vars = dict()
        for col in df_c.columns:
            of_vars[col] = list(df_c[col].dropna().unique())

        df_a = store['amounts']
        df_b = store['benef']
        store.close()

        df_a1 = DataFrame({'amount': df_a[year]})

        df_a = DataFrame(columns=['amount'])

        for of_var, declar_vars_list in of_vars.iteritems():
            amount = 0
            for case in declar_vars_list:
                a = df_a1.get_value(case, 'amount')
                if a == a:  # NaN != NaN, so this skips missing values
                    amount += a
            df_a1.drop(declar_vars_list, axis=0, inplace=True)
            row = DataFrame(dict(amount=[amount]), index=[of_var])
            df_a = df_a.append(row)

        df_a = df_a.append(df_a1)

        self.vars_df = df_a
        self.vars_df.index.names = ['var']
        self.fill_vars()
        self.fill_coeffs()
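On current pandas, where DataFrame.append was removed in 2.0 and dict iteration is .items(), the accumulation loop would collect rows and concatenate once. A sketch, assuming pandas is imported as pd:

rows = []
for of_var, declar_vars_list in of_vars.items():
    amount = df_a1.loc[declar_vars_list, 'amount'].sum()   # .sum() skips NaN
    df_a1 = df_a1.drop(declar_vars_list, axis=0)
    rows.append(DataFrame({'amount': [amount]}, index=[of_var]))
df_a = pd.concat(rows + [df_a1])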
Example #6
    def _get_markup(self, para, markup_dict=None):
        """get markup """
        if not markup_dict:
            markup_dict = self._MARKUPS
            
        df = DataFrame(markup_dict, index=['Enter', 'Exit']).T
        df['In'] = False

        sects = []
        place = 0
        while place > -1:
            place = -1
            markup = None
            estr = None
            for mark, enter in df[df.In == False].Enter.iteritems():
                find = para.find(enter)
                if find > -1 and (find<=place or place==-1):
                    if find == place and len(enter) < len(estr):
                        continue
                    place = find
                    markup = mark
                    estr = enter
            for mark, exit in df[df.In == True].Exit.iteritems():
                find = para.find(exit)
                if find > -1 and (find<=place or place==-1):
                    if find == place and len(exit) < len(estr):
                        continue
                    place = find
                    markup = mark
                    estr = exit
        
            if place > -1:
                sects.append([para[:place], df[df.In==True].index.tolist()])
                df.set_value(markup, 'In', not df.get_value(markup, 'In'))
                para = para[place+len(estr):]

        if df.In.any():
            raise ValueError(
                'the markup does not exit from:\n{}'.format(df[df.In == True]))
            
        sects.append([para, []])
                         
        return sects
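Series.iterkv and the get_value/set_value pair used here are long gone; a sketch of how the scan and the toggle map onto current pandas:

for mark, enter in df.loc[~df.In, 'Enter'].items():     # was df[df.In==False].Enter.iterkv()
    ...
df.at[markup, 'In'] = not df.at[markup, 'In']           # was set_value(..., not get_value(...))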
Example #7
#%%
d['a']        # Series, single column
#%%
d[['a','c']]  # DataFrame, multiple columns
#%%
d[:5]         # DataFrame, rows
#%%
d.ix[:5]      # position-based, rows
#%%
d1.ix[:5]     # label-based, rows
#%%
d.irow(0)     # Series
#%%
d.icol(0)     # Series
#%%
d.get_value('e','a')    # get_value(row_name, col_name)
#%% Force position-based element access
d.iget_value(0,1)       # iget_value(irow, icol)

#%% Filtering by condition
d[d>5]
#%%
d[d.a>5]
#%%
d[(d>5)&(d%3==0)]

#%% What conditional filtering actually evaluates
d>5       # DataFrame
#%%
d.a>5     # Series
#%% You can also build such a Series yourself
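All of the scalar and positional accessors in this cell have since been removed from pandas; their modern equivalents, assuming the same frames d and d1:

#%%
d.iloc[:5]          # was d.ix[:5] (position-based rows)
d1.loc[:5]          # was d1.ix[:5] (label-based rows)
d.iloc[0]           # was d.irow(0)
d.iloc[:, 0]        # was d.icol(0)
d.at['e', 'a']      # was d.get_value('e', 'a')
d.iat[0, 1]         # was d.iget_value(0, 1)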
Example #8
class BuildLda:
    def __init__(self, print_list=True):
        # Create dictionary
        self.dictionary = Dictionary()
        self.topics = ['Topic {}'.format(i) for i in range(1,31)]
        self.print_list = print_list

    def build_object(self):
        self.build_model()
        self.transform_set()
        self.build_nearest_neighbours()

    def build_model(self):
        if self.print_list:
            print('Building LDA')
        strings = JobDescription.objects.values('url', 'body')

        data_samples = []
        seen_strings = set()
        for string in strings:
            if string['body'] not in seen_strings:
                seen_strings.add(string['body'])
                data_samples.append({'url': string['url'], 'string': self.dictionary.clean_string(string['body'])})

        self.data_samples = DataFrame(data_samples)

        n_features = 10000
        n_topics = 15
        n_top_words = 10
        max_iter = 40

        self.tf_vectorizer = CountVectorizer(max_features=n_features,
                                        stop_words='english')

        tf = self.tf_vectorizer.fit_transform(self.data_samples['string'])

        self.lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=max_iter,
                                        learning_method='online')

        self.lda.fit(tf)

        if self.print_list:
            print()
            print("\nTopics in LDA model:")
        tf_feature_names = self.tf_vectorizer.get_feature_names()
        self.create_word_topics(self.lda, tf_feature_names)
        if self.print_list:
            self.print_top_words(self.lda, tf_feature_names, n_top_words)

    def test_single_doc(self, string):
        data_samples = DataFrame([{'string': self.dictionary.clean_string(string)}])
        test = self.tf_vectorizer.transform(data_samples['string'])
        lda_result = self.lda.transform(test)
        top_tags = []
        return_value = {'lda_result': lda_result, 'tags': []}
        index_set = sorted(range(len(lda_result[0])), key=lambda i: lda_result[0][i], reverse=True)
        position = 0
        for index in index_set:
            return_value['tags'].append({'tag': self.topics[index], 'position': position, 'score': lda_result[0][index]})
            top_tags.append(self.topics[index])
            position += 1
        return return_value

    def transform_set(self):
        if self.print_list:
            print('Getting LDA Transformation')
        vectorizor_data = self.tf_vectorizer.transform(self.data_samples['string'])
        self.results = self.lda.transform(vectorizor_data)

    def build_nearest_neighbours(self):
        if self.print_list:
            print('Build Nearest Neighbours')
        self.nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(self.results)

    def get_neighbours(self, string, verbose=False):
        return_result = self.test_single_doc(string)
        return_result['distances'], return_result['indices'] = self.nbrs.kneighbors(return_result['lda_result'])

        if verbose:
            self.print_neighbours(return_result['indices'][0])
        return_result['neighbours'] = self.return_neighbours(return_result['indices'][0], return_result['distances'][0])

        return {'tags': return_result['tags'], 'neighbours': return_result['neighbours']}

    def print_neighbours(self, indices):
        print('Closest 10 jobs:')
        for index in indices:
            url = self.data_samples.get_value(index, 'url')
            print('http://www.seek.com.au%s' % url)

    def return_neighbours(self, indices, distances):
        return_value = []
        for index in range(len(indices)):
            url = self.data_samples.get_value(indices[index], 'url')
            return_value.append({'url': 'http://www.seek.com.au{}'.format(url), 'distance': distances[index]})
        return return_value

    def print_top_words(self, model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print(self.topics[topic_idx]+": "+" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

    def create_word_topics(self, model, feature_names):
        for topic_idx, topic in enumerate(model.components_):
            self.topics[topic_idx] = "_".join([feature_names[i] for i in topic.argsort()[:-3 - 1:-1]])
Example #9
class Reuters:
    def __init__(self,
                 data_root='./reuters21578/',
                 topic_file_name='all-topics-strings.lc.txt'):
        self._data_root = data_root
        self._topic_file_name = topic_file_name
        self._number_of_files = 22
        self._file_name_prefix = 'reut2-'
        self._create_stat_template()
        self._data = DataFrame(columns=['Id', 'Topic', 'Set', 'Body', 'TFIDF'])

    def _create_stat_template(self):
        topics_stats = []
        with open(self._data_root + self._topic_file_name, 'r') as topic_file:
            for topic in topic_file.readlines():
                topic = topic.strip()
                topics_stats.append([topic, 'TRAIN', 0])
                topics_stats.append([topic, 'TEST', 0])
                topics_stats.append([topic, 'NOT-USED', 0])
                topics_stats.append([topic, 'USABLE', 0])

        self._article_stats = DataFrame(data=topics_stats,
                                        columns=['Topic', 'Set', 'Count'])

    def _remove_tags(self, text: str):
        return re.sub('<[^<]+?>', '', text).strip()

    def _update_stats_field(self, topic: str, set_class: str):
        idx = self._article_stats[
            (self._article_stats.Topic == topic)
            & (self._article_stats.Set == set_class)].index[0]
        f = self._article_stats.get_value(idx, 'Count')
        self._article_stats.set_value(idx, 'Count', f + 1)

    def _update_stats(self, topic: str, set_class: str):
        self._update_stats_field(topic, set_class)
        if set_class in ['TEST', 'TRAIN']:
            self._update_stats_field(topic, 'USABLE')

    def _unescape(self, text: str):
        return saxutils.unescape(text)

    def _newslines(self):
        # for i in range(1):
        for i in range(self._number_of_files):
            file_id = '00' + str(i) if i < 10 else '0' + str(i)
            print("processing file {}".format(file_id))
            with open(
                    self._data_root + self._file_name_prefix + file_id +
                    '.sgm', 'r') as file:
                content = BeautifulSoup(file.read().lower())
                for newsline in content('reuters'):
                    yield newsline

    def _matrix_to_list(self, data: list):
        return [np.squeeze(np.asarray(item.todense())) for item in data]

    def get_news_stats(self, mode='offline') -> DataFrame:
        """
        Returns stats on the number of available news items for each set
        :param mode: if 'offline', loads stats from disk
        :return: dataframe of stats with ['Topic', 'Set', 'Count'] as columns
        """
        if mode == 'offline':
            self._article_stats = DataFrame.from_csv('news_stats.csv',
                                                     sep='\t')
            return self._article_stats

        for newsline in self._newslines():
            set_class = newsline.attrs['lewissplit'].upper()
            topics = newsline.topics.contents
            for topic in topics:
                topic_cleaned = self._remove_tags(str(topic)).strip()
                self._update_stats(topic=topic_cleaned, set_class=set_class)

        return self._article_stats

    def load_data(self):
        """
        Loads all the data from txt files to dataframe
        :return:
        """
        for newsline in self._newslines():
            document_id = newsline['newid']

            set_class = newsline.attrs['lewissplit'].upper()
            if set_class not in ['TRAIN', 'TEST']:
                continue

            # News text
            document_body = self._remove_tags(str(
                newsline('text')[0].text)).replace('reuter\n&#3;',
                                                   '').replace('\t', ' ')
            document_body = self._unescape(document_body.strip())

            # News topics
            topics = newsline.topics.contents
            for topic in topics:
                topic_cleaned = self._remove_tags(str(topic)).strip()
                self._data = self._data.append(
                    {
                        'Id': document_id,
                        'Topic': topic_cleaned,
                        'Set': set_class,
                        'Body': document_body,
                        'TFIDF': []
                    },
                    ignore_index=True)

    def get_all_train(self):
        """
        Return all TRAIN data to calculate TFIDF
        :return:
        """
        return self._data[self._data.Set == 'TRAIN'].Body.values.tolist()

    def add_tfidf(self, tfidf_classifier: TFIDFClassifier):
        """
        Adds TFIDF to the dataframe to avoid multiple calculation of TFIDF
        :param tfidf_classifier: TFIDF classifier that can do word-to-tfidf conversion
        """
        document_matrix = tfidf_classifier.to_tfidf(
            self._data.Body.values.tolist())

        for index, row in enumerate(document_matrix):
            self._data.set_value(index, 'TFIDF', document_matrix[index])

    def get_data(self, topic: str, set_class: str):
        """
        Get the vectorized TFIDF equivalents of news items belonging to a specific topic and set
        :param topic: news topic, e.g. acq
        :param set_class: TRAIN or TEST
        :return: list of TFIDF in csr_matrix form
        """
        positive_examples = self._data[
            (self._data.Topic == topic)
            & (self._data.Set == set_class)].TFIDF.values.tolist()
        negative_examples = self._data[
            (self._data.Topic != topic)
            & (self._data.Set == set_class)].TFIDF.values.tolist()
        all_examples = self._matrix_to_list(positive_examples +
                                            negative_examples)

        labels = [0] * (len(positive_examples) + len(negative_examples))
        labels[0:len(positive_examples)] = [1] * len(positive_examples)
        return all_examples, labels
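Two more long-removed APIs appear in this class: DataFrame.from_csv (gone in pandas 1.0, use pd.read_csv) and the per-topic DataFrame.append (gone in 2.0). A sketch of the replacements, assuming pandas is imported as pd; topic_rows is a hypothetical accumulator name:

self._article_stats = pd.read_csv('news_stats.csv', sep='\t', index_col=0)

topic_rows = [{'Id': document_id, 'Topic': self._remove_tags(str(t)).strip(),
               'Set': set_class, 'Body': document_body, 'TFIDF': []}
              for t in topics]
self._data = pd.concat([self._data, pd.DataFrame(topic_rows)], ignore_index=True)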
Example #10
class DataFrameModel(QAbstractTableModel):
    ''' data model for a DataFrame class '''
    def __init__(self):
        super(DataFrameModel, self).__init__()
        self.df = DataFrame()

    def setDataFrame(self, dataFrame):
        self.df = dataFrame

    def signalUpdate(self):
        ''' tell viewers to update their data (this is a full update, not
        efficient)'''
        self.layoutChanged.emit()

    #------------- table display functions -----------------
    def headerData(self, section, orientation, role=Qt.DisplayRole):
        if role != Qt.DisplayRole:
            return QVariant()

        if orientation == Qt.Horizontal:
            try:
                return self.df.columns.tolist()[section]
            except (IndexError, ):
                return QVariant()
        elif orientation == Qt.Vertical:
            try:
                # return self.df.index.tolist()
                return self.df.index.tolist()[section]
            except (IndexError, ):
                return QVariant()

    def data(self, index, role=Qt.DisplayRole):

        if role != Qt.DisplayRole:
            return QVariant()

        if not index.isValid():
            return QVariant()

        # gde 2014.02.19 - original implementation only worked
        # if there were no missing indices.  Instead use get_value

        #return QVariant(str(self.df.ix[index.row(), index.column()]))
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        str_value = str(self.df.get_value(row, col))
        return QVariant(str_value)

    def flags(self, index):
        flags = super(DataFrameModel, self).flags(index)
        flags |= Qt.ItemIsEditable
        return flags

    def setData(self, index, value, role):
        row = self.df.index[index.row()]
        col = self.df.columns[index.column()]
        if hasattr(value, 'toPyObject'):
            # PyQt4 gets a QVariant
            value = value.toPyObject()
        else:
            # PySide gets a unicode string
            dtype = self.df[col].dtype
            if dtype != object:
                value = None if value == '' else dtype.type(value)
        self.df.set_value(row, col, value)
        return True

    def rowCount(self, index=QModelIndex()):
        return self.df.shape[0]

    def columnCount(self, index=QModelIndex()):
        return self.df.shape[1]
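The same model works against current pandas by swapping the removed scalar accessors for .at; a sketch of the two touch points:

        str_value = str(self.df.at[row, col])   # in data(), was get_value(row, col)
        self.df.at[row, col] = value            # in setData(), was set_value(row, col, value)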
Example #11
class Scores(AnnotationMixin, object):
    """

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------

        """
        A = cls(uri=uri, modality=modality)
        A._df = pivot_table(
            df, values=PYANNOTE_SCORE,
            rows=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], cols=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )
        return A

    def __init__(self, uri=None, modality=None):
        super(Scores, self).__init__()

        index = MultiIndex(
            levels=[[], []], labels=[[], []],
            names=[PYANNOTE_SEGMENT, PYANNOTE_TRACK]
        )

        self._df = DataFrame(index=index, dtype=np.float64)
        self.modality = modality
        self.uri = uri
        self._timelineHasChanged = True

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self._df = self._df.drop(segment, axis=0)
            self._timelineHasChanged = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self._df = self._df.drop((segment, track), axis=0)
            self._timelineHasChanged = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        segment, track, label = key
        return self._df.get_value((segment, track), label)

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return {l: self._df.get_value((segment, track), l) for l in self._df}

    # scores[segment, track, label] = value
    def __setitem__(self, key, value):
        segment, track, label = key
        self._df = self._df.set_value((segment, track), label, value)
        self._timelineHasChanged = True

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self._df.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def itervalues(self):
        """Iterate over annotation as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._df = self._df.sort_index()

        # yield one (segment, track, label) tuple per loop
        labels = self._df.columns
        for (segment, track), columns in self._df.iterrows():
            for label in labels:
                value = columns[label]
                if np.isnan(value):
                    continue
                else:
                    yield segment, track, label, value

    def _rank(self, invert):

        if invert:
            direction = 1.
        else:
            direction = -1.

        def nan_rank(data):

            # replace NaN by -inf or +inf depending on the requested direction
            finite = np.isfinite(data)
            fixed = np.where(finite, direction*data, -direction*np.inf)

            # do the actual argsort
            indices = np.argsort(fixed)
            # get rank from argsort
            rank = np.argsort(indices)

            # special treatment for inverted NaN scores
            # (we want ranks to start at 0 even in case of NaN)
            if invert:
                rank = np.where(finite, rank-(len(data)-np.sum(finite)), np.nan)
            else:
                rank = np.where(finite, rank, np.nan)
            return rank

        return self._df.apply(nan_rank, axis=1)

    def rank(self, invert=False):
        """

        Parameters
        ----------
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        rank : `Scores`

        """
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._rank(invert)
        return A

    def nbest(self, n, invert=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """
        df = self._df.copy()
        nbest = self._rank(invert) < n
        df[~nbest] = np.nan

        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = df

        return A

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._df[list(labels)]

        return A

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.
        """

        annotation = Annotation(uri=self.uri, modality=self.modality)
        if not self:
            return annotation

        best = self.nbest(1, invert=False)

        if posterior:

            # compute unknown posterior
            func = lambda p: 1. - np.nansum(p, axis=1)
            Pu = self.apply(func, new_columns=['_'])

            # threshold best target posterior
            # with unknown posterior and threshold
            for segment, track, label, value in best.itervalues():

                if value < Pu[segment, track, '_'] or value < threshold:
                    label = Unknown()

                annotation[segment, track] = label

        else:

            # threshold best target score with threshold
            for segment, track, label, value in best.itervalues():
                if value < threshold:
                    label = Unknown()
                annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = func(self._df)
        return A

    def apply(self, data_func, new_index=None, new_columns=None):
        """Apply `data_func` on internal numpy array

        Parameters
        ----------
        data_func : func
            Function expecting (index x columns) numpy array as input
        new_index : iterable, optional
            When provided, these will be the index of returned array.
        new_columns : iterable, optional
            When provided, these will be the columns of returned array.
        """
        new_data = data_func(self._df.values)

        if new_index is None:
            new_index = self._df.index

        if new_columns is None:
            new_columns = self._df.columns

        df = DataFrame(
            data=new_data,
            index=new_index,
            columns=new_columns)

        new_scores = self.__class__(uri=self.uri, modality=self.modality)
        new_scores._df = df

        return new_scores

    def _repr_png_(self):
        from pyannote.core.notebook import repr_scores
        return repr_scores(self)
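Two signatures here predate modern pandas: pivot_table's rows/cols keywords became index/columns in 0.14, and set_value no longer exists, so __setitem__ would mutate in place with .loc (which enlarges the frame on assignment) instead of reassigning set_value's return value. A sketch:

        A._df = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )

        self._df.loc[(segment, track), label] = value   # was self._df = self._df.set_value(...)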
Example #12
class Aggregates(object):
    filter_by = None
    labels = collections.OrderedDict((
        ('var', u"Mesure"),
        ('entity', u"Entité"),
        ('dep', u"Dépenses\n(millions d'€)"),
        ('benef', u"Bénéficiaires\n(milliers)"),
        ('dep_default', u"Dépenses initiales\n(millions d'€)"),
        ('benef_default', u"Bénéficiaires\ninitiaux\n(milliers)"),
        ('dep_real', u"Dépenses\nréelles\n(millions d'€)"),
        ('benef_real', u"Bénéficiaires\nréels\n(milliers)"),
        ('dep_diff_abs', u"Diff. absolue\nDépenses\n(millions d'€)"),
        ('benef_diff_abs', u"Diff absolue\nBénéficiaires\n(milliers)"),
        ('dep_diff_rel', u"Diff. relative\nDépenses"),
        ('benef_diff_rel', u"Diff. relative\nBénéficiaires"),
    ))  # TODO: localize
    show_default = False
    show_diff = True
    show_real = True
    survey_scenario = None
    totals_df = None
    varlist = None

    def __init__(self, survey_scenario=None):
        if survey_scenario is not None:
            self.set_survey_scenario(survey_scenario)

    def clear(self):
        self.totals_df = None

    def compute(self):
        """
        Compute the whole table
        """
        self.compute_aggregates(self.filter_by)
        self.load_amounts_from_file()
        self.compute_real()
        self.compute_diff()

    def compute_aggregates(self, filter_by=None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {
            'data': self.labels['dep'],
            'default': self.labels['dep_default']
        }
        B_label = {
            'data': self.labels['benef'],
            'default': self.labels['benef_default']
        }

        for var in self.varlist:
            # amounts and beneficiaries from current data and default data if exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural

            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.iteritems():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except KeyError:
                pass

    def compute_diff(self):
        '''
        Computes and adds relative differences
        '''

        dep = self.aggr_frame[self.labels['dep']]
        benef = self.aggr_frame[self.labels['benef']]

        if self.show_default:
            ref_dep_label, ref_benef_label = self.labels[
                'dep_default'], self.labels['benef_default']
            if ref_dep_label not in self.aggr_frame:
                return
        elif self.show_real:
            ref_dep_label, ref_benef_label = self.labels[
                'dep_real'], self.labels['benef_real']
        else:
            return

        ref_dep = self.aggr_frame[ref_dep_label]
        ref_benef = self.aggr_frame[ref_benef_label]

        self.aggr_frame[self.labels['dep_diff_rel']] = (dep -
                                                        ref_dep) / abs(ref_dep)
        self.aggr_frame[self.labels['benef_diff_rel']] = (
            benef - ref_benef) / abs(ref_benef)
        self.aggr_frame[self.labels['dep_diff_abs']] = dep - ref_dep
        self.aggr_frame[self.labels['benef_diff_abs']] = benef - ref_benef

    def compute_real(self):
        '''
        Adds administrative data to dataframe
        '''
        if self.totals_df is None:
            return
        A, B = [], []
        for var in self.varlist:
            # totals from administrative data
            if var in self.totals_df.index:
                A.append(self.totals_df.get_value(var, "amount"))
                B.append(self.totals_df.get_value(var, "benef"))
            else:
                A.append(nan)
                B.append(nan)
        self.aggr_frame[self.labels['dep_real']] = A
        self.aggr_frame[self.labels['benef_real']] = B

    def create_description(self):
        '''
        Creates a description dataframe
        '''
        now = datetime.now()
        return DataFrame([
            u'OpenFisca',
            u'Calculé le %s à %s' %
            (now.strftime('%d-%m-%Y'), now.strftime('%H:%M')),
            u'Système socio-fiscal au %s' % self.simulation.period.start,
            u"Données d'enquêtes de l'année %s" %
            str(self.simulation.input_table.survey_year),
        ])

    def get_aggregate(self, variable, filter_by=None):
        """
        Returns aggregate spending, and number of beneficiaries
        for the relevant entity level

        Parameters
        ----------
        variable : string
                   name of the variable aggregated according to its entity
        """
        simulation = self.simulation
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        column = column_by_name[variable]
        weight_name = self.weight_column_name_by_entity_key_plural[
            column.entity_key_plural]
        filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
        # amounts and beneficiaries from current data and default data if exists
        # Build weights for each entity
        data = DataFrame({
            variable: simulation.calculate(variable),
            weight_name: simulation.calculate(weight_name),
        })
        data_default = None

        datasets = {'data': data}
        if data_default is not None:
            datasets['default'] = data_default
        filter_indicator = True
        if filter_by:
            filtered_data = DataFrame({
                variable:
                simulation.calculate(variable),
                weight_name:
                simulation.calculate(weight_name),
                filter_by_name:
                simulation.calculate(filter_by_name),
            })
            data_default = None
            filter_indicator = filtered_data[filter_by_name]
        m_b = {}

        weight = data[weight_name] * filter_indicator
        for name, data in datasets.iteritems():
            amount = data[variable]
            benef = data[variable].values != 0
            try:
                total_amount = int(round(sum(amount * weight) / 10**6))
            except:
                total_amount = nan
            try:
                total_benef = int(round(sum(benef * weight) / 10**3))
            except:
                total_benef = nan

            m_b[name] = [total_amount, total_benef]

        return m_b

    def load_amounts_from_file(self, filename=None, year=None):
        '''
        Loads totals from files
        '''
        if year is None:
            year = self.year
        if filename is None:
            data_dir = DATA_DIR

        try:
            filename = os.path.join(data_dir, "amounts.h5")
            store = HDFStore(filename)

            df_a = store['amounts']
            df_b = store['benef']
            store.close()
            self.totals_df = DataFrame(data={
                "amount": df_a[year] / 10**6,
                "benef": df_b[year] / 1000,
            })
            row = DataFrame({'amount': nan, 'benef': nan}, index=['logt'])
            self.totals_df = self.totals_df.append(row)

            # Add some additional totals
            for col in ['amount', 'benef']:
                # Deals with logt
                logt = 0
                for var in ['apl', 'alf', 'als']:
                    logt += self.totals_df.get_value(var, col)
                self.totals_df.set_value('logt', col, logt)

                # Deals with rsa rmi
                rsa = 0
                for var in ['rmi', 'rsa']:
                    rsa += self.totals_df.get_value(var, col)
                self.totals_df.set_value('rsa', col, rsa)

                # Deals with irpp, csg, crds
                for var in ['irpp', 'csg', 'crds', 'cotsoc_noncontrib']:
                    if col in ['amount']:
                        val = -self.totals_df.get_value(var, col)
                        self.totals_df.set_value(var, col, val)
        except:
            #  raise Exception(" No administrative data available for year " + str(year))
            import warnings
            warnings.warn(
                "No administrative data available for year %s in file %s" %
                (str(year), filename))
            self.totals_df = None
            return

    def save_table(self, directory=None, filename=None, table_format=None):
        '''
        Saves the table to some format
        '''
        now = datetime.now()
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'

        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'),
                                             table_format)

        fname = os.path.join(directory, filename)

        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index=False, header=True)
                descr = self.create_description()
                descr.to_excel(writer,
                               "description",
                               index=False,
                               header=False)
                writer.save()
            elif table_format == "csv":
                df.to_csv(fname, "aggregates", index=False, header=True)
        except Exception as e:
            raise Exception("Aggregates: Error saving file", str(e))
Example #13
import pandas as pd
from pandas import DataFrame
import numpy as np

# load the datasets
names = ['dealerId', 'gender']
df = DataFrame(pd.read_csv("./GenderPythonTest.csv", sep=',', names=names))

# Probabilistic network build-up
bayesNetwork = [[], [], []]
counter = -1
for idx in df['dealerId'].index:
    featureName = df.get_value(idx, 'dealerId')
    if featureName not in bayesNetwork[0]:
        if counter != -1 and bayesNetwork[1][counter] + bayesNetwork[2][
                counter] == 0:
            # extreme case: the data is faulty, so overwrite the previous feature
            bayesNetwork[0][counter] = featureName
        else:
            bayesNetwork[0].append(featureName)
            bayesNetwork[1].append(0)
            bayesNetwork[2].append(0)
            counter += 1
    gender = df.get_value(idx, 'gender')
    if gender == 'male':
        bayesNetwork[1][counter] += 1
    elif gender == 'female':
        bayesNetwork[2][counter] += 1

# For visualization
viz_data = {
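The tallying loop amounts to a contingency table; a minimal sketch with pd.crosstab, assuming both genders occur in the file (note crosstab sorts dealerIds and drops the faulty-row overwrite special case):

counts = pd.crosstab(df['dealerId'], df['gender'])   # rows: dealerId, cols: gender
bayesNetwork = [counts.index.tolist(),
                counts['male'].tolist(),
                counts['female'].tolist()]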
Example #14
class Instancia:
    '''
    Class that implements an instance of the scenario
    in which the vehicle-route optimization
    problem is solved
    '''
    def __init__(self, conf, name_file):
        '''
        Receives the instance file name to initialize the class
        '''

        instancia_path = path.join(conf['instancia_path'], name_file)
        with open(instancia_path, 'r') as file:
            d = loads(file.read())
        self.instancia_path = instancia_path
        self.__requests = d['requests']
        self.__static_data = d['static_data']
        self.__service_time = d['static_data']['service_time']
        self.__df_total = DataFrame(self.__requests)
        self.n = len(self.__requests)
        self.m = d['static_data']['number_of_vehicles']
        self.Q = d['static_data']['max_vehicle_capacity']
        self.T = d['static_data']['total_time']
        self.u_mean = d['static_data']['urgency_mean']
        self.u_std = d['static_data']['urgency_std']
        self.dynamism = d['static_data']['dynamism']
        self.dp_ratio = d['static_data']['dp_ratio']
        self.priori_ratio = d['static_data']['priori_ratio']

        self.deposito_x = 0
        self.deposito_y = 0

    def __get_distance(self, fonte, antro):
        '''
        Generic distance function between two points,
            using data from a DataFrame
        '''
        x1 = self.__df_total.get_value(fonte - 1, "service_point_x")
        y1 = self.__df_total.get_value(fonte - 1, "service_point_y")
        x2 = self.__df_total.get_value(antro - 1, "service_point_x")
        y2 = self.__df_total.get_value(antro - 1, "service_point_y")
        return round(sqrt((x1 - x2)**2 + (y1 - y2)**2), 2)

    def __get_distance_deposito(self, local):
        '''
        Generic distance function between a point
            and the depot location,
            using data from a DataFrame
        '''
        x1 = self.__df_total.get_value(local - 1, "service_point_x")
        y1 = self.__df_total.get_value(local - 1, "service_point_y")
        x2 = self.deposito_x
        y2 = self.deposito_y
        return round(sqrt((x1 - x2)**2 + (y1 - y2)**2), 2)

    def __get_base(self, initial=None, offset=False, item=None):
        '''
        Generic helper to pull simple data from the json
        and build a dictionary
        '''
        if not initial:
            initial = {}
        for i, r in enumerate(self.__requests):
            if offset and type(offset) == int:
                index = i + 1 + offset
            else:
                index = i + 1
            if type(item) == int:
                initial[index] = item
            elif type(item) == str:
                initial[index] = r[item]
            else:
                pass
        return initial

    '''
    All functions of the form 'get_x(self)'
        return the static data of the problem.
    Substituting 'x' with the following options, we have:
        q -> passenger pickups/drop-offs per location
        s -> passenger boarding/alighting time at the location
        t -> desired service time of the request at the origins
        W -> maximum waiting time for service at the origins
        R -> maximum ride time for service at each origin
        O -> all request origins
        D -> all request destinations
        V -> all vertices of the created graph
        K -> all vehicles
    '''

    def get_q(self):
        d = {0: 0, 2 * self.n + 1: 0}
        d = self.__get_base(d, offset=False, item=1)
        return self.__get_base(d, offset=self.n, item=-1)

    def get_s(self):
        d = {0: 0, 2 * self.n + 1: 0}
        d = self.__get_base(d, offset=False, item=self.__service_time)
        return self.__get_base(d, offset=self.n, item=self.__service_time)

    def get_t(self):
        return self.__get_base(offset=False, item="desired_time")

    def get_W(self):
        return self.__get_base(offset=False, item="max_wait_time")

    def get_R(self):
        return self.__get_base(offset=False, item="max_ride_time")

    def get_O(self):
        return [o + 1 for o in range(self.n)]

    def get_D(self):
        return [o + self.n + 1 for o in range(self.n)]

    def get_V(self):
        return [0] + self.get_O() + self.get_D() + [2 * self.n + 1]

    def get_K(self):
        return range(self.m)

    def get_T(self):
        return self.T

    def get_urgency(self):
        return self.u_mean, self.u_std

    def get_dynamism(self):
        return self.dynamism

    def get_dp_ratio(self):
        return self.dp_ratio

    def get_priori_ratio(self):
        return self.priori_ratio

    def get_req(self):
        return self.__requests

    def get_static_data(self):
        return self.__static_data

    def get_tau(self):
        '''
        Returns a dictionary-like data structure, where:
            Index, a tuple (a,b), represents the arc between 'a' and 'b'
            Value, a float c, represents the travel time on the arc
        '''
        # A vehicle that stays parked has zero travel time
        tau = {(0, 2 * self.n + 1): 0}

        # Dataframes used to find requests
        #   and distinguish drops from picks
        df_total = DataFrame(self.__requests)
        df_drops = df_total.loc[df_total["service_type"] == "drop"]
        df_picks = df_total.loc[df_total["service_type"] == "pick"]
        id_drops = [x + 1 for x in list(df_drops.index.values)]
        id_picks = [x + 1 for x in list(df_picks.index.values)]
        pedidos = list(df_total.index.values)

        # A graph is a dictionary of lists
        graph = defaultdict(list)

        def addEdge(g, u, v):
            g[u].append(v)

        # For every request
        for pedido in pedidos:
            origem_pedido, destino_pedido = pedido + 1, pedido + 1 + self.n

            # Add arcs from the depot to the origin,
            #   from the destination to the depot,
            #   and between the origin and the destination
            addEdge(graph, 0, origem_pedido)
            addEdge(graph, destino_pedido, 2 * self.n + 1)
            addEdge(graph, origem_pedido, destino_pedido)
            addEdge(graph, destino_pedido, origem_pedido)

            # For every other request
            for outro_pedido in pedidos:
                # (skip if it is the same one)
                if pedido == outro_pedido:
                    pass
                else:
                    origem_outro_pedido, destino_outro_pedido = outro_pedido + 1, outro_pedido + 1 + self.n

                    # Add arcs between the origins,
                    #   between the destinations,
                    #   between the first's origin and the second's destination,
                    #   and between the first's destination and the second's origin
                    addEdge(graph, origem_pedido, origem_outro_pedido)
                    addEdge(graph, destino_pedido, destino_outro_pedido)
                    addEdge(graph, origem_pedido, destino_outro_pedido)
                    addEdge(graph, destino_pedido, origem_outro_pedido)

        def genEdge(graph):
            edges = []
            for node in graph:
                for neighbour in graph[node]:
                    edges.append((node, neighbour))
            return edges

        # Arcs are tuples of nodes, named 'fonte' (source) and 'antro' (target)
        for arco in genEdge(graph):
            fonte = arco[0]
            antro = arco[1]

            fonte_depo = False
            antro_depo = False

            if (fonte == 0) or (fonte > self.n and fonte - self.n
                                in id_picks) or (fonte <= self.n
                                                 and fonte in id_drops):
                fonte_depo = True
            if antro == 2 * self.n + 1 or (antro > self.n and antro - self.n
                                           in id_picks) or (antro <= self.n and
                                                            antro in id_drops):
                antro_depo = True

            if fonte > self.n:
                fonte = fonte - self.n
            if antro > self.n:
                antro = antro - self.n

            if (fonte_depo, antro_depo) == (True, True):
                tau[arco] = 0
            elif (fonte_depo, antro_depo) == (True, False):
                tau[arco] = self.__get_distance_deposito(antro)
            elif (fonte_depo, antro_depo) == (False, True):
                tau[arco] = self.__get_distance_deposito(fonte)
            else:
                tau[arco] = self.__get_distance(fonte, antro)
        return tau

    def get_pos_requests(self):
        '''
        Returns a list of tuples representing the requests,
        including the following ordered data:
            request id (int)
            request x position (float)
            request y position (float)
            request type (str)
            desired time (int)
        Method used when creating images that present the
        obtained results
        '''
        data = []
        columns = [
            "service_point_x", "service_point_y", "service_type",
            "desired_time"
        ]
        for i, r in enumerate(list(self.__df_total[columns].values)):
            id_pedido = i + 1
            x, y, t, d = float(r[0]), float(r[1]), str(r[2]), int(r[3])
            data.append((id_pedido, x, y, t, d))
        return data
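With the default RangeIndex of __df_total, the get_value reads in __get_distance are plain label lookups; on current pandas they would be .at (or .iat by position). A sketch of the method body:

        x1 = self.__df_total.at[fonte - 1, "service_point_x"]
        y1 = self.__df_total.at[fonte - 1, "service_point_y"]
        x2 = self.__df_total.at[antro - 1, "service_point_x"]
        y2 = self.__df_total.at[antro - 1, "service_point_y"]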
Example #15
    def fit(self, annotations):
        """

        Parameters
        ----------
        annotations : (Annotation, Annotation) iterator

        Returns
        -------


        """

        # possible_match[n, m] is the total possible match duration
        # when there are n A-tracks & m B-tracks
        possible_match = DataFrame()

        # actual_match[n, m] is the total actual match duration
        # when there are n A-tracks & m B-tracks
        actual_match = DataFrame()

        # overlap[n, m] is the total duration
        # when there are n A-tracks & m B-tracks
        overlap = DataFrame()

        for n, (A, B) in enumerate(annotations):

            assert isinstance(A, Annotation), "%r is not an Annotation" % A
            assert isinstance(B, Annotation), "%r is not an Annotation" % B
            if n == 0:
                self.modalityA = A.modality
                self.modalityB = B.modality
            else:
                assert A.modality == self.modalityA, \
                    "bad modality (%r, %r)" % (self.modalityA, A.modality)
                assert B.modality == self.modalityB, \
                    "bad modality (%r, %r)" % (self.modalityB, B.modality)
            assert A.uri == B.uri, \
                "resource mismatch (%r, %r)" % (A.uri, B.uri)

            timeline, a, b = self._AB2ab(A, B)

            for segment in timeline:

                duration = segment.duration

                # number of tracks
                atracks = a.tracks(segment)
                Na = len(atracks)
                btracks = b.tracks(segment)
                Nb = len(btracks)

                if Na == 0 or Nb == 0:
                    continue

                # number of matching tracks
                N = len(a.get_labels(segment) & b.get_labels(segment))

                # increment possible_match & actual_match
                try:
                    p_m = possible_match.get_value(Na, Nb)
                    a_m = actual_match.get_value(Na, Nb)
                    ovl = overlap.get_value(Na, Nb)
                except Exception:
                    p_m = 0.
                    a_m = 0.
                    ovl = 0.

                possible_match = possible_match.set_value(Na, Nb,
                                                          p_m + min(Na, Nb)*duration)
                actual_match = actual_match.set_value(Na, Nb,
                                                      a_m + N*duration)
                overlap = overlap.set_value(Na, Nb, ovl + duration)
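On current pandas the try/except bookkeeping maps onto .at for reads (KeyError for missing cells) and .loc for writes, which enlarges the frame on assignment. A sketch with a hypothetical helper _get:

def _get(df, n, m):
    # a missing cell (new n/m combination) counts as zero
    try:
        v = df.at[n, m]
        return 0. if v != v else v    # NaN != NaN
    except KeyError:
        return 0.

possible_match.loc[Na, Nb] = _get(possible_match, Na, Nb) + min(Na, Nb) * duration
actual_match.loc[Na, Nb] = _get(actual_match, Na, Nb) + N * duration
overlap.loc[Na, Nb] = _get(overlap, Na, Nb) + duration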
Example #16
for i in range(row):
    for j in range(col):
        myList = matrix[i][j]
        #print myList

        list_iter = 0
        current = datetime.combine(start_day, start_time)

        while current < end:
            no_crimes = 0
            day_end = current + delta

            while list_iter < len(myList) and myList[list_iter] < day_end:
                no_crimes += 1
                list_iter += 1
            temp = df.get_value(current.date(), 'freq') + no_crimes
            df.set_value(current.date(), 'freq', temp)
            day_matrix[i][j].append(no_crimes)

            current += delta
        count_matrix[i][j] = len(myList)
        total += len(myList)
#print count_matrix

#print(total)

#print day_matrix[2][2]
"""
for i in range(row):
	for j in range(col):
		print str(int(math.floor(count_matrix[i][j]))),
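The get_value/set_value pair in the inner loop is a single augmented assignment on current pandas:

            df.at[current.date(), 'freq'] += no_crimes   # was get_value + set_value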
Example #17
    def cal_portfolio_returns(self, cal_period=1, cal_type='momentum', index_type='300', return_type='period', cal_percent=[0.01]):
        # Determine the index type
        if index_type == '300':
            df_index = self.df_index_300
            index_prices = self.index_prices_300
        elif index_type == '500':
            df_index = self.df_index_500
            index_prices = self.index_prices_500
        elif index_type == '800':
            df_index = self.df_index_800
            index_prices = self.index_prices_800

        # Change the data type of the index info columns
        index_levels = pd.to_datetime(df_index.columns.get_level_values(0).unique())
        df_index.columns.set_levels(index_levels, level=0, inplace=True)

        # Compute the number of stocks in the portfolio
        cal_num = [int(float(index_type) * x) for x in cal_percent]

        # Get constituent-stock info
        df_code = df_index.xs(key='code', axis=1, level=1)

        # Compute each stock's return
        stock_returns = self.df_price.pct_change(cal_period)
        # Compute the index return
        index_total_returns = index_prices.pct_change(cal_period)
        period_list = stock_returns.index
        # Compute the strategy portfolio's returns
        portfolio_returns = DataFrame()
        index_cum_returns = Series()
        for i in range(cal_period, len(period_list) - cal_period, cal_period):
            # returns of the index constituents at the formation date
            cal_date = period_list[i]
            index_returns = stock_returns[df_code[cal_date]].ix[cal_date, :]
            # rank the constituent returns
            if cal_type == 'momentum':
                index_returns.sort_values(ascending=False, inplace=True)
            elif cal_type == 'crossovers':
                index_returns.sort_values(ascending=True, inplace=True)

            for num in cal_num:
                # pick the constituents with the highest (lowest) returns last period
                portfolio = index_returns[0:num].index
                # average return of the portfolio
                return_date = period_list[i + cal_period]
                portfolio_return = Series.mean(stock_returns[portfolio].ix[return_date, :])
                # per-period excess return
                if return_type == 'period':
                    excess_return = portfolio_return - index_total_returns[return_date]
                # cumulative excess return
                elif return_type == 'cum':
                    if i > cal_period:
                        index_return = index_cum_returns[cal_date] * (1 + index_total_returns[return_date])
                        cum_return = portfolio_returns.get_value(index=cal_date, col=num / float(index_type))
                        portfolio_return = (1 + portfolio_return) * (index_cum_returns[cal_date] + cum_return)
                    else:
                        index_return = 1 + index_total_returns[return_date]
                        portfolio_return = 1 + portfolio_return

                    excess_return = portfolio_return - index_return
                    index_cum_returns.set_value(label=return_date, value=index_return)

                portfolio_returns.set_value(index=return_date, col=num / float(index_type), value=excess_return)
        portfolio_returns.columns = [str(x * 100) + '%' for x in cal_percent]
        self.portfolio_returns = portfolio_returns
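
The keyword forms used above migrate the same way: Series.set_value(label=..., value=...) and DataFrame.get_value(index=..., col=...) / set_value(index=..., col=..., value=...) all become .at lookups with positional labels. A hedged sketch with illustrative dates and column keys, not values from the strategy itself:

import pandas as pd

index_cum_returns = pd.Series(dtype=float)
portfolio_returns = pd.DataFrame(dtype=float)
return_date, col = "2017-02-28", 0.01

index_cum_returns.at[return_date] = 1.02        # was Series.set_value(label=..., value=...)
portfolio_returns.at[return_date, col] = 0.003  # was DataFrame.set_value(index=..., col=..., value=...)
prev = portfolio_returns.at[return_date, col]   # was DataFrame.get_value(index=..., col=...)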
class Aggregates(object):
    filter_by = None
    labels = collections.OrderedDict((
        ('var', u"Mesure"),
        ('entity', u"Entité"),
        ('dep', u"Dépenses\n(millions d'€)"),
        ('benef', u"Bénéficiaires\n(milliers)"),
        ('dep_default', u"Dépenses initiales\n(millions d'€)"),
        ('benef_default', u"Bénéficiaires\ninitiaux\n(milliers)"),
        ('dep_real', u"Dépenses\nréelles\n(millions d'€)"),
        ('benef_real', u"Bénéficiaires\nréels\n(milliers)"),
        ('dep_diff_abs', u"Diff. absolue\nDépenses\n(millions d'€)"),
        ('benef_diff_abs', u"Diff absolue\nBénéficiaires\n(milliers)"),
        ('dep_diff_rel', u"Diff. relative\nDépenses"),
        ('benef_diff_rel', u"Diff. relative\nBénéficiaires"),
        ))  # TODO: localize
    show_default = False
    show_diff = True
    show_real = True
    survey_scenario = None
    totals_df = None
    varlist = None

    def __init__(self, survey_scenario = None):
        if survey_scenario is not None:
            self.set_survey_scenario(survey_scenario)

    def clear(self):
        self.totals_df = None

    def compute(self):
        """
        Compute the whole table
        """
        self.compute_aggregates(self.filter_by)
        self.load_amounts_from_file()
        self.compute_real()
        self.compute_diff()

    def compute_aggregates(self, filter_by = None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {'data': self.labels['dep'],
                   'default': self.labels['dep_default']}
        B_label = {'data': self.labels['benef'],
                   'default': self.labels['benef_default']}

        for var in self.varlist:
            # amounts and beneficiaries from current data and default data if exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural

            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.items():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except KeyError:
                # this label was not produced in the current run
                pass

    def compute_diff(self):
        '''
        Computes and adds relative differences
        '''

        dep = self.aggr_frame[self.labels['dep']]
        benef = self.aggr_frame[self.labels['benef']]

        if self.show_default:
            ref_dep_label, ref_benef_label = self.labels['dep_default'], self.labels['benef_default']
            if ref_dep_label not in self.aggr_frame:
                return
        elif self.show_real:
            ref_dep_label, ref_benef_label = self.labels['dep_real'], self.labels['benef_real']
        else:
            return

        ref_dep = self.aggr_frame[ref_dep_label]
        ref_benef = self.aggr_frame[ref_benef_label]

        self.aggr_frame[self.labels['dep_diff_rel']] = (dep - ref_dep) / abs(ref_dep)
        self.aggr_frame[self.labels['benef_diff_rel']] = (benef - ref_benef) / abs(ref_benef)
        self.aggr_frame[self.labels['dep_diff_abs']] = dep - ref_dep
        self.aggr_frame[self.labels['benef_diff_abs']] = benef - ref_benef

    def compute_real(self):
        '''
        Adds administrative data to dataframe
        '''
        if self.totals_df is None:
            return
        A, B = [], []
        for var in self.varlist:
            # totals from administrative data
            if var in self.totals_df.index:
                A.append(self.totals_df.get_value(var, "amount"))
                B.append(self.totals_df.get_value(var, "benef"))
            else:
                A.append(nan)
                B.append(nan)
        self.aggr_frame[self.labels['dep_real']] = A
        self.aggr_frame[self.labels['benef_real']] = B

    def create_description(self):
        '''
        Creates a description dataframe
        '''
        now = datetime.now()
        return DataFrame([
            u'OpenFisca',
            u'Calculé le %s à %s' % (now.strftime('%d-%m-%Y'), now.strftime('%H:%M')),
            u'Système socio-fiscal au %s' % self.simulation.period.start,
            u"Données d'enquêtes de l'année %s" % str(self.simulation.input_table.survey_year),
            ])

    def get_aggregate(self, variable, filter_by = None):
        """
        Returns aggregate spending, and number of beneficiaries
        for the relevant entity level

        Parameters
        ----------
        variable : string
                   name of the variable aggregated according to its entity
        """
        simulation = self.simulation
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        column = column_by_name[variable]
        weight_name = self.weight_column_name_by_entity_key_plural[column.entity_key_plural]
        filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
        # amounts and beneficiaries from current data and default data if exists
        # Build weights for each entity
        data = DataFrame(
            {
                variable: simulation.calculate_add(variable),
                weight_name: simulation.calculate(weight_name),
                }
            )
        data_default = None

        datasets = {'data': data}
        if data_default is not None:
            datasets['default'] = data_default
        filter_indicator = True
        if filter_by:
            filtered_data = DataFrame(
                {
                    variable: simulation.calculate(variable),
                    weight_name: simulation.calculate(weight_name),
                    filter_by_name: simulation.calculate(filter_by_name),
                    }
                )
            data_default = None
            filter_indicator = filtered_data[filter_by_name]
        m_b = {}

        weight = data[weight_name] * filter_indicator
        for name, data in datasets.items():
            amount = data[variable]
            benef = data[variable].values != 0
            try:
                total_amount = int(round(sum(amount * weight) / 10 ** 6))
            except (TypeError, ValueError):
                # the weighted sum is NaN and cannot be cast to int
                total_amount = nan
            try:
                total_benef = int(round(sum(benef * weight) / 10 ** 3))
            except (TypeError, ValueError):
                total_benef = nan

            m_b[name] = [total_amount, total_benef]

        return m_b

    def load_amounts_from_file(self, filename = None, year = None):
        '''
        Loads totals from files
        '''
        if year is None:
            year = self.year
        if filename is None:
            filename = os.path.join(DATA_DIR, "amounts.h5")

        try:
            store = HDFStore(filename)

            df_a = store['amounts']
            df_b = store['benef']
            store.close()
            self.totals_df = DataFrame(data = {
                "amount": df_a[year] / 10 ** 6,
                "benef": df_b[year] / 1000,
                })
            row = DataFrame({'amount': nan, 'benef': nan}, index = ['logt'])
            self.totals_df = self.totals_df.append(row)

            # Add some additional totals
            for col in ['amount', 'benef']:
                # Deals with logt
                logt = 0
                for var in ['apl', 'alf', 'als']:
                    logt += self.totals_df.get_value(var, col)
                self.totals_df.set_value('logt', col, logt)

                # Deals with rsa rmi
                rsa = 0
                for var in ['rmi', 'rsa']:
                    rsa += self.totals_df.get_value(var, col)
                self.totals_df.set_value('rsa', col, rsa)

                # Deals with irpp, csg, crds
                for var in ['irpp', 'csg', 'crds', 'cotsoc_noncontrib']:
                    if col in ['amount']:
                        val = - self.totals_df.get_value(var, col)
                        self.totals_df.set_value(var, col, val)
        except Exception:
            #  raise Exception(" No administrative data available for year " + str(year))
            import warnings
            warnings.warn("No administrative data available for year %s in file %s" % (str(year), filename))
            self.totals_df = None
            return

    def save_table(self, directory = None, filename = None, table_format = None):
        '''
        Saves the table to some format
        '''
        now = datetime.now()
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'

        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'), table_format)

        fname = os.path.join(directory, filename)

        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index=False, header=True)
                descr = self.create_description()
                descr.to_excel(writer, "description", index=False, header=False)
                writer.save()
            elif table_format == "csv":
                df.to_csv(fname, index=False, header=True)
        except Exception as e:
            raise Exception("Aggregates: Error saving file", str(e))
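
load_amounts_from_file above does the same scalar bookkeeping on totals_df; a sketch of the logt aggregation with .at, on an illustrative frame (the real one is read from amounts.h5):

import pandas as pd

totals_df = pd.DataFrame(
    {"amount": [1.0, 2.0, 3.0], "benef": [10.0, 20.0, 30.0]},
    index=["apl", "alf", "als"],
)

for col in ["amount", "benef"]:
    logt = sum(totals_df.at[var, col] for var in ["apl", "alf", "als"])
    totals_df.at["logt", col] = logt  # .at enlarges the index on first write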