Example #1
def data_viewer(request, city='东莞'):
    # data display view
    job_info = JobField.objects
    keywords = job_info.distinct("key_word")

    kd_salary = {kd: get_average_salary(job_info, kd, city)
                 for kd in keywords if get_average_salary(job_info, kd, city)}

    kd_salary = Series(kd_salary)
    kd_salary = kd_salary.sort_values()[::-1][:25]
    # to reduce the amount of computation
    # top_keyword = list(kd_salary.index)

    frame = Series(kd_salary)
    series = {index: frame[index] for index in frame.index}
    series = Series(series)

    items = get_city_ratio(job_info)

    job_count_rank = dict(get_job_count(job_info, keywords))
    job_count_rank = Series(job_count_rank)

    context = {
        'cities': items[:20],
        'series': series.sort_values()[::-1][:25],
        'keyword_dict': kd_salary,
        'top_job_counts': job_count_rank.sort_values()[::-1][:25],
        'city': city,
    }
    return render(request, 'data_viewer.html', context)
def evaluate_k_recall(k, y_test, y_proba_list):
    count = 0
    for i in range(len(y_test)):
        s = Series(y_proba_list[i])
        s.sort_values(inplace=True)
        pre_k = s.index.values[-1 * k:]
        if y_test[i] in pre_k:
            count += 1
    print(float(count) / len(y_test))
Example #3
def get_timeframe(data: Dict[str, DataFrame]) -> Tuple[arrow.Arrow, arrow.Arrow]:
    """
    Get the maximum timeframe for the given backtest data
    :param data: dictionary with preprocessed backtesting data
    :return: tuple containing min_date, max_date
    """
    all_dates = Series([])
    for pair, pair_data in data.items():
        all_dates = all_dates.append(pair_data['date'])
    all_dates.sort_values(inplace=True)
    return arrow.get(all_dates.iloc[0]), arrow.get(all_dates.iloc[-1])
Example #4
def choose_feature(dataset):
    features = dataset.columns[:-1]
    base_ent = calc_shannon_ent(dataset['choice'].value_counts())
    ent_gain = Series(index=features)
    for feature in features:
        new_ent = 0.0
        feat_list = dataset[feature].value_counts()
        prob = feat_list / float(sum(feat_list))
        for feat in feat_list.index:
            tmp_ent = calc_shannon_ent(dataset.ix[dataset[feature] == feat]['choice'].value_counts())       
            new_ent -= prob[feat] * tmp_ent
        ent_gain[feature] = base_ent - new_ent
    ent_gain.sort_values(inplace=True)
    return ent_gain.index[0]
Example #5
    def test_sort_values(self):

        # check indexes are reordered corresponding with the values
        ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D'])
        expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C'])
        result = ser.sort_values()
        self.assert_series_equal(expected, result)

        ts = self.ts.copy()
        ts[:5] = np.NaN
        vals = ts.values

        result = ts.sort_values()
        self.assertTrue(np.isnan(result[-5:]).all())
        self.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:]))

        # na_position
        result = ts.sort_values(na_position='first')
        self.assertTrue(np.isnan(result[:5]).all())
        self.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:]))

        # something object-type
        ser = Series(['A', 'B'], [1, 2])
        # no failure
        ser.sort_values()

        # ascending=False
        ordered = ts.sort_values(ascending=False)
        expected = np.sort(ts.valid().values)[::-1]
        assert_almost_equal(expected, ordered.valid().values)
        ordered = ts.sort_values(ascending=False, na_position='first')
        assert_almost_equal(expected, ordered.valid().values)

        # inplace=True
        ts = self.ts.copy()
        ts.sort_values(ascending=False, inplace=True)
        self.assert_series_equal(ts, self.ts.sort_values(ascending=False))
        self.assert_index_equal(ts.index,
                                self.ts.sort_values(ascending=False).index)

        # GH 5856/5853
        # Series.sort_values operating on a view
        df = DataFrame(np.random.randn(10, 4))
        s = df.iloc[:, 0]

        def f():
            s.sort_values(inplace=True)

        self.assertRaises(ValueError, f)
Example #6
    def test_series(self, orient, numpy):
        s = Series([10, 20, 30, 40, 50, 60], name="series",
                   index=[6, 7, 8, 9, 10, 15]).sort_values()

        encode_kwargs = {} if orient is None else dict(orient=orient)
        decode_kwargs = {} if numpy is None else dict(numpy=numpy)

        output = ujson.decode(ujson.encode(s, **encode_kwargs),
                              **decode_kwargs)

        if orient == "split":
            dec = _clean_dict(output)
            output = Series(**dec)
        else:
            output = Series(output)

        if orient in (None, "index"):
            s.name = None
            output = output.sort_values()
            s.index = ["6", "7", "8", "9", "10", "15"]
        elif orient in ("records", "values"):
            s.name = None
            s.index = [0, 1, 2, 3, 4, 5]

        tm.assert_series_equal(output, s, check_dtype=False)
Example #7
def get_recommend(username, data=data, pearson_result=pearson_result):
    pearson_user = pearson_result[username].drop(username).index
    current_user_index = data[username].dropna().index
    result = Series()
    for user in pearson_user:
        no_index = data[user].dropna().index.difference(
            current_user_index)
        for i in no_index:
            # weight the neighbour's rating by this user's Pearson similarity
            rating = data[user][i] * pearson_result[username][user]
            result[i] = rating
    return result.sort_values(ascending=False)
def draw(digits_chars):
    """
    Plot the character-frequency charts (line plot and bar chart)
    :param digits_chars: per-character frequencies
    """
    fig = plt.figure()
    y = digits_chars
    x = np.arange(len(y))
    x_label_digit = [i for i in range(10)]
    x_label_char = [chr(i) for i in range(97,123)]
    x_label_char.insert(0, '-')
    x_label = x_label_digit + x_label_char
    series_data = Series(y,x_label)
    fig.add_subplot(121)
    plt.plot(x, series_data.values, 'k-o', label=u"character frequency")
    plt.legend(prop={'size':11})
    x_min,x_max = x.min(), x.max()
    plt.xlim(x_min-1,x_max+1)
    plt.xticks(x,series_data.index)  # tick labels shown on the x axis
    # plt.grid()
    plt.xlabel(u"character")
    plt.ylabel(u"proportion (%)")

    fig.add_subplot(122)
    series_data = series_data.sort_values(ascending=False)
    plt.bar(x,series_data.values,label=u'character frequency', align='center')
    plt.legend(prop={'size':11})
    x_min,x_max = x.min(), x.max()
    plt.xlim(x_min-1,x_max+1)
    plt.xticks(x,series_data.index)  # tick labels shown on the x axis
    # plt.grid()
    plt.xlabel(u"character")
    plt.ylabel(u"proportion (%)")

    plt.subplots_adjust(top=0.96, bottom=0.09, left=0.06, right=0.96)
    # plt.savefig(u'字符频率.png')

    plt.show()
Example #9
 def pandas_sum(to_process: pd.Series) -> int:
     # Sort the values before computing the sum.
     # For details please go to
     #   https://github.com/NVIDIA/spark-rapids/issues/740#issuecomment-784917512
     return to_process.sort_values().sum()
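 # Why sorting first helps (an illustrative note, not taken from the linked issue):
 # floating-point addition is not associative, so summing the same values in a
 # different order can change the low bits of the result, e.g.
 #   (0.1 + 0.2) + 0.3  ->  0.6000000000000001
 #   0.1 + (0.2 + 0.3)  ->  0.6
 # Fixing the accumulation order by sorting makes the sum reproducible.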
series_custom = Series(rt_scores, index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]
fiveten = series_custom[5:10]
print(fiveten)

## 5. Reindexing ##

original_index = series_custom.index.tolist()
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)

## 6. Sorting ##


sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
print(sc2[0:10])
print(sc3[0:10])

## 7. Vectorized operations ##


series_normalized = (series_custom/20)

## 8. Comparing and filtering ##


both_criteria = series_custom[criteria_one & criteria_two]

## 9. Alignment ##
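
# A minimal sketch of alignment (the ratings below are made up for illustration):
# arithmetic between two Series aligns on index labels, and labels present in only
# one of them become NaN.
s_a = Series([85, 74], index=['Minions (2015)', 'Leviathan (2014)'])
s_b = Series([60, 90], index=['Leviathan (2014)', 'Taxi (2015)'])
print(s_a + s_b)   # 'Minions (2015)' and 'Taxi (2015)' -> NaN; 'Leviathan (2014)' -> 134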
Example #11
    def test_sort_values(self):

        # check indexes are reordered corresponding with the values
        ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D'])
        expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C'])
        result = ser.sort_values()
        tm.assert_series_equal(expected, result)

        ts = self.ts.copy()
        ts[:5] = np.NaN
        vals = ts.values

        result = ts.sort_values()
        assert np.isnan(result[-5:]).all()
        tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:]))

        # na_position
        result = ts.sort_values(na_position='first')
        assert np.isnan(result[:5]).all()
        tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:]))

        # something object-type
        ser = Series(['A', 'B'], [1, 2])
        # no failure
        ser.sort_values()

        # ascending=False
        ordered = ts.sort_values(ascending=False)
        expected = np.sort(ts.dropna().values)[::-1]
        assert_almost_equal(expected, ordered.dropna().values)
        ordered = ts.sort_values(ascending=False, na_position='first')
        assert_almost_equal(expected, ordered.dropna().values)

        # ascending=[False] should behave the same as ascending=False
        ordered = ts.sort_values(ascending=[False])
        expected = ts.sort_values(ascending=False)
        assert_series_equal(expected, ordered)
        ordered = ts.sort_values(ascending=[False], na_position='first')
        expected = ts.sort_values(ascending=False, na_position='first')
        assert_series_equal(expected, ordered)

        msg = "ascending must be boolean"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending=None)
        msg = r"Length of ascending \(0\) must be 1 for Series"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending=[])
        msg = r"Length of ascending \(3\) must be 1 for Series"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending=[1, 2, 3])
        msg = r"Length of ascending \(2\) must be 1 for Series"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending=[False, False])
        msg = "ascending must be boolean"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending='foobar')

        # inplace=True
        ts = self.ts.copy()
        ts.sort_values(ascending=False, inplace=True)
        tm.assert_series_equal(ts, self.ts.sort_values(ascending=False))
        tm.assert_index_equal(ts.index,
                              self.ts.sort_values(ascending=False).index)

        # GH 5856/5853
        # Series.sort_values operating on a view
        df = DataFrame(np.random.randn(10, 4))
        s = df.iloc[:, 0]

        msg = ("This Series is a view of some other array, to sort in-place"
               " you must create a copy")
        with pytest.raises(ValueError, match=msg):
            s.sort_values(inplace=True)
Example #12
'''
sort and order
'''
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj.sort_index())

frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(frame.sort_index())
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1, ascending=False))

obj = Series([4, 7, -3, -2])
print(obj.sort_values())
obj = Series([4, np.nan, 7, np.nan, -3, -2])
print(obj.sort_values())

frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

# rank
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method='first'))
print(obj.rank(method='max', ascending=False))

frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
Example #13
def interrogator(corpus, 
    search='w', 
    query='any',
    show='w',
    exclude=False,
    excludemode='any',
    searchmode='all',
    case_sensitive=False,
    save=False,
    subcorpora=False,
    just_metadata=False,
    skip_metadata=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    only_unique=False,
    only_format_match=True,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r'[A-Za-z0-9]',
    gramsize=1,
    conc=False,
    maxconc=9999,
    window=None,
    no_closed=False,
    no_punct=True,
    discard=False,
    **kwargs):
    """
    Interrogate corpus, corpora, subcorpus and file objects.
    See corpkit.interrogation.interrogate() for docstring
    """
    
    conc = kwargs.get('do_concordancing', conc)
    quiet = kwargs.get('quiet', False)
    coref = kwargs.pop('coref', False)
    show_conc_metadata = kwargs.pop('show_conc_metadata', False)
    fsi_index = kwargs.pop('fsi_index', True)
    dep_type = kwargs.pop('dep_type', 'collapsed-ccprocessed-dependencies')

    nosubmode = subcorpora is None
    #todo: temporary
    #if getattr(corpus, '_dlist', False):
    #    subcorpora = 'file'

    # store kwargs and locs
    locs = locals().copy()
    locs.update(kwargs)
    locs.pop('kwargs', None)

    import codecs
    import signal
    import os
    from time import localtime, strftime
    from collections import Counter

    import pandas as pd
    from pandas import DataFrame, Series

    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.corpus import Datalist, Corpora, Corpus, File, Subcorpus
    from corpkit.process import (tregex_engine, get_deps, unsplitter, sanitise_dict, 
                                 animator, filtermaker, fix_search,
                                 pat_format, auto_usecols, format_tregex,
                                 make_conc_lines_from_whole_mid)
    from corpkit.other import as_regex
    from corpkit.dictionaries.process_types import Wordlist
    from corpkit.build import check_jdk
    from corpkit.conll import pipeline
    from corpkit.process import delete_files_and_subcorpora
    
    have_java = check_jdk()

    # remake corpus without bad files and folders 
    corpus, skip_metadata, just_metadata = delete_files_and_subcorpora(corpus, skip_metadata, just_metadata)

    # so you can do corpus.interrogate('features/postags/wordclasses/lexicon')
    if search == 'features':
        search = 'v'
        query = 'any'
    if search in ['postags', 'wordclasses']:
        query = 'any'
        preserve_case = True
        show = 'p' if search == 'postags' else 'x'
        # use tregex if simple because it's faster
        # but use dependencies otherwise
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
    if search == 'lexicon':
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
        query = 'any'
        show = ['w']

    if not kwargs.get('cql') and isinstance(search, STRINGTYPE) and len(search) > 3:
        raise ValueError('search argument not recognised.')

    import re
    if regex_nonword_filter:
        is_a_word = re.compile(regex_nonword_filter)
    else:
        is_a_word = re.compile(r'.*')

    from traitlets import TraitError

    # convert cql-style queries---pop for the sake of multiprocessing
    cql = kwargs.pop('cql', None)
    if cql:
        from corpkit.cql import to_corpkit
        search, exclude = to_corpkit(search)

    def signal_handler(signal, _):
        """
        Allow pausing and restarting when not in GUI
        """
        if root:
            return  
        import signal
        import sys
        from time import localtime, strftime
        signal.signal(signal.SIGINT, original_sigint)
        thetime = strftime("%H:%M:%S", localtime())
        INPUTFUNC('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
        time = strftime("%H:%M:%S", localtime())
        print('%s: Interrogation resumed.\n' % time)
        signal.signal(signal.SIGINT, signal_handler)

    def add_adj_for_ngram(show, gramsize):
        """
        If there's a gramsize of more than 1, remake show
        for ngramming
        """
        if gramsize == 1:
            return show
        out = []
        for i in show:
            out.append(i)
        for i in range(1, gramsize):
            for bit in show:
                out.append('+%d%s' % (i, bit))
        return out

    def fix_show_bit(show_bit):
        """
        Take a single search/show_bit type, return match
        """
        ends = ['w', 'l', 'i', 'n', 'f', 'p', 'x', 's', 'a', 'e', 'c']
        starts = ['d', 'g', 'm', 'b', 'h', '+', '-', 'r', 'c']
        show_bit = show_bit.lstrip('n')
        show_bit = show_bit.lstrip('b')
        show_bit = list(show_bit)
        if show_bit[-1] not in ends:
            show_bit.append('w')
        if show_bit[0] not in starts:
            show_bit.insert(0, 'm')
        return ''.join(show_bit)

    def fix_show(show, gramsize):
        """
        Lowercase anything in show and turn into list
        """
        if isinstance(show, list):
            show = [i.lower() for i in show]
        elif isinstance(show, STRINGTYPE):
            show = show.lower()
            show = [show]
        show = [fix_show_bit(i) for i in show]
        return add_adj_for_ngram(show, gramsize)

    def is_multiquery(corpus, search, query, outname):
        """
        Determine if multiprocessing is needed/possible, and
        do some retyping if need be as well
        """
        is_mul = False
        from collections import OrderedDict
        from corpkit.dictionaries.process_types import Wordlist
        
        if isinstance(query, Wordlist):
            query = list(query)

        if subcorpora and multiprocess:
            is_mul = 'subcorpora'

        if isinstance(subcorpora, (list, tuple)):
            is_mul = 'subcorpora'

        if isinstance(query, (dict, OrderedDict)):
            is_mul = 'namedqueriessingle'
        
        if isinstance(search, dict):
            if all(isinstance(i, dict) for i in list(search.values())):
                is_mul = 'namedqueriesmultiple'
        return is_mul, corpus, search, query

    def ispunct(s):
        import string
        return all(c in string.punctuation for c in s)

    def uniquify(conc_lines):
        """get unique concordance lines"""
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (_, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def compiler(pattern):
        """
        Compile regex or fail gracefully
        """
        if hasattr(pattern, 'pattern'):
            return pattern
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def determine_search_func(show):
        """Figure out what search function we're using"""

        simple_tregex_mode = False
        statsmode = False
        tree_to_text = False
        search_trees = False
            
        simp_crit = all(not i for i in [kwargs.get('tgrep'),
                                        files_as_subcorpora,
                                        subcorpora,
                                        just_metadata,
                                        skip_metadata])

        if search.get('t') and simp_crit:
            if have_java:
                simple_tregex_mode = True
            else:
                search_trees = 'tgrep'
            optiontext = 'Searching parse trees'

        elif datatype == 'conll':
        
            if any(i.endswith('t') for i in search.keys()):
                if have_java and not kwargs.get('tgrep'):
                    search_trees = 'tregex'
                else:
                    search_trees = 'tgrep'
                optiontext = 'Searching parse trees'
            elif any(i.endswith('v') for i in search.keys()):
                # either of these searchers now seems to work
                #seacher = get_stats_conll
                statsmode = True
                optiontext = 'General statistics'
            elif any(i.endswith('r') for i in search.keys()):
                optiontext = 'Distance from root'
            else:
                optiontext = 'Querying CONLL data'

        return optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees

    def get_tregex_values(show):
        """If using Tregex, set appropriate values

        - Check for valid query
        - Make 'any' query
        - Make list query
        """

        translated_option = 't'
        if isinstance(search['t'], Wordlist):
            search['t'] = list(search['t'])
        q = tregex_engine(corpus=False,
                          query=search.get('t'),
                          options=['-t'],
                          check_query=True,
                          root=root,
                          preserve_case=preserve_case
                         )

        # so many of these bad fixing loops!
        nshow = []
        for i in show:
            if i == 'm':
                nshow.append('w')
            else:
                nshow.append(i.lstrip('m'))
        show = nshow

        if q is False:
            return 'Bad query', None

        if isinstance(search['t'], list):
            regex = as_regex(search['t'], boundaries='line', case_sensitive=case_sensitive)
        else:
            regex = ''

        # listquery, anyquery, translated_option
        treg_dict = {'p': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'pl': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'x': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     't': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'o'],
                     'w': [r'/%s/ !< __' % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'c': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 'C'],
                     'l': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'u': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 'v']
                    }

        newshow = []

        listq, anyq, translated_option = treg_dict.get(show[0][-1].lower())
        newshow.append(translated_option)
        for item in show[1:]:
            _, _, noption = treg_dict.get(item.lower())
            newshow.append(noption)

        if isinstance(search['t'], list):
            search['t'] = listq
        elif search['t'] == 'any':   
            search['t'] = anyq
        return search['t'], newshow

    def correct_spelling(a_string):
        """correct spelling within a string"""
        if not spelling:
            return a_string
        from corpkit.dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r

    def make_search_iterable(corpus):
        """determine how to structure the corpus for interrogation"""
        # skip file definitions if they are not needed
        if getattr(corpus, '_dlist', False):

            return {(i.name, i.path): [i] for i in list(corpus.files)}
            #return {('Sample', 'Sample'): list(corpus.files)}

        if simple_tregex_mode:
            if corpus.level in ['s', 'f', 'd']:
                return {(corpus.name, corpus.path): False}
            else:
                return {(os.path.basename(i), os.path.join(corpus.path, i)): False
                    for i in os.listdir(corpus.path)
                    if os.path.isdir(os.path.join(corpus.path, i))}

        if isinstance(corpus, Datalist):
            to_iterate_over = {}
            # it could be files or subcorpus objects
            if corpus[0].level in ['s', 'd']:
                if files_as_subcorpora:
                    for subc in corpus:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subc in corpus:
                        to_iterate_over[(subc.name, subc.path)] = subc.files
            elif corpus[0].level == 'f':
                for f in corpus:
                    to_iterate_over[(f.name, f.path)] = [f]
        elif corpus.singlefile:
            to_iterate_over = {(corpus.name, corpus.path): [corpus]}
        elif not hasattr(corpus, 'subcorpora') or not corpus.subcorpora:
            # just files in a directory
            if files_as_subcorpora:
                to_iterate_over = {}
                for f in corpus.files:
                    to_iterate_over[(f.name, f.path)] = [f]
            else:
                to_iterate_over = {(corpus.name, corpus.path): corpus.files}
        else:
            to_iterate_over = {}
            if files_as_subcorpora:
                # don't know if possible: has subcorpora but also .files
                if hasattr(corpus, 'files') and corpus.files is not None:
                    for f in corpus.files:
                        to_iterate_over[(f.name, f.path)] = [f]
                # has subcorpora with files in those
                elif hasattr(corpus, 'files') and corpus.files is None:
                    for subc in corpus.subcorpora:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
            else:
                if corpus[0].level == 's':
                    for subcorpus in corpus:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
                elif corpus[0].level == 'f':
                    for f in corpus:
                        to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subcorpus in corpus.subcorpora:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        return to_iterate_over

    def welcome_printer(return_it=False):
        """Print welcome message"""
        if no_conc:
            message = 'Interrogating'
        else:
            message = 'Interrogating and concordancing'
        if only_conc:
            message = 'Concordancing'
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            from corpkit.process import dictformat
            sformat = dictformat(search)
            welcome = ('\n%s: %s %s ...\n          %s\n          ' \
                        'Query: %s\n          %s corpus ... \n' % \
                      (thetime, message, cname, optiontext, sformat, message))
            if return_it:
                return welcome
            else:
                print(welcome)

    def goodbye_printer(return_it=False, only_conc=False):
        """Say goodbye before exiting"""
        if not kwargs.get('printstatus', True):
            return
        thetime = strftime("%H:%M:%S", localtime())
        if only_conc:
            finalstring = '\n\n%s: Concordancing finished! %s results.' % (thetime, format(len(conc_df), ','))
        else:
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %s matches.' % format(tot, ',')
            else:
                finalstring += ' %s unique results, %s total occurrences.' % (format(numentries, ','), format(total_total, ','))
        if return_it:
            return finalstring
        else:
            print(finalstring)

    def get_conc_colnames(corpus,
                          fsi_index=False,
                          simple_tregex_mode=False):
    
        fields = []
        base = 'c f s l m r'
        
        if simple_tregex_mode:
            base = base.replace('f ', '')

        if fsi_index and not simple_tregex_mode:
            base = 'i ' + base
        
        if PYTHON_VERSION == 2:
            base = base.encode('utf-8').split()
        else:
            base = base.split() 

        if show_conc_metadata:
            from corpkit.build import get_all_metadata_fields
            meta = get_all_metadata_fields(corpus.path)

            if isinstance(show_conc_metadata, list):
                meta = [i for i in meta if i in show_conc_metadata]
            #elif show_conc_metadata is True:
            #    pass
            for i in sorted(meta):
                if i in ['speaker', 'sent_id', 'parse']:
                    continue
                if PYTHON_VERSION == 2:
                    base.append(i.encode('utf-8'))
                else:
                    base.append(i)
        return base

    def make_conc_obj_from_conclines(conc_results, fsi_index=False):
        """
        Turn conclines into DataFrame
        """
        from corpkit.interrogation import Concordance
        #fsi_place = 2 if fsi_index else 0

        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            for lin in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                #if not subcorpora:
                #    lin[fsi_place] = lin[fsi_place]
                #lin.insert(fsi_place, sc_name)

                if len(lin) < len(conc_col_names):
                    diff = len(conc_col_names) - len(lin)
                    lin.extend(['none'] * diff)

                all_conc_lines.append(Series(lin, index=conc_col_names))

        try:
            conc_df = pd.concat(all_conc_lines, axis=1).T
        except ValueError:
            return
        
        if all(x == '' for x in list(conc_df['s'].values)) or \
           all(x == 'none' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis=1, inplace=True)

        locs['corpus'] = corpus.name

        if maxconc:
            conc_df = Concordance(conc_df[:maxconc])
        else:
            conc_df = Concordance(conc_df)
        try:
            conc_df.query = locs
        except AttributeError:
            pass
        return conc_df

    def lowercase_result(res):
        """      
        Take any result and do spelling/lowercasing if need be

        todo: remove lowercase and change name
        """
        if not res or statsmode:
            return res
        # this is likely broken, but spelling in interrogate is deprecated anyway
        if spelling:
            res = [correct_spelling(r) for r in res]
        return res

    def postprocess_concline(line, fsi_index=False, conc=False):
        # todo: are these right?
        if not conc:
            return line
        subc, star, en = 0, 2, 5
        if fsi_index:
            subc, star, en = 2, 4, 7
        if not preserve_case:
            line[star:en] = [str(x).lower() for x in line[star:en]]
        if spelling:
            line[star:en] = [correct_spelling(str(b)) for b in line[star:en]]
        return line

    def make_progress_bar():
        """generate a progress bar"""

        if simple_tregex_mode:
            total_files = len(list(to_iterate_over.keys()))
        else:
            total_files = sum(len(x) for x in list(to_iterate_over.values()))

        par_args = {'printstatus': kwargs.get('printstatus', True),
                    'root': root, 
                    'note': note,
                    'quiet': quiet,
                    'length': total_files,
                    'startnum': kwargs.get('startnum'),
                    'denom': kwargs.get('denominator', 1)}

        term = None
        if kwargs.get('paralleling', None) is not None:
            from blessings import Terminal
            term = Terminal()
            par_args['terminal'] = term
            par_args['linenum'] = kwargs.get('paralleling')

        if in_notebook:
            par_args['welcome_message'] = welcome_message

        outn = kwargs.get('outname', '')
        if outn:
            outn = getattr(outn, 'name', outn)
            outn = outn + ': '

        tstr = '%s%d/%d' % (outn, current_iter, total_files)
        p = animator(None, None, init=True, tot_string=tstr, **par_args)
        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
        animator(p, current_iter, tstr, **par_args)
        return p, outn, total_files, par_args

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')
    language_model = kwargs.get('language_model')

    # set up pause method
    original_sigint = signal.getsignal(signal.SIGINT)
    if kwargs.get('paralleling', None) is None:
        if not root:
            original_sigint = signal.getsignal(signal.SIGINT)
            signal.signal(signal.SIGINT, signal_handler)

    # find out about concordancing
    only_conc = False
    no_conc = False
    if conc is False:
        no_conc = True
    if isinstance(conc, str) and conc.lower() == 'only':
        only_conc = True
        no_conc = False
    numconc = 0

    # wipe non essential class attributes to not bloat query attrib
    if isinstance(corpus, Corpus):
        import copy
        corpus = copy.copy(corpus)
        for k, v in corpus.__dict__.items():
            if isinstance(v, (Interrogation, Interrodict)):
                corpus.__dict__.pop(k, None)

    # convert path to corpus object
    if not isinstance(corpus, (Corpus, Corpora, Subcorpus, File, Datalist)):
        if not multiprocess and not kwargs.get('outname'):
            corpus = Corpus(corpus, print_info=False)

    # figure out how the user has entered the query and show, and normalise
    from corpkit.process import searchfixer
    search = searchfixer(search, query)
    show = fix_show(show, gramsize)
    locs['show'] = show

    # instantiate lemmatiser if need be
    lem_instance = False
    if any(i.endswith('l') for i in show) and isinstance(search, dict) and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lem_instance = WordNetLemmatizer()

    # do multiprocessing if need be
    im, corpus, search, query = is_multiquery(corpus, search, query,
                                              kwargs.get('outname', False))

    # figure out if we can multiprocess the corpus
    if hasattr(corpus, '__iter__') and im:
        corpus = Corpus(corpus, print_info=False)
    if hasattr(corpus, '__iter__') and not im:
        im = 'datalist'
    if isinstance(corpus, Corpora):
        im = 'multiplecorpora'

    # split corpus if the user wants multiprocessing but no other iterable
    if not im and multiprocess:
        im = 'datalist'
        if getattr(corpus, 'subcorpora', False):
            corpus = corpus[:]
        else:
            corpus = corpus.files

    search = fix_search(search, case_sensitive=case_sensitive, root=root)
    exclude = fix_search(exclude, case_sensitive=case_sensitive, root=root)

    # if it's already been through pmultiquery, don't do it again
    locs['search'] = search
    locs['exclude'] = exclude
    locs['query'] = query
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess
    locs['print_info'] = kwargs.get('printstatus', True)
    locs['multiple'] = im
    locs['subcorpora'] = subcorpora
    locs['nosubmode'] = nosubmode

    # send to multiprocess function
    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from corpkit.multiprocess import pmultiquery
        return pmultiquery(**locs)

    # get corpus metadata
    cname = corpus.name
    if isinstance(save, STRINGTYPE):
        savename = corpus.name + '-' + save
    if save is True:
        raise ValueError('save must be str, not bool.')


    datatype = getattr(corpus, 'datatype', 'conll')
    singlefile = getattr(corpus, 'singlefile', False)
    level = getattr(corpus, 'level', 'c')
        
    # store all results in here
    from collections import defaultdict
    results = defaultdict(Counter)
    count_results = defaultdict(list)
    conc_results = defaultdict(list)

    # check if just counting, turn off conc if so
    countmode = 'c' in show or 'mc' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    # Determine the search function to be used #
    optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees = determine_search_func(show)
    
    # no conc for statsmode
    if statsmode:
        no_conc = True
        only_conc = False
        conc = False

    # Set some Tregex-related values
    translated_option = False
    if search.get('t'):
        query, translated_option = get_tregex_values(show)
        if query == 'Bad query' and translated_option is None:
            if root:
                return 'Bad query'
            else:
                return
    # more tregex options
    if tree_to_text:
        treg_q = r'ROOT << __'
        op = ['-o', '-t', '-w', '-f']
    elif simple_tregex_mode:
        treg_q = search['t']
        op = ['-%s' % i for i in translated_option] + ['-o', '-f']

    # make iterable object for corpus interrogation
    to_iterate_over = make_search_iterable(corpus)

    try:
        from ipywidgets import IntProgress
        _ = IntProgress(min=0, max=10, value=1)
        in_notebook = True
    except TraitError:
        in_notebook = False
    except ImportError:
        in_notebook = False
    # caused in newest ipython
    except AttributeError:
        in_notebook = False

    lemtag = False
    if search.get('t'):
        from corpkit.process import gettag
        lemtag = gettag(search.get('t'), lemmatag)

    usecols = auto_usecols(search, exclude, show, kwargs.pop('usecols', None), coref=coref)

    # print welcome message
    welcome_message = welcome_printer(return_it=in_notebook)

    # create a progress bar
    p, outn, total_files, par_args = make_progress_bar()

    if conc:
        conc_col_names = get_conc_colnames(corpus,
                                           fsi_index=fsi_index,
                                           simple_tregex_mode=False)

 

    # Iterate over data, doing interrogations
    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):
        if nosubmode:
            subcorpus_name = 'Total'

        # results for subcorpus go here
        #conc_results[subcorpus_name] = []
        #count_results[subcorpus_name] = []
        #results[subcorpus_name] = Counter()

        # get either everything (tree_to_text) or the search['t'] query
        if tree_to_text or simple_tregex_mode:
            result = tregex_engine(query=treg_q,
                                   options=op,
                                   corpus=subcorpus_path,
                                   root=root,
                                   preserve_case=preserve_case)

            # format search results with slashes etc
            if not countmode and not tree_to_text:
                result = format_tregex(result, show, translated_option=translated_option,
                            exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                            lem_instance=lem_instance, countmode=countmode, speaker_data=False)

            # if concordancing, do the query again with 'whole' sent and fname
            if not no_conc:
                ops = ['-w'] + op
                #ops = [i for i in ops if i != '-n']
                whole_result = tregex_engine(query=search['t'],
                                             options=ops,
                                             corpus=subcorpus_path,
                                             root=root,
                                             preserve_case=preserve_case
                                            )

                # format match too depending on option
                if not only_format_match:
                    whole_result = format_tregex(whole_result, show, translated_option=translated_option,
                                                 exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                                                 lem_instance=lem_instance, countmode=countmode,
                                                 speaker_data=False, whole=True)

                # make conc lines from conc results
                conc_result = make_conc_lines_from_whole_mid(whole_result, result, show=show)
                for lin in conc_result:
                    if maxconc is False or numconc < maxconc:
                        conc_results[subcorpus_name].append(lin)
                    numconc += 1

            # add matches to ongoing counts
            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                if result:
                    results[subcorpus_name] += Counter([i[-1] for i in result])
                else:
                    results[subcorpus_name] += Counter()

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)
            continue

        # todo: move this
        kwargs.pop('by_metadata', None)
        
        # conll querying goes by file, not subcorpus
        for f in files:
            slow_treg_speaker_guess = kwargs.get('outname', '') if kwargs.get('multispeaker') else ''
            filepath, corefs = f.path, coref
            res, conc_res = pipeline(filepath, search=search, show=show,
                                     dep_type=dep_type,
                                     exclude=exclude,
                                     excludemode=excludemode,
                                     searchmode=searchmode,
                                     case_sensitive=case_sensitive,
                                     conc=conc,
                                     only_format_match=only_format_match,
                                     speaker=slow_treg_speaker_guess,
                                     gramsize=gramsize,
                                     no_punct=no_punct,
                                     no_closed=no_closed,
                                     window=window,
                                     filename=f.path,
                                     coref=corefs,
                                     countmode=countmode,
                                     maxconc=(maxconc, numconc),
                                     is_a_word=is_a_word,
                                     by_metadata=subcorpora,
                                     show_conc_metadata=show_conc_metadata,
                                     just_metadata=just_metadata,
                                     skip_metadata=skip_metadata,
                                     fsi_index=fsi_index,
                                     category=subcorpus_name,
                                     translated_option=translated_option,
                                     statsmode=statsmode,
                                     preserve_case=preserve_case,
                                     usecols=usecols,
                                     search_trees=search_trees,
                                     lem_instance=lem_instance,
                                     lemtag=lemtag,
                                     **kwargs)

            if res is None and conc_res is None:
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # deal with symbolic structures---that is, rather than adding
            # results by subcorpora, add them by metadata value
            # todo: sorting?
            if subcorpora:
                for (k, v), concl in zip(res.items(), conc_res.values()):                            
                    v = lowercase_result(v)
                    results[k] += Counter(v)
                    for line in concl:
                        if maxconc is False or numconc < maxconc:
                            line = postprocess_concline(line,
                                fsi_index=fsi_index, conc=conc)
                            conc_results[k].append(line)
                            numconc += 1
                
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # garbage collection needed?
            sents = None
            corefs = None
                
            if res == 'Bad query':
                return 'Bad query'

            if countmode:
                count_results[subcorpus_name] += [res]

            else:
                # add filename and do lowercasing for conc
                if not no_conc:
                    for line in conc_res:
                        line = postprocess_concline(line,
                            fsi_index=fsi_index, conc=conc)
                        if maxconc is False or numconc < maxconc:
                            conc_results[subcorpus_name].append(line)
                            numconc += 1

                # do lowercasing and spelling
                if not only_conc:
                    res = lowercase_result(res)
                    # discard removes low results, helping with 
                    # curse of dimensionality
                    countres = Counter(res)
                    if isinstance(discard, float):
                        countres.most_common()
                        nkeep = len(countres) - len(countres) * discard
                        countres = Counter({k: v for i, (k, v) in enumerate(countres.most_common()) if i <= nkeep})
                    elif isinstance(discard, int):
                        countres = Counter({k: v for k, v in countres.most_common() if v >= discard})
                    results[subcorpus_name] += countres
                    #else:
                    #results[subcorpus_name] += res

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

    # Get concordances into DataFrame, return if just conc
    if not no_conc:
        # fail on this line with typeerror if no results?
        conc_df = make_conc_obj_from_conclines(conc_results, fsi_index=fsi_index)
        if only_conc and conc_df is None:
            return
        elif only_conc:
            locs = sanitise_dict(locs)
            try:
                conc_df.query = locs
            except AttributeError:
                return conc_df
            if save and not kwargs.get('outname'):
                if conc_df is not None:
                    conc_df.save(savename)
            goodbye_printer(only_conc=True)
            if not root:
                signal.signal(signal.SIGINT, original_sigint)            
            return conc_df
    else:
        conc_df = None

    # Get interrogation into DataFrame
    if countmode:
        df = Series({k: sum(v) for k, v in sorted(count_results.items())})
        tot = df.sum()
    else:
        the_big_dict = {}
        unique_results = set(item for sublist in list(results.values()) for item in sublist)
        sortres = sorted(results.items(), key=lambda x: x[0])
        for word in unique_results:
            the_big_dict[word] = [subcorp_result[word] for _, subcorp_result in sortres]
        # turn master dict into dataframe, sorted
        df = DataFrame(the_big_dict, index=sorted(results.keys()))

        # for ngrams, remove hapaxes
        #if show_ngram or show_collocates:
        #    if not language_model:
        #        df = df[[i for i in list(df.columns) if df[i].sum() > 1]]

        numentries = len(df.columns)
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    # turn df into series if all conditions met
    conds = [countmode,
             files_as_subcorpora,
             subcorpora,
             kwargs.get('df1_always_df', False)]
    anyxs = [level == 's',
             singlefile,
             nosubmode]
    if all(not x for x in conds) and any(x for x in anyxs):
        df = Series(df.ix[0])
        df.sort_values(ascending=False, inplace=True)
        tot = df.sum()
        numentries = len(df.index)
        total_total = tot

    # turn data into DF for GUI if need be
    if isinstance(df, Series) and kwargs.get('df1_always_df', False):
        total_total = df.sum()
        df = DataFrame(df)
        tot = Series(total_total, index=['Total'])

    # if we're doing files as subcorpora,  we can remove the extension etc
    if isinstance(df, DataFrame) and files_as_subcorpora:
        cname = corpus.name.replace('-stripped', '').replace('-parsed', '')
        edits = [(r'(-[0-9][0-9][0-9])?\.txt\.conllu?', ''),
                 (r'-%s(-stripped)?(-parsed)?' % cname, '')]
        from corpkit.editor import editor
        df = editor(df, replace_subcorpus_names=edits).results
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    if conc_df is not None and conc_df is not False:
        # removed 'f' from here for now
        for col in ['c']:
            for pat in ['.txt', '.conll', '.conllu']:
                conc_df[col] = conc_df[col].str.replace(pat, '')
            conc_df[col] = conc_df[col].str.replace(r'-[0-9][0-9][0-9]$', '')

        #df.index = df.index.str.replace('w', 'this')

    # make interrogation object
    locs['corpus'] = corpus.path
    locs = sanitise_dict(locs)
    if nosubmode and isinstance(df, pd.DataFrame):
        df = df.sum()
    interro = Interrogation(results=df, totals=tot, query=locs, concordance=conc_df)

    # save it
    if save and not kwargs.get('outname'):
        print('\n')
        interro.save(savename)
    
    goodbye = goodbye_printer(return_it=in_notebook)
    if in_notebook:
        try:
            p.children[2].value = goodbye.replace('\n', '')
        except AttributeError:
            pass
    if not root:
        signal.signal(signal.SIGINT, original_sigint)
    return interro
Example #14
#!/usr/bin/env python3

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

print('----------------- Series sorting and ranking ----------------------')
obj = Series(range(4), index=['a', 'c', 'b', 'd'])
print(obj.sort_index())  # sort the Series by its index
print(obj.sort_values())  # sort the Series by its values
obj = Series([7, -5, 7, 3, 4, 2])
print(obj.rank())  # rank: ranking of the values; method: how ties are broken
print(obj.rank(method='first'))
print(obj.rank(method='max'))
print(obj.rank(method='min'))
# 'first': assign ranks in the order the values appear in the data
# 'max': use the highest rank of the whole tied group
# 'min': use the lowest rank of the whole tied group
# 'average' (default): tied values get the mean of their ranks
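# A worked illustration for obj = Series([7, -5, 7, 3, 4, 2]) above: the two 7s tie
# for ranks 5 and 6, so
#   average -> [5.5, 1.0, 5.5, 3.0, 4.0, 2.0]
#   first   -> [5.0, 1.0, 6.0, 3.0, 4.0, 2.0]
#   max     -> [6.0, 1.0, 6.0, 3.0, 4.0, 2.0]
#   min     -> [5.0, 1.0, 5.0, 3.0, 4.0, 2.0]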
print('----------------- DataFrame sorting and ranking ----------------------')
frame = DataFrame(np.arange(8).reshape(2, 4),
                  index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(frame)
print(frame.sort_index())  # sort by row index
print(frame.sort_index(axis=1))  # sort by column index
print(frame.sort_index(axis=1, ascending=False))  # descending order
df = DataFrame({'a': [4, 7, -3, 2], 'b': [0, 1, 0, 1]})
print(df)
print(df.sort_values(by=['a'], ascending=False))  # sort by column values
print(df.rank(axis=1))
Example #15
def get_xy(SEC, N_CLUSTER, METRIC):

    distancepath = DATAPATH + '/usermap{}.hdf'.format(SEC)
    with h5py.File(distancepath, 'r') as distance_file:
        # 'sdm' is a leftover of the old "sample distance matrix" naming
        sdm = distance_file['adjacency_matrix'].value
        if SEC == 15:
            sdm = sdm / (240 * 240)
        elif SEC == 30:
            sdm = sdm / (120 * 120)
        else:
            raise ValueError('invalid SEC: {}'.format(SEC))

    # sample clustering matrix
    scm = np.load(file=DATAPATH +
                  "/cluster/cluster_{}_{:02d}.npy".format(METRIC, N_CLUSTER))

    # sample flat list from sdm
    sfldm = []
    for i in range(27):
        for j in range(i + 1, 27):
            sfldm.append(sdm[i, j])

    # sample flat list from scm
    sflcm = []
    for i in range(27):
        for j in range(i + 1, 27):
            sflcm.append(scm[i, j])

    # give each pair a key like '01_16'
    keys = []
    for i in range(27):
        for j in range(i + 1, 27):
            keys.append(str(i + 1).zfill(2) + "_" + str(j + 1).zfill(2))

    dmdic = dict(zip(keys, sfldm))
    cmdic = dict(zip(keys, sflcm))

    # sort
    dmser = Series(dmdic)
    dmser = dmser.sort_values()
    print(dmser.keys()[:50])

    sorted_cmlist = []
    sorted_keylist = []
    for i in dmser.keys():
        if (not ("16" in i)):
            sorted_cmlist.append(cmdic[i])
            sorted_keylist.append(i)

    dmser[sorted_keylist]

    # compute the cumulative value
    numerator = 0
    denominator = 0
    value_list = []
    for i in sorted_cmlist:
        denominator += 1
        if (i == 1):
            numerator += 1
        value_list.append(numerator / denominator)
    print(len(value_list))

    x = dmser[sorted_keylist].values
    y = value_list

    return x, y
Example #16
# location of the max element
s.idxmax()

# rank
s = Series([4, 1, 2, 5])
s.rank()                     # return [3,1,2,4]

# plot
s.plot()
plt.show()

# translate ##################################################
# sort
new_s1 = s.sort_index()       # sort by index
new_s2 = s.sort_values()      # sort by values

# reindex includes the following steps:
# 1. Reordering existing data to match a set of labels.
# 2. Inserting NaN markers where no data exists for a label.
# 3. Possibly, filling missing data for a label using some type
#    of logic
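
# A minimal sketch of those steps (s2 and its labels are made up for illustration):
s2 = Series([1.0, 2.0, 3.0], index=['a', 'b', 'c'])
s2.reindex(['c', 'b', 'a', 'd'])                  # reorders; 'd' gets a NaN marker
s2.reindex(['c', 'b', 'a', 'd'], fill_value=0.0)  # fills the missing label instead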

# in-place modify ############################################

# change index directly; the new index should
# have the same length
s.index = pd.Index(['A', 'B', 'C', 'D'])
s.index = ['a', 'b', 'c', 'd']

# add a new item
Example #17
 def _quantile_value(series: pd.Series):
     return series.sort_values(ascending=ascending).iloc[level]
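 # Note: `ascending` and `level` are free variables captured from the enclosing scope.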
Example #18
Name: A, dtype: object
'''

print('Series sorting')
x = Series(range(4), index=['b', 'a', 'c', 'd'])
print(x.sort_index())  # sort the Series by its index
'''
a    1
b    0
c    2
d    3
'''
print(x.sort_values())  # sort the Series by its values
'''
b    0
a    1
c    2
d    3
'''

print('Sort a DataFrame by its index')
frame = DataFrame(numpy.arange(8).reshape((2, 4)),
                  index=['b', 'a'],
                  columns=list('ABDC'))
print(frame)
'''
Example #19
#!/usr/bin/env python
# encoding=utf-8

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Read the file in chunks
# When processing a large file, or working out a parameter set for later processing,
# you can read just a small part of the file or iterate over it chunk by chunk.

result = pd.read_csv('ex6.csv')
# nrows limits how many rows are read, i.e. only the first few rows
result_part = pd.read_csv('ex6.csv', nrows=5)
print(result_part)

# To read the file chunk by chunk, set chunksize (the number of rows per chunk)
chunker = pd.read_csv('ex6.csv', chunksize=1000)
# chunker is a TextFileReader
print(chunker)
tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# descending order; the old order() method was renamed to sort_values
tot = tot.sort_values(ascending=False)
print(tot)
print(tot[:10])
# print chunker.get_chunk(500)
Example #20
class QuantileForest(RandomForestRegressor):
    """Quantile Regresion Random Forest.
      This class can build random forest using Scikit-Learn and compute
      conditional quantiles.

      Parameters
      ----------
      inputSample : array
        Input samples used in data

      outputSample : array
        Output samples used in data

      n_estimators : int, optional (default=50)
        The number of trees in the forest.

      max_leaf_nodes : int or None, optional (default=max(10, len(outputSample)/100))
        Grow trees with max_leaf_nodes in best-first fashion. Best nodes are
        defined as relative reduction in impurity. If None then unlimited
        number of leaf nodes. If not None then max_depth will be ignored.
        Note: this parameter is tree-specific.

      n_jobs : int, optional (default=4)
        The number of jobs to run in parallel for both fit and predict. If -1,
        then the number of jobs is set to the number of cores.

      numPoints : int, optional (default=0)
        The size of the vector used to determine the quantile. If 0, the
        vector used is the outputSample.

      opt_method : string, optional (default="Cobyla")
        Name of the optimisation method used to find the alpha-quantile (if that
        option is chosen in the compute_quantile method). Only "Cobyla" and
        "SQP" are available.

      random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.
    """

    def fit(self, X, y):
        """

        """
        # We transform X as a np array for use convenience
        X = np.asarray(X)

        # It's a vector
        if X.shape[0] == X.size:
            self._n_sample = X.shape[0]
            self._input_dim = 1
        else:
            self._n_sample, self._input_dim = X.shape

        # The bootstrap is mandatory for the method. Since update
        # 1.16 of Sklearn, the indices of each element are no longer
        # available. TODO: find a way to get OOB indices.
        self.bootstrap = False

        # Fit the forest
        RandomForestRegressor.fit(self, X, y)

        # Save the data. Necessary to compute the quantiles.
        self._input_sample = DataFrame(X)
        self._output_sample = Series(y)

        # The resulting node of each elements of the sample
        self._sample_nodes = DataFrame(self.apply(X))  

        return self

    def _check_input(self, X):
        """

        """
        n = X.shape[0]  # Number of sample
        try:  # Works if X is an array
            d = X.shape[1]  # Dimension of the array
            if d != self._input_dim:  # If the dimension is not correct
                if n == self._input_dim:  # There is one sample of d dimension
                    d = n
                    n = 1
                else:  # Error
                    raise ValueError("X dimension is different from forest \
                    dimension : %d (X) != %d (forest)" % (d, self._input_dim))
        except:  # Its a vector
            d = 1
            if d != self._input_dim:  # If the dimension is not correct
                if n == self._input_dim:  # There is one sample of d dimension
                    d = n
                    n = 1
                else:  # Error
                    raise ValueError("X dimension is different from forest \
                    dimension : %d (X) != %d (forest)" % (d, self._input_dim))

        if (n > 1) & (d == 1):
            X.resize(n, 1)

        return X, n

    def _compute_weight(self, X_nodes_k, i_tree):
        """
        """
        if i_tree < 0:
            sample_node = self._sample_nodes.values
        else:
            sample_node = self._sample_nodes.values[:, i_tree]  # node matrix stored in fit()
        tmp = (sample_node == X_nodes_k)

        # Number of samples in nodes
        n_samples_nodes = tmp.sum(axis=0)

        # The proportion in each node
        # Shape : Matrix (numSample * numTree)
        weight = tmp.astype(float) / n_samples_nodes

        # The weight of each sample in the trees
        # Shape : Vector (numSample * )
        if i_tree < 0:
            return weight.mean(axis=1)
        else:
            return weight

    def get_nodes(self, X, i_tree):
        """
        """
        X, n_quantiles = self._check_input(X)

        # Nodes of the regressor in all the trees
        # Shape : (numTree * numRegressor)
        if i_tree < 0:
            # Sklearn does not like arrays of one values...
            if n_quantiles == 1 and self._input_dim == 1:
                X_nodes = self.apply(X[0]).transpose()
            else:
                X_nodes = self.apply(X).transpose()
        else:
            tree = self.estimators_[i_tree].tree_
            X_nodes = tree.apply(X.astype(np.float32))
            X_nodes.resize((1, n_quantiles))

        return X_nodes

    def compute_CDF(self, X, y, i_tree=-1):
        """
        """
        if isinstance(X, (int, float)):
            X = [X]
        if isinstance(y, (int, float)):
            y = [y]

        # Converting to array for convenience
        X = np.asarray(X)
        y = np.asarray(y)
        X, n_X = self._check_input(X)
        n_y = y.shape[0]
        y.resize(n_y, 1)
        
        self._prepare_CDF()

        CDFs = np.zeros((n_y, n_X))
        X_nodes = self.get_nodes(X, i_tree)

        # For each fixed X
        for k in range(n_X):
            weight = self._compute_weight(X_nodes[:, k], i_tree)
            id_pos = weight > 0
            tmp = weight[id_pos] * (self._output_sample.values[id_pos] <= y)
            CDFs[:, k] = tmp.sum(axis=1)
            print(tmp)  # debug output
        return CDFs

    def compute_quantile(self, X, alpha, do_optim=True, verbose=False,
                         doSaveCDF=False, i_tree=-1, opt_method="Cobyla"):
        """
        Compute the conditional alpha-quantile.
        """
        if isinstance(alpha, float):
            alpha = [alpha]
        if isinstance(X, (int, float)):
            X = [X]

        # Converting to array for convenience
        alpha = np.asarray(alpha)
        X = np.asarray(X)

        # Number of quantiles to compute
        X, n_quantiles = self._check_input(X)
        n_alphas = alpha.size  # Number of probabilities

        # Matrix of computed quantiles
        quantiles = np.zeros((n_quantiles, n_alphas))

        if doSaveCDF or not do_optim:
            self._prepare_CDF()
        if doSaveCDF:
            self._CDF = np.empty((self._yCDF.size, n_quantiles))

        X_nodes = self.get_nodes(X, i_tree)

        # For each quantiles to compute
        for k in range(n_quantiles):
            weight = self._compute_weight(X_nodes[:, k], i_tree)

            # Compute the quantile by minimising the pinball function
            if do_optim:
                # The starting points are the percentiles
                # of the non-zero weights.
                y0 = np.percentile(self._output_sample[
                                   weight != 0], alpha * 100.)

                # For each alpha
                for i, alphai in enumerate(alpha):
                    # The quantile is obtain by the minimisation of the
                    # weighted check function.
                    if opt_method == "Cobyla":
                        quantiles[k, i] = fmin_cobyla(self._min_function,
                                                      y0[i], [],
                                                      args=(weight, alphai),
                                                      disp=verbose)

                    elif opt_method == "SQP":
                        epsilon = 1.E-1 * abs(y0[i])
                        quantiles[k, i] = fmin_slsqp(self._min_function,
                                                     y0[i],
                                                     args=(weight, alphai),
                                                     disp=verbose,
                                                     epsilon=epsilon)
                    else:
                        raise ValueError("Unknow optimisation method %s" %
                                         opt_method)
            else:
                CDF = self._infYY.dot(weight).ravel()  # Compute the CDF
                quantiles[k, :] = [self._yCDF.values[CDF >= alphai][0]
                                   for alphai in alpha]
                if doSaveCDF:
                    self._CDF[:, k] = CDF

        if n_quantiles == 1 and n_alphas == 1:
            return quantiles[0][0]
        elif n_quantiles == 1 or n_alphas == 1:
            return quantiles.ravel()
        else:
            return quantiles

    def _min_function(self, yi, w, alpha):
        """
        Minimisation function used to compute the conditional quantiles.
        The function needs the current value of $y$, the weight of each observation
        and the alpha value. The check function of the residuals between $y_i$ and the
        output sample, weighted by the observations' weights, is minimised.
        """
        # Weighted deviation between the current value and the output sample.
        # TODO: Think about using only the non-null weights to increase performance
        u = w*(self._output_sample.values - yi)
        return check_function(u, alpha).sum()
    
# ==============================================================================
# Setters
# ==============================================================================
    def _prepare_CDF(self):
        """
        If the value is set to 0, the quantile is taken from the output
        sample. Otherwise a new sample can be created to find the quantile.
        """
        self._yCDF = self._output_sample.sort_values(inplace=False)

        # Matrix of output samples inferior to a quantile value
        out_matrix = self._output_sample.values.reshape(self._n_sample, 1)
        cdf_matrix = self._yCDF.values.reshape(self._yCDF.size, 1).T
        self._infYY = DataFrame(out_matrix <= cdf_matrix).T

    def _computeImportanceOfTree(self, alpha, i):
        """

        """
        oob = self._oobID[i]
        X_oob = self._input_sample.values[oob, :]
        Yobs_oob = self._output_sample.values[oob]
        Yest_oob = self.compute_quantile(X_oob, alpha, i_tree=i)
        baseError = (check_function(Yobs_oob, Yest_oob, alpha)).mean()

        permError = np.empty(self._input_dim)
        for j in range(self._input_dim):
            X_oob_perm = np.array(X_oob)
            np.random.shuffle(X_oob_perm[:, j])
            Yest_oob_perm = self.compute_quantile(X_oob_perm, alpha, i_tree=i)
            permError[j] = check_function(Yobs_oob, Yest_oob_perm, alpha)\
                .mean()

        return (permError - baseError)

    def compute_importance(self, alpha):
        """

        """
        pool = ProcessingPool(self.n_jobs)
        errors = pool.map(self._computeImportanceOfTree,
                          [alpha] * self.n_estimators, range(self.n_estimators))
        return np.array(errors).mean(axis=0)
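
# A minimal usage sketch for the class above (not part of the original example).
# The toy data, n_estimators=50 and the 0.9 quantile are made-up illustrations;
# check_function is assumed to be the standard pinball (check) loss that the methods
# above call but do not define in this snippet, and compute_quantile also relies on
# scipy's fmin_cobyla being imported in the original module.
import numpy as np

def check_function(u, alpha):
    """Pinball/check loss: alpha*u where u >= 0 and (alpha - 1)*u where u < 0."""
    return u * (alpha - (u < 0))

X_train = np.random.rand(200, 2)
y_train = X_train[:, 0] + 0.1 * np.random.randn(200)

qrf = QuantileForest(n_estimators=50)
qrf.fit(X_train, y_train)
# conditional 0.9-quantile of y at a single new point
q90 = qrf.compute_quantile(np.array([[0.5, 0.5]]), 0.9)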
Пример #21
0
        index += 1
    return True

# Get a list of all the words in Brown corpus.
words = brown.words()

# Get frequency distribution on the given condition.
sent_fd = nltk.FreqDist(
            word.lower() for word in words
            if len(word) == length and
               check_condition(word, userinput)
        )               

# Display the top 3 frequent words if applicable.                
series = Series(sent_fd)
series.sort_values(ascending=False, inplace=True)
sumValues = series.sum()
top_words = series.keys()
count = len(top_words)
if count > 0:
    i = 0
    while i < count and i < 3:
        print(str(i + 1) + ': ' + top_words[i] + '  (' +
              str(round(100 * series.iloc[i] / sumValues, 1)) + ' %)')
        i += 1
else:
    print("It doesn't seem like there is any word like that.")



Пример #22
0
print(np.argmax(walkcum[index], axis=1))
print(np.mean(np.argmax(walkcum[index], axis=1)))
pd.Index

obj = Series([1,2,3])

obj.reindex()

data = DataFrame([[1,2,3],[4,5,6]])
data.drop()

np.argsort()

obj.rank()

obj.sort_values()


data.tail()

data.cov()

data.cov()

data.corr()

data.dropna()

data.loc

from pandas import Series

# In[119]:

feature_importance = model.feature_importances_
Series_feat_imp = Series(feature_importance, index=df_test.columns)

# In[120]:

Series_feat_imp.sort_values()

# In[121]:

plt.figure(figsize=(8, 8))
Series_feat_imp.sort_values(ascending=True).plot.barh()
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.show()

# In[122]:

submission = pd.read_csv('../input/gender_submission.csv')

# In[123]:

submission.head()

# In[124]:

prediction = model.predict(X_test)
Пример #24
0
def interrogator(corpus, 
            search, 
            query = 'any', 
            show = 'w',
            exclude = False,
            excludemode = 'any',
            searchmode = 'all',
            dep_type = 'collapsed-ccprocessed-dependencies',
            case_sensitive = False,
            quicksave = False,
            just_speakers = False,
            preserve_case = False,
            lemmatag = False,
            files_as_subcorpora = False,
            only_unique = False,
            random = False,
            only_format_match = False,
            multiprocess = False,
            spelling = False,
            regex_nonword_filter = r'[A-Za-z0-9:_]',
            gramsize = 2,
            split_contractions = False,
            do_concordancing = False,
            maxconc = 9999,
            **kwargs):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""

    only_conc = False
    no_conc = False
    if do_concordancing is False:
        no_conc = True
    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        only_conc = True
        no_conc = False

    # iteratively count conc lines
    numconc = 0

    # store kwargs
    locs = locals()
    
    if kwargs:
        for k, v in kwargs.items():
            locs[k] = v
        locs.pop('kwargs', None)

    import corpkit
    from interrogation import Interrogation
    from process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from other import as_regex
    from process import get_deps
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    from process import animator
    from dictionaries.word_transforms import wordlist, taglemma
    import corenlp_xml
    import codecs
    import signal

    original_sigint = signal.getsignal(signal.SIGINT)

    if kwargs.get('paralleling', None) is None:
        original_sigint = signal.getsignal(signal.SIGINT)
        
        def signal_handler(signal, frame):
            """pause on ctrl+c, rather than just stop loop"""   
            import signal
            import sys
            from time import localtime, strftime
            signal.signal(signal.SIGINT, original_sigint)
            thetime = strftime("%H:%M:%S", localtime())
            try:
                sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            except NameError:
                sel = input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            time = strftime("%H:%M:%S", localtime())
            print('%s: Interrogation resumed.\n' % time)
            signal.signal(signal.SIGINT, signal_handler)

        signal.signal(signal.SIGINT, signal_handler)

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')

    # convert path to corpus object
    if type(corpus) == str:
        from corpus import Corpus
        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from process import searchfixer
    search, search_iterable = searchfixer(search, query)
    
    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(list(search.keys())) == 1:
        query = list(search.values())[0]

    if 'l' in show and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr=WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict
        if hasattr(corpus, '__iter__'):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != list(search.values())[0] or len(list(search.keys())) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == 'each':
                im = True
                just_speakers = ['each']
            if just_speakers == ['each']:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in list(search.values())):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        speakr = dummy_args.get('speaker', False)
        import os
        from process import tregex_engine
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        to_write = '\n'.join([sent._parse_string.strip() for sent in sents \
                              if sent.parse_string is not None])
        to_write.encode('utf-8', errors = 'ignore')
        with open(to_open, "w") as fo:
            encd = to_write.encode('utf-8', errors = 'ignore') + '\n'
            fo.write(encd)
        q = list(search.values())[0]
        ops = ['-o', '-%s' % translated_option]
        concs = []
        res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True)
        if not no_conc:
            ops += ['-w', '-f']
            whole_res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True) 

            res = format_tregex(res)
            whole_res = format_tregex(whole_res, whole = True)
            concs = make_conc_lines_from_whole_mid(whole_res, res, speakr)

        if root:
            root.update()
        try:
            os.remove(to_open)
        except OSError:
            pass
        if countmode:
            return(len(res))
        else:
            return res, concs

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode('utf-8', errors = 'ignore') + '\n'
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, 
                                       speakr = False):
        import re, os
        if speakr is False:
            speakr = ''
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if '-join-'.join([f, whole, mid]) not in duplicates:
                duplicates.append('-join-'.join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)]
            for offstart, offend in offsets:              
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines

    def uniquify(conc_lines):
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith('u'):
                if word.lower() in list(taglemma.keys()):
                    word = taglemma[word.lower()]
                else:
                    if word == 'x':
                        word = 'Other'
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output

    def gettag(query, lemmatag = False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {'N': 'n',
                   'A': 'a',
                   'V': 'v',
                   'R': 'r',
                   'None': False,
                   '': False,
                   'Off': False}

        if lemmatag is False:
            tag = 'n' # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)')
            tagchecker = re.compile(r'^[A-Z]{1,4}$')
            qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '')
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], 'n')
        elif lemmatag:
            tag = lemmatag
        return tag

    def format_tregex(results, whole = False):
        """format tregex by show list"""
        if countmode:
            return results
        import re
        done = []
        
        if whole:
            fnames = [x for x, y in results]
            results = [y for x, y in results]

        if 'l' in show or 'pl' in show:
            lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and exclude.get('w'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('w'), word):
                        continue
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('l'), lemma):
                        continue
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('p'), word):
                        continue
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('pl'), lemma):
                        continue
            if exclude and excludemode == 'all':
                num_to_cause_exclude = len(list(exclude.keys()))
                current_num = 0
                if exclude.get('w'):
                    if re.search(exclude.get('w'), word):
                        current_num += 1
                if exclude.get('l'):
                    if re.search(exclude.get('l'), lemma):
                        current_num += 1
                if exclude.get('p'):
                    if re.search(exclude.get('p'), word):
                        current_num += 1
                if exclude.get('pl'):
                    if re.search(exclude.get('pl'), lemma):
                        current_num += 1   
                if current_num == num_to_cause_exclude:
                    continue                 

            for i in show:
                if i == 't':
                    bits.append(word)
                if i == 'l':
                    bits.append(lemma)
                elif i == 'w':
                    bits.append(word)
                elif i == 'p':
                    bits.append(word)
                elif i == 'pl':
                    bits.append(lemma)
            joined = '/'.join(bits)
            done.append(joined)

        if whole:
            done = zip(fnames, done)

        return done

    def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit

    def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True):
        from collections import Counter
        import re
        ngrams = Counter()
        result = []
        # if it's not a compiled regex
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        if pattern.lower() == 'any':
            pattern = r'.*'

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)
            
            #list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index+x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[' '.join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in list(ngrams.items()):
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return(len(result))
        else:
            return result

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value,
                          exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        comped = compiler(pattern)
        if comped == 'Bad query':
            return 'Bad query'
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re
        if concordancing:
            pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})'
        compiled_pattern = compiler(pattern)
        if compiled_pattern == 'Bad query':
            return 'Bad query'
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return(len(matches))
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        spell_out = []
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r

    def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re
        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})'
            pat = compiler(pat)
            if pat == 'Bad query':
                return 'Bad query'
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:   
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)
    
    locs['search'] = search
    locs['query'] = query
    locs['just_speakers'] = just_speakers
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess

    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from multiprocess import pmultiquery
        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    count_results = {}
    conc_results = {}
    # check if just counting
    countmode = 'c' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    ############################################
    # Determine the search function to be used #
    ############################################
    
    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and 't' in list(search.keys()):
        simple_tregex_mode = True
    else:
        if corpus.datatype == 'plaintext':
            if search.get('n'):
                raise NotImplementedError('Use a tokenised corpus for n-gramming.')
                #searcher = plaintext_ngram
                optiontext = 'n-grams via plaintext'
            if search.get('w'):
                if kwargs.get('regex', True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = 'Searching plaintext'

        elif corpus.datatype == 'tokens':
            if search.get('n'):
                searcher = tok_ngrams
                optiontext = 'n-grams via tokens'
            elif search.get('w'):
                if kwargs.get('regex', True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get('w')) == list:
                    searcher = tok_by_list
                optiontext = 'Searching tokens'
        only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l']
        if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())):
            raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse]))

        elif corpus.datatype == 'parse':
            if search.get('t'):
                searcher = slow_tregex
            elif search.get('s'):
                searcher = get_stats
                statsmode = True
                optiontext = 'General statistics'
                global numdone
                numdone = 0
                no_conc = True
                only_conc = False
                do_concordancing = False
            else:
                from depsearch import dep_searcher
                searcher = dep_searcher
                optiontext = 'Dependency querying'

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get('t'):
        translated_option = 't'
        query = search.get('t')

        # check the query
        q = tregex_engine(corpus = False, query = search.get('t'), 
                          options = ['-t'], check_query = True, root = root)
        if query is False:
            if root:
                return 'Bad query'
            else:
                return

        optiontext = 'Searching parse trees'
        if 'p' in show or 'pl' in show:
            translated_option = 'u'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 't' in show:
            translated_option = 'o'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 'w' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'c' in show:
            only_count = True
            translated_option = 'C'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __'  % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'l' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'

        query = search['t']

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for subcorpus in corpus.subcorpora:
            to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name):
        #    to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if no_conc:
        message = 'Interrogating'
    else:
        message = 'Interrogating and concordancing'
    if kwargs.get('printstatus', True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = '\n                 '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())])
        if search == {'s': r'.*'}:
            sformat = 'features'
        welcome = '\n%s: %s %s ...\n          %s\n          Query: %s\n          %s corpus ... \n' % \
                  (thetime, message, corpus.name, optiontext, sformat, message)
        print(welcome)

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(list(to_iterate_over.keys()))
    else:
        if search.get('s'):
            total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12
        else:
            total_files = sum([len(x) for x in list(to_iterate_over.values())])

    par_args = {'printstatus': kwargs.get('printstatus', True),
                'root': root, 
                'note': note,
                'length': total_files,
                'startnum': kwargs.get('startnum'),
                'denom': kwargs.get('denominator', 1)}

    term = None
    if kwargs.get('paralleling', None) is not None:
        from blessings import Terminal
        term = Terminal()
        par_args['terminal'] = term
        par_args['linenum'] = kwargs.get('paralleling')

    outn = kwargs.get('outname', '')
    if outn:
        outn = outn + ': '
    tstr = '%s%d/%d' % (outn, current_iter, total_files)
    p = animator(None, None, init = True, tot_string = tstr, **par_args)
    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        conc_results[subcorpus_name] = []
        count_results[subcorpus_name] = []
        results[subcorpus_name] = Counter()
        
        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ['-o', '-' + translated_option]                
            result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)

            if not countmode:
                result = format_tregex(result)

            if not no_conc:
                op += ['-w', '-f']
                whole_result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)
                
                if not only_format_match:
                    whole_result = format_tregex(whole_result, whole = True)

                conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False)

            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                result = Counter(result)
                results[subcorpus_name] += result
                if not no_conc:
                    for lin in conc_result:
                        if numconc < maxconc or not maxconc:
                            conc_results[subcorpus_name].append(lin)
                        numconc += 1

            current_iter += 1
            if kwargs.get('paralleling', None) is not None:
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            else:
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)

            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:
                slow_treg_speaker_guess = kwargs.get('outname', False)
                if corpus.datatype == 'parse':
                    with open(f.path, 'r') as data:
                        data = data.read()
                        from corenlp_xml.document import Document
                        try:
                            corenlp_xml = Document(data)
                        except:
                            print('Could not read file: %s' % f.path)
                            continue
                        if just_speakers:  
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if len(just_speakers) == 1:
                                slow_treg_speaker_guess = just_speakers[0]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res, conc_res = searcher(sents, search = search, show = show,
                            dep_type = dep_type,
                            exclude = exclude,
                            excludemode = excludemode,
                            searchmode = searchmode,
                            lemmatise = False,
                            case_sensitive = case_sensitive,
                            do_concordancing = do_concordancing,
                            only_format_match = only_format_match,
                            speaker = slow_treg_speaker_guess)
                        
                        if res == 'Bad query':
                            return 'Bad query'

                elif corpus.datatype == 'tokens':
                    import pickle
                    with codecs.open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    if not only_conc:
                        res = searcher(list(search.values())[0], data, split_contractions = split_contractions, 
                        concordancing = False)
                    if not no_conc:
                        conc_res = searcher(list(search.values())[0], data, split_contractions = split_contractions, 
                        concordancing = True)
                    if not no_conc:
                        for index, line in enumerate(conc_res):
                            line.insert(0, '')

                elif corpus.datatype == 'plaintext':
                    with codecs.open(f.path, 'rb', encoding = 'utf-8') as data:
                        data = data.read()
                        if not only_conc:
                            res = searcher(list(search.values())[0], data, 
                            concordancing = False)
                        if not no_conc:
                            conc_res = searcher(list(search.values())[0], data, 
                            concordancing = True)
                        if not no_conc:
                            for index, line in enumerate(conc_res):
                                line.insert(0, '')

                if countmode:
                    count_results[subcorpus_name] += [res]
                else:
                    # add filename and do lowercasing for conc
                    if not no_conc:
                        for index, line in enumerate(conc_res):
                            if searcher != slow_tregex:
                                line.insert(0, f.name)
                            else:
                                line[0] = f.name
                            if not preserve_case:
                                line[3:] = [x.lower() for x in line[3:]]
                            if spelling:
                                line = [correct_spelling(b) for b in line]
                            if numconc < maxconc or not maxconc:
                                conc_results[subcorpus_name].append(line)
                                numconc += 1

                    # do lowercasing and spelling
                    if not only_conc:
                        if not preserve_case:
                            if not statsmode:
                                res = [i.lower() for i in res]
                        if spelling:
                            if not statsmode:
                                res = [correct_spelling(r) for r in res]
                        #if not statsmode:
                        results[subcorpus_name] += Counter(res)
                        #else:
                        #results[subcorpus_name] += res

                if not statsmode:
                    current_iter += 1
                    if kwargs.get('paralleling', None) is not None:
                        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                    else:
                        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os
    if os.path.isfile('tmp.txt'):
        os.remove('tmp.txt')

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if not no_conc:
        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            pindex = 'c f s l m r'.split()  # plain string labels; columns are renamed below anyway
            for fname, spkr, start, word, end in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                fname = os.path.basename(fname)
                all_conc_lines.append(Series([sc_name,
                                     fname, \
                                     spkr, \
                                     start, \
                                     word, \
                                     end], \
                                     index = pindex))

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        conc_df = pd.concat(all_conc_lines, axis = 1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r']
        else:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link']

        if all(x == '' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis = 1, inplace = True)

        #if kwargs.get('note'):
        #    kwargs['note'].progvar.set(100)

        #if kwargs.get('printstatus', True):
        #    thetime = strftime("%H:%M:%S", localtime())
        #    finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(conc_df.index))
        #    print(finalstring)

        from interrogation import Concordance
        output = Concordance(conc_df)
        if only_conc:
            output.query = locs
            if quicksave:
                output.save()

            if kwargs.get('printstatus', True):
                thetime = strftime("%H:%M:%S", localtime())
                finalstring = '\n\n%s: Concordancing finished! %d results.' % (thetime, len(conc_df))
                print(finalstring)
            return output

        #output.query = locs

        #return output 

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    if not only_conc:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(count_results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in list(results.values()) for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index = sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis = 1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get('df1_always_df'):
                        df = Series(df.ix[0])
                        df.sort_values(ascending = False, inplace = True)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:   
                df.ix['Total-tmp'] = df.sum()
                the_tot = df.ix['Total-tmp']
                df = df[the_tot.argsort()[::-1]]
                df = df.drop('Total-tmp', axis = 0)

        # format final string
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %d matches.' % tot
            else:
                finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total)
            print(finalstring)

        if not no_conc:
            interro = Interrogation(results = df, totals = tot, query = locs, concordance = output)
        else:
            interro = Interrogation(results = df, totals = tot, query = locs)

        if quicksave:
            interro.save()
        
        return interro
import numpy as np
import pandas as pd
from pandas import Series
from numpy.random import randn

ser1 = Series([500, 1000, 1500], index=['a', 'c', 'b'])
print(ser1)

# sorting arrays by index
print(ser1.sort_index())

# Sort by values
print("Sort by values")
print(ser1.sort_values())

print(ser1.rank())

# Ranking a series is the basis of sorting
ser2 = Series(randn(10))
print(ser2)
print(ser2.rank())
print(ser2.sort_values())
print(ser2.rank())
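
# A small extra illustration (this series and the method choices are not part of the
# original example): rank() averages tied ranks by default, while method='min' assigns
# both tied values the lower rank.
ser3 = Series([7, 7, 3, 9])
print(ser3.rank())               # 2.5, 2.5, 1.0, 4.0
print(ser3.rank(method='min'))   # 2.0, 2.0, 1.0, 4.0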
Пример #26
0
    train_data = []
    train_target = train_file.sentiment_value[:9000]
    for i in range(9000):
        if key_word in CommentWord[train_file.content_id[i]]:
            train_data.append(1)
        else:
            train_data.append(0)

    BayesClassifer = BernoulliNB()

    train_x, test_x, train_y, test_y = cross_validation.train_test_split(
        train_data, train_target, test_size=0.1, random_state=13)

    y_one_hot = label_binarize(test_y, classes=[-1, 0, 1])

    BayesClassifer.fit(np.array(train_x).reshape(-1, 1), train_y)
    PredictProbability = BayesClassifer.predict_proba(
        np.array(test_x).reshape(-1, 1))
    test_auc = metrics.roc_auc_score(y_one_hot, PredictProbability)
    CommentDictionary[key_word] = test_auc
    print(key_word + '\t\ttest_auc\t' + str(test_auc))

#CommentDictionary = dict(sorted(CommentDictionary.items(), key=lambda d: d[1],reverse=True))

OutputInformation = Series(CommentDictionary)
ContentInformation = Series(CommentWord)
OutputInformation.sort_values(
    ascending=False).to_csv('./key_word_for_sentiment_value.csv')
ContentInformation.to_csv('./content_word.csv')
input()
    def test_sort_values(self):

        # check indexes are reordered corresponding with the values
        ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D'])
        expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C'])
        result = ser.sort_values()
        tm.assert_series_equal(expected, result)

        ts = self.ts.copy()
        ts[:5] = np.NaN
        vals = ts.values

        result = ts.sort_values()
        assert np.isnan(result[-5:]).all()
        tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:]))

        # na_position
        result = ts.sort_values(na_position='first')
        assert np.isnan(result[:5]).all()
        tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:]))

        # something object-type
        ser = Series(['A', 'B'], [1, 2])
        # no failure
        ser.sort_values()

        # ascending=False
        ordered = ts.sort_values(ascending=False)
        expected = np.sort(ts.dropna().values)[::-1]
        assert_almost_equal(expected, ordered.dropna().values)
        ordered = ts.sort_values(ascending=False, na_position='first')
        assert_almost_equal(expected, ordered.dropna().values)

        # ascending=[False] should behave the same as ascending=False
        ordered = ts.sort_values(ascending=[False])
        expected = ts.sort_values(ascending=False)
        assert_series_equal(expected, ordered)
        ordered = ts.sort_values(ascending=[False], na_position='first')
        expected = ts.sort_values(ascending=False, na_position='first')
        assert_series_equal(expected, ordered)

        pytest.raises(ValueError,
                      lambda: ts.sort_values(ascending=None))
        pytest.raises(ValueError,
                      lambda: ts.sort_values(ascending=[]))
        pytest.raises(ValueError,
                      lambda: ts.sort_values(ascending=[1, 2, 3]))
        pytest.raises(ValueError,
                      lambda: ts.sort_values(ascending=[False, False]))
        pytest.raises(ValueError,
                      lambda: ts.sort_values(ascending='foobar'))

        # inplace=True
        ts = self.ts.copy()
        ts.sort_values(ascending=False, inplace=True)
        tm.assert_series_equal(ts, self.ts.sort_values(ascending=False))
        tm.assert_index_equal(ts.index,
                              self.ts.sort_values(ascending=False).index)

        # GH 5856/5853
        # Series.sort_values operating on a view
        df = DataFrame(np.random.randn(10, 4))
        s = df.iloc[:, 0]

        def f():
            s.sort_values(inplace=True)

        pytest.raises(ValueError, f)
Пример #28
0
    from cluster import density_cluster

    name = 'path'
    distance_c = 12.3972318748
    m = '3_44'
    pile = 0
    id = np.load(Properties.getRootPath() + "/data/cache/" + name + "/id.npy")
    data = np.load(Properties.getRootPath() + "/data/cache/" + name + "/data.npy")
    id_index = Series(id.tolist())
    from cluster import density_cluster
    index_id = Series(id_index.index, index=id_index.values)
    distance = density_cluster.compute_distance(data)
    pile_id = DataFrame([], columns=['pile', 'size'])
    rho_id = density_cluster.rho_function(index_id, distance, distance_c=distance_c)
    rho_id = Series(rho_id, index=index_id.index)
    rho_id = rho_id.sort_values(ascending=False)
    #delta_id, data_id = density_cluster.delta_function(id_index, index_id, rho_id, distance)
    log.debug(rho_id)
    pile=['3_44']
    pile_max=14
    pile = density_cluster.pile_brother(index_id, id_index, distance, distance_c, pile,pile_max)
    log.debug("pile info:")
    log.debug(pile)
    distance_view(m, index_id, id_index, distance)
    log.debug("cluster_view: " + str(rho_id[index_id[m]]))
    cluster_distance_view(m, index_id, id_index, distance, distance_c)

    """
    import numpy
    import multiprocessing
       d = DataFrame([], columns=['i_id', 'j_id', 'i', 'j', 'value'])
series_custom = Series(rt_scores, index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]

fiveten = series_custom[5:11]

## 5. Reindexing ##

original_index = series_custom.index

sorted_by_index = series_custom.reindex(index=(sorted(original_index)))

## 6. Sorting ##

sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()

print(sc3[:10])
print(sc2[:10])

## 7. Transforming Columns With Vectorized Operations ##

series_normalized = series_custom / 20

## 8. Comparing and Filtering ##

criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]

## 9. Alignment ##
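
The Alignment section above is cut off in this snippet; a minimal hedged sketch of what Series alignment typically demonstrates:

from pandas import Series

s1 = Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = Series([10, 20, 30], index=['b', 'c', 'd'])
print(s1 + s2)  # aligned on index labels; non-overlapping labels become NaN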
Пример #30
0
from pandas import Series

series = Series([4400, 200, 3300, 400, 500], index=[3, 5, 2, 4, 1])
print(series)

# sort by indexes
series = series.sort_index()
print(series)

# sort by values ascending
series = series.sort_values()
print(series)

# sort by values descending
series = series.sort_values(ascending=False)
print(series)
Пример #31
0
    def assert_series_equal(
        cls, left: pd.Series, right: pd.Series, *args: Any, **kwargs: Any
    ) -> None:
        left = left.sort_values().reset_index(drop=True)
        right = right.sort_values().reset_index(drop=True)
        return super().assert_series_equal(left, right, *args, **kwargs)
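
Sorting the values and dropping the index makes the comparison order-insensitive. A standalone sketch of the same idea, assuming a recent pandas with pandas.testing available:

import pandas as pd
import pandas.testing as tm

left = pd.Series([3, 1, 2], index=['c', 'a', 'b'])
right = pd.Series([1, 2, 3], index=['x', 'y', 'z'])

# passes: both sides contain the same values once order and index are discarded
tm.assert_series_equal(left.sort_values().reset_index(drop=True),
                       right.sort_values().reset_index(drop=True))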
Пример #32
0
# c    3
# dtype: int64
print(obj)

# sort by index
obj2 = obj.sort_index()
# a    1
# b    2
# c    3
# d    0
# dtype: int64
print(obj2)

# ascending or descending order can also be specified
obj2 = obj.sort_index(ascending=False)
# d    0
# c    3
# b    2
# a    1
# dtype: int64
print(obj2)

# sort by values (some books use the order method, but the current version of pandas no longer provides it)
obj2 = obj.sort_values()
# d    0
# a    1
# b    2
# c    3
# dtype: int64
print(obj2)
Пример #33
0
import numpy as np
from pandas import Series
from collections import defaultdict
array =np.array([abs(int(x)) for x in np.random.randn(100)*30])
print(array)

def allMod(numberArray):
    dict =defaultdict(int)
    for number in numberArray:
        dict[number] += 1
    return dict


allMod = allMod(array)
print(allMod)
array = Series(allMod)
print(array)
print(array.sort_values(ascending=False))
print(array.sort_values(ascending=False).index[0])
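
The frequency counting above can also be written with value_counts, which already returns the counts sorted in descending order (an alternative sketch, not the original code):

import numpy as np
from pandas import Series

array = np.array([abs(int(x)) for x in np.random.randn(100) * 30])
counts = Series(array).value_counts()  # counts, sorted descending
print(counts)
print(counts.index[0])                 # the most frequent value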

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# sorting a Series
s1 = Series(np.random.randn(10))
print(s1)
print(s1.index)
print(s1.values)
s2 = s1.sort_values(ascending=False)
print(s2)
s3 = s2.sort_index()
print(s3)

# sorting a DataFrame

df1 = DataFrame(np.random.randn(20).reshape(4, 5),
                columns=["c1", "c2", "c3", "c4", "c5"])
print(df1)

df2 = df1.sort_values('c2', ascending=False)  # sort by column c2
print(df2)
df3 = df2.sort_index()  # sort by index
print(df3)

# exercise

movie = pd.read_csv("movie_metadata.csv")
tmp = DataFrame([
    movie['director_name'], movie['imdb_score'], movie["movie_facebook_likes"]
]).T.sort_values('imdb_score', ascending=False)
Пример #35
0
# read a text file in chunks
result = pd.read_csv('../pydata-book-1st-edition/ch06/ex6.csv')
result
pd.read_csv('../pydata-book-1st-edition/ch06/ex6.csv',nrows=5)   # read only the first 5 rows

# set the chunk size and read the file piece by piece
chunker = pd.read_csv('../pydata-book-1st-edition/ch06/ex6.csv', chunksize=1000)
chunker    # a TextParser object; it yields 10000/1000 = 10 pieces, each a DataFrame of 1000 rows

tot = Series([])
for piece in chunker:       # iterate over each chunk
	print piece['key'].value_counts()   # print this chunk's 'key' value counts (returns a Series)
	tot = tot.add(piece['key'].value_counts(), fill_value=0)
# this exploits the automatic index alignment of Series addition: overlapping index labels are summed and
# non-overlapping ones become missing; since adding an empty Series to the first chunk's counts would give
# all NaN, fill_value fills the missing side first (fill, then add); see the example below
tot = tot.sort_values(ascending=False)  # Series no longer has an order method; use sort_values
tot

# adding an empty Series to a non-empty one: compare the two cases
Series([]).add(Series([1,2,3],index=['a','b','c']))                # all NaN
Series([]).add(Series([1,2,3],index=['a','b','c']),fill_value=0)   # works as expected, confirming the fill happens before adding
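
An alternative sketch of the chunked counting above: concatenate the per-chunk counts and sum by label (same result under the same ex6.csv assumption, not the book's approach):

import pandas as pd

chunker = pd.read_csv('../pydata-book-1st-edition/ch06/ex6.csv', chunksize=1000)
tot = (pd.concat([piece['key'].value_counts() for piece in chunker])
         .groupby(level=0).sum()
         .sort_values(ascending=False))
print(tot)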

# writing out text
data = pd.read_csv('../pydata-book-1st-edition/ch06/ex5.csv')
data

# write to a csv file with to_csv
data.to_csv('../pydata-book-1st-edition/ch06/out.csv')
!type ..\pydata-book-1st-edition\ch06\out.csv

# writing to sys.stdout just prints the result, using '|' as the separator
Пример #36
0
ser3 = dframe2.iloc[0]
dframe2 - ser3


"""
Rank and Sort
"""

# not in order so that I can sort it
ser1 = Series(np.arange(3), index=['C','A','B'])

# sorts by index
ser1.sort_index()

# sort by value not index
ser1.sort_values()
ser2 = Series(randn(10))

# sort by value, lowest to highest
ser2.sort_values()

# ranking the values
ser2.rank()
ser2.rank()

ser3 = Series(randn(10))
ser3 = ser3.rank()
ser3.sort_values()


"""
'''
C    0
A    1
B    2
dtype: int64
'''

print(ser1.sort_index())
'''
A    1
B    2
C    0
dtype: int64
'''

print(ser1.sort_values())
'''
C    0
A    1
B    2
dtype: int64
'''

# ranking
from numpy.random import randn

ser2 = Series(randn(10))
print(ser2)
'''
0    1.290655
1    1.383484
Пример #38
0
print np.argmax(walkcum[index], axis=1)
print np.mean(np.argmax(walkcum[index], axis=1))
pd.Index

obj = Series([1, 2, 3])

obj.reindex()

data = DataFrame([[1, 2, 3], [4, 5, 6]])
data.drop()

np.argsort()

obj.rank()

obj.sort_values()

data.tail()

data.cov()

data.cov()

data.corr()

data.dropna()

data.loc

data.fillna()
Пример #39
0
frame8['b'].map(f)  # Series has an element-wise map method, so the above should also use map

# sorting and ranking
yc11 = Series(np.arange(4), index=list('dabc'))
yc11.sort_index()  # ascending by default

frame9 = DataFrame(np.arange(8).reshape((2, 4)),
                   index=['three', 'one'],
                   columns=['d', 'a', 'c', 'b'])
frame9.sort_index()
frame9.sort_index(axis=1)  # sort the columns

frame9.sort_index(axis=1, ascending=False)

yc12 = Series([7, 4, -3, 2])
yc12.sort_values()  # sort a Series by its values (older pandas used order for this)

yc12_1 = Series([7, 4, np.nan, -3, np.nan, 2])
yc12_1.sort_values()  # note how the NaN values are handled: they are sorted to the end in this version

frame10 = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame10
frame10.sort_index(by='b')
frame10.sort_values(by='b')  # to sort by the values in one or more columns, use by
frame10.sort_values(by=['a', 'b'])

yc13 = Series([7, -5, 7, 4, 2, 0, 4])
yc13.rank()  # by default, ties receive the average of their ranks
yc13.rank(method='first')  # ties are ranked in the order they appear in the data
yc13.rank(ascending=False, method='max')
yc13.rank(ascending=False, method='min')  # more intuitive: equal values share a rank and the next value skips ahead accordingly
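
A side-by-side sketch of the tie-handling methods mentioned above (illustration only, not from the original notes):

from pandas import Series, DataFrame

yc13 = Series([7, -5, 7, 4, 2, 0, 4])
print(DataFrame({m: yc13.rank(method=m)
                 for m in ['average', 'first', 'min', 'max', 'dense']}))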
Пример #40
0
ser1 = Series([500,1000,1500],index=['a','c','b'])
ser1


# In[57]:


#sort by index
ser1.sort_index()


# In[58]:


#sort by values
ser1.sort_values()


# In[60]:


ser1.rank()


# In[65]:


#ranking of series
ser2 = Series(randn(10))
ser2
import numpy as np
from pandas import Series, DataFrame

print "根据索引排序,对于DataFrame可以指定轴。"
obj = Series(range(4), index=["d", "a", "b", "c"])
print obj.sort_index()
frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
print frame.sort_index()
print frame.sort_index(axis=1)  # axis=1 operates on the columns
print frame.sort_index(axis=1, ascending=False)  # descending
print

print "根据值排序"
obj = Series([4, 7, -3, 2])
print obj.sort_values()  # order() has been removed
print

print "DataFrame指定列排序"
frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print frame
print frame.sort_values(by="b")  # sort_index(by = ...)已淘汰
print frame.sort_values(by=["a", "b"])
print

print "rank,求排名的平均位置(从1开始)"
obj = Series([7, -5, 7, 4, 2, 0, 4])
# corresponding ranks: -5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print obj.rank()
print obj.rank(method="first")  # rank ties by first appearance instead of averaging
print obj.rank(ascending=False, method="max")  # descending and taking the maximum tie rank, so -5 gets rank 7
Пример #42
0
70 10000.0
80 304500.0
90 58000.0
100 53708.8
110 20308.0
nan 7000.0

# solve this using the isnull() function provided by pandas!

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

emp = pd.read_csv("c:/data/emp.csv")
deptno = Series(emp['DEPARTMENT_ID'].unique())
deptno=deptno.sort_values()
deptno
for i in deptno:
    if pd.isnull(i):
        print(i, emp.loc[emp['DEPARTMENT_ID'].isnull(), 'SALARY'].sum())
    else:
        print (int(i), emp.loc[emp['DEPARTMENT_ID']==i, 'SALARY'].sum())



###############################################################################
   
▣ pandas      
■ groupby: computing aggregate values by grouping in pandas

import pandas as pd
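
A hedged sketch of the groupby approach mentioned above, assuming the same emp.csv columns (DEPARTMENT_ID, SALARY) and pandas >= 1.1 for dropna=False:

import pandas as pd

emp = pd.read_csv("c:/data/emp.csv")
# dropna=False keeps the rows with a missing DEPARTMENT_ID as their own group
print(emp.groupby('DEPARTMENT_ID', dropna=False)['SALARY'].sum())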
Пример #43
0
    def test_sort_values(self, datetime_series):

        # check indexes are reordered corresponding with the values
        ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"])
        expected = Series([1, 2, 3, 4], ["D", "B", "A", "C"])
        result = ser.sort_values()
        tm.assert_series_equal(expected, result)

        ts = datetime_series.copy()
        ts[:5] = np.NaN
        vals = ts.values

        result = ts.sort_values()
        assert np.isnan(result[-5:]).all()
        tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:]))

        # na_position
        result = ts.sort_values(na_position="first")
        assert np.isnan(result[:5]).all()
        tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:]))

        # something object-type
        ser = Series(["A", "B"], [1, 2])
        # no failure
        ser.sort_values()

        # ascending=False
        ordered = ts.sort_values(ascending=False)
        expected = np.sort(ts.dropna().values)[::-1]
        tm.assert_almost_equal(expected, ordered.dropna().values)
        ordered = ts.sort_values(ascending=False, na_position="first")
        tm.assert_almost_equal(expected, ordered.dropna().values)

        # ascending=[False] should behave the same as ascending=False
        ordered = ts.sort_values(ascending=[False])
        expected = ts.sort_values(ascending=False)
        tm.assert_series_equal(expected, ordered)
        ordered = ts.sort_values(ascending=[False], na_position="first")
        expected = ts.sort_values(ascending=False, na_position="first")
        tm.assert_series_equal(expected, ordered)

        msg = "ascending must be boolean"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending=None)
        msg = r"Length of ascending \(0\) must be 1 for Series"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending=[])
        msg = r"Length of ascending \(3\) must be 1 for Series"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending=[1, 2, 3])
        msg = r"Length of ascending \(2\) must be 1 for Series"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending=[False, False])
        msg = "ascending must be boolean"
        with pytest.raises(ValueError, match=msg):
            ts.sort_values(ascending="foobar")

        # inplace=True
        ts = datetime_series.copy()
        return_value = ts.sort_values(ascending=False, inplace=True)
        assert return_value is None
        tm.assert_series_equal(ts,
                               datetime_series.sort_values(ascending=False))
        tm.assert_index_equal(
            ts.index,
            datetime_series.sort_values(ascending=False).index)

        # GH#5856/5853
        # Series.sort_values operating on a view
        df = DataFrame(np.random.randn(10, 4))
        s = df.iloc[:, 0]

        msg = ("This Series is a view of some other array, to sort in-place "
               "you must create a copy")
        with pytest.raises(ValueError, match=msg):
            s.sort_values(inplace=True)
Пример #44
0
# python script for learning to work with arrays and dataframes using pandas.
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
# series is 1-D array like object and associated array of data labels - index. Series is dataframe with one column
obj1 = Series([1,3,5,-6,2])
print "%s \n" % obj1
print "%s \n" % obj1.values
print "%s \n" % obj1.index

obj2 = Series([1,3,5,-6], index=["d","b","a","c"])
print "%s \n" % obj2
print "%s \n" % obj2["b"]
print "%s \n" % obj2["b":"c"]
# sort series by its values
print "%s \n" % obj2.sort_values()
# sort series by index
print "%s \n" % obj2.sort_index()

# create Series from dictionary
sdata = {"ohio":3500, "texas":7100, "oregon":1600, "utah":500}
sdata
obj3 = Series(sdata)
print "%s \n" % sdata
print "%s \n" % obj3

# Dataframe
data={"state":["ohio","ohio","nevada"],
      "year":[2000,2001,2002],
      "pop":[1.5,1.7,3.6]}
print "%s \n" % data
    def test_sort_values_categorical(self):

        c = Categorical(["a", "b", "b", "a"], ordered=False)
        cat = Series(c.copy())

        # sort in the categories order
        expected = Series(
            Categorical(["a", "a", "b", "b"],
                        ordered=False), index=[0, 3, 1, 2])
        result = cat.sort_values()
        tm.assert_series_equal(result, expected)

        cat = Series(Categorical(["a", "c", "b", "d"], ordered=True))
        res = cat.sort_values()
        exp = np.array(["a", "b", "c", "d"], dtype=np.object_)
        tm.assert_numpy_array_equal(res.__array__(), exp)

        cat = Series(Categorical(["a", "c", "b", "d"], categories=[
                     "a", "b", "c", "d"], ordered=True))
        res = cat.sort_values()
        exp = np.array(["a", "b", "c", "d"], dtype=np.object_)
        tm.assert_numpy_array_equal(res.__array__(), exp)

        res = cat.sort_values(ascending=False)
        exp = np.array(["d", "c", "b", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(res.__array__(), exp)

        raw_cat1 = Categorical(["a", "b", "c", "d"],
                               categories=["a", "b", "c", "d"], ordered=False)
        raw_cat2 = Categorical(["a", "b", "c", "d"],
                               categories=["d", "c", "b", "a"], ordered=True)
        s = ["a", "b", "c", "d"]
        df = DataFrame({"unsort": raw_cat1,
                        "sort": raw_cat2,
                        "string": s,
                        "values": [1, 2, 3, 4]})

        # Cats must be sorted in a dataframe
        res = df.sort_values(by=["string"], ascending=False)
        exp = np.array(["d", "c", "b", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(res["sort"].values.__array__(), exp)
        assert res["sort"].dtype == "category"

        res = df.sort_values(by=["sort"], ascending=False)
        exp = df.sort_values(by=["string"], ascending=True)
        tm.assert_series_equal(res["values"], exp["values"])
        assert res["sort"].dtype == "category"
        assert res["unsort"].dtype == "category"

        # unordered cat, but we allow this
        df.sort_values(by=["unsort"], ascending=False)

        # multi-columns sort
        # GH 7848
        df = DataFrame({"id": [6, 5, 4, 3, 2, 1],
                        "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
        df["grade"] = Categorical(df["raw_grade"], ordered=True)
        df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a'])

        # sorts 'grade' according to the order of the categories
        result = df.sort_values(by=['grade'])
        expected = df.iloc[[1, 2, 5, 0, 3, 4]]
        tm.assert_frame_equal(result, expected)

        # multi
        result = df.sort_values(by=['grade', 'id'])
        expected = df.iloc[[2, 1, 5, 4, 3, 0]]
        tm.assert_frame_equal(result, expected)
Пример #46
0
import numpy as np
from pandas import Series, DataFrame

print('Sort by index; for a DataFrame the axis can be specified.')
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=list('dabc'))
print(frame.sort_index())
print(frame.sort_index(axis=1))
print(frame.sort_index(axis=1, ascending=False))  # descending

print('Sort by values')
obj = Series([4, 7, -3, 2])
print(obj.sort_values())  # order() has been removed

print('Sort a DataFrame by specified columns')
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))
print(frame.sort_values(by=['a', 'b']))

print('rank: the average rank position, starting from 1')
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
print(obj.rank(method='first'))  # rank ties by first appearance instead of averaging
print(obj.rank(ascending=False, method='max'))
frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
            if depth != 0:
                if 'l' in row[7].lower():
                    my_data[coc]['gw']['depth'].append(float(depth))
                    my_data[coc]['gw']['data'].append(float(row[6]))
                else:
                    my_data[coc]['so']['depth'].append(float(depth))
                    my_data[coc]['so']['data'].append(float(row[6]))

# Create a list of series to plot
plot_list = []
for key in my_data.keys():
    for media in my_data[key].keys():
        plot = Series(data=my_data[key][media]['data'],
                      index=my_data[key][media]['depth'],
                      name=str(key + '_' + media))
        plot = plot.sort_values('index')
        plot_list.append(plot)

rows = 4
cols = 12

fig = pylab.figure(figsize=(8, 6))
g_main = gridspec.GridSpec(rows, cols)
# Slug Test Table Test
columns = [
    'Date', 'Static Water\nLevel (ft bgs)', 'Slug Vol.\n(ft3)',
    'Screen Interval', 'Representative\nHydraulic Conductivity (m/d)'
]
row_label = ['Slug Test\nResults']
contents = [['Sep 16, 2017', 60, 0.1, '50 - 50.5', 1000]]
table_test = fig.add_subplot(g_main[3:, 7:])
Пример #48
0
#sorting series and dataframe
ser = Series([3,4,6,7,8,5,3,2], index=[2,3,5,7,6,9,4,8])  # the index needs as many labels as there are values
ser
ser.sort_index()

frame2 = frame2.reindex([2,1,0])
frame2
frame2.sort_index()

frame2 = frame2.reindex(columns = ["Speed","Humidity","Temp"])
frame2
frame2.sort_index(axis=1, ascending=False)

series2 = Series([100,200,500,50], index=['S','p','o','u'])
series2
series2.sort_values()

frame2.sort_values(by='Humidity')

#check for duplicate
series.index.is_unique
#sum
frame2.sum()
frame2.sum(axis=1)
frame2.idxmax()
frame2.idxmin()
#removing nan
from pandas import Series
import  numpy as np
ser = Series([1,2,3,4,np.nan],index=['a','b','c','d','e'])
ser
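
The snippet above stops short of the 'removing nan' step; a likely continuation (an assumption, not from the original) would be:

ser.dropna()          # drop the NaN entry at index 'e'
ser[ser.notnull()]    # the same result via boolean filtering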
Пример #49
0
    def __init__(self, data: Series):

        self._data: Series = data.sort_values()
Пример #50
0
def plot_heatmap(dataframe,
                 vmin=None,
                 vmax=None,
                 cmap=None,
                 center=None,
                 robust=False,
                 annot=None,
                 fmt='.2g',
                 annot_kws=None,
                 linewidths=0,
                 linecolor='white',
                 cbar=False,
                 cbar_kws=None,
                 cbar_ax=None,
                 square=False,
                 xticklabels=True,
                 yticklabels=True,
                 mask=None,
                 data_type='continuous',
                 normalization_method=None,
                 normalization_axis=0,
                 max_std=3,
                 axis_to_sort=None,
                 cluster=False,
                 row_annotation=(),
                 column_annotation=(),
                 annotation_colors=(),
                 title=None,
                 xlabel=None,
                 ylabel=None,
                 xlabel_rotation=0,
                 ylabel_rotation=90,
                 filepath=None,
                 **kwargs):
    """
    Plot heatmap.
    :param dataframe:
    :param vmin:
    :param vmax:
    :param cmap:
    :param center:
    :param robust:
    :param annot:
    :param fmt:
    :param annot_kws:
    :param linewidths:
    :param linecolor:
    :param cbar:
    :param cbar_kws:
    :param cbar_ax:
    :param square:
    :param xticklabels:
    :param yticklabels:
    :param mask:
    :param data_type:
    :param normalization_method:
    :param normalization_axis:
    :param max_std:
    :param axis_to_sort:
    :param cluster:
    :param row_annotation:
    :param column_annotation:
    :param annotation_colors: list; a list of matplotlib color specifications
    :param title:
    :param xlabel:
    :param ylabel:
    :param xlabel_rotation:
    :param ylabel_rotation:
    :param filepath:
    :param kwargs:
    :return:
    """

    df = dataframe.copy()

    if normalization_method:
        df = normalize_2d_or_1d(df,
                                normalization_method,
                                axis=normalization_axis).clip(
                                    -max_std, max_std)
    values = unique(df.values)

    if len(row_annotation) or len(column_annotation):
        if len(row_annotation):
            if isinstance(row_annotation, Series):
                row_annotation = row_annotation.copy()
                if not len(row_annotation.index
                           & df.index):  # Series but without proper index
                    row_annotation.index = df.index
            else:
                row_annotation = Series(row_annotation, index=df.index)

            row_annotation.sort_values(inplace=True)
            df = df.ix[row_annotation.index, :]

        if len(column_annotation):
            if isinstance(column_annotation, Series):
                column_annotation = column_annotation.copy()
                if not len(column_annotation.index
                           & df.columns):  # Series but without proper index
                    column_annotation.index = df.columns
            else:
                column_annotation = Series(column_annotation, index=df.columns)

            column_annotation.sort_values(inplace=True)
            df = df.ix[:, column_annotation.index]

    if axis_to_sort in (0, 1):
        a = array(df)
        a.sort(axis=axis_to_sort)
        df = DataFrame(a, index=df.index)

    elif cluster:
        row_indices, column_indices = get_dendrogram_leaf_indices(dataframe)
        df = df.iloc[row_indices, column_indices]
        if isinstance(row_annotation, Series):
            row_annotation = row_annotation.iloc[row_indices]
        if isinstance(column_annotation, Series):
            column_annotation = column_annotation.iloc[column_indices]
    plt.figure(figsize=FIGURE_SIZE)

    gridspec = GridSpec(10, 10)

    ax_top = plt.subplot(gridspec[0:1, 2:-2])
    ax_center = plt.subplot(gridspec[1:8, 2:-2])
    ax_bottom = plt.subplot(gridspec[8:10, 2:-2])
    ax_left = plt.subplot(gridspec[1:8, 1:2])
    ax_right = plt.subplot(gridspec[1:8, 8:9])

    ax_top.axis('off')
    ax_bottom.axis('off')
    ax_left.axis('off')
    ax_right.axis('off')

    if not cmap:
        if data_type == 'continuous':
            cmap = CMAP_CONTINUOUS
        elif data_type == 'categorical':
            cmap = CMAP_CATEGORICAL
        elif data_type == 'binary':
            cmap = CMAP_BINARY
        else:
            raise ValueError(
                'Target data type must be one of {continuous, categorical, binary}.'
            )

    heatmap(df,
            vmin=vmin,
            vmax=vmax,
            cmap=cmap,
            center=center,
            robust=robust,
            annot=annot,
            fmt=fmt,
            annot_kws=annot_kws,
            linewidths=linewidths,
            linecolor=linecolor,
            cbar=cbar,
            cbar_kws=cbar_kws,
            cbar_ax=cbar_ax,
            square=square,
            ax=ax_center,
            xticklabels=xticklabels,
            yticklabels=yticklabels,
            mask=mask,
            **kwargs)

    if data_type == 'continuous':  # Plot colorbar
        cax, kw = make_axes(ax_bottom,
                            location='bottom',
                            fraction=0.16,
                            cmap=CMAP_CONTINUOUS,
                            norm=Normalize(values.min(), values.max()),
                            ticks=[values.min(),
                                   values.mean(),
                                   values.max()])
        ColorbarBase(cax, **kw)
        decorate(ax=cax)

    elif data_type in ('categorical', 'binary'):  # Plot category legends
        if len(values) < 30:
            horizontal_span = ax_center.axis()[1]
            vertival_span = ax_center.axis()[3]
            for i, v in enumerate(values):
                x = (horizontal_span / len(values) /
                     2) + i * horizontal_span / len(values)
                y = 0 - vertival_span * 0.09
                ax_center.plot(x,
                               y,
                               'o',
                               markersize=16,
                               aa=True,
                               clip_on=False)
                ax_center.text(x,
                               y - vertival_span * 0.05,
                               v,
                               horizontalalignment='center',
                               **FONT)

    decorate(title=title,
             xlabel=xlabel,
             ylabel=ylabel,
             xlabel_rotation=xlabel_rotation,
             ylabel_rotation=ylabel_rotation,
             max_xtick_size=10,
             ax=ax_center)

    if len(row_annotation):
        if len(set(row_annotation)) <= 2:
            cmap = CMAP_BINARY
        else:
            if len(annotation_colors):
                cmap = ListedColormap(annotation_colors)
            else:
                cmap = CMAP_CATEGORICAL
        heatmap(DataFrame(row_annotation),
                ax=ax_right,
                cbar=False,
                xticklabels=False,
                yticklabels=False,
                cmap=cmap)

    if len(column_annotation):
        if len(set(column_annotation)) <= 2:
            cmap = CMAP_BINARY
        else:
            if len(annotation_colors):
                cmap = ListedColormap(annotation_colors)
            else:
                cmap = CMAP_CATEGORICAL
        heatmap(DataFrame(column_annotation).T,
                ax=ax_top,
                cbar=False,
                xticklabels=False,
                yticklabels=False,
                cmap=cmap)

    save_plot(filepath)
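
A hypothetical call to plot_heatmap, assuming this module's helper functions and constants (FIGURE_SIZE, the CMAP_* colormaps, decorate, save_plot) are importable alongside it:

import numpy as np
from pandas import DataFrame, Series

data = DataFrame(np.random.randn(10, 6))
row_groups = Series([0] * 5 + [1] * 5, index=data.index)  # hypothetical binary row annotation
plot_heatmap(data, data_type='continuous', cluster=True,
             row_annotation=row_groups, title='demo heatmap')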
Пример #51
0
def histogram_skewed_normal_distributions(ax: plt.Axes, series: pd.Series, var: str):
    values = series.sort_values().values
    n, bins, patches = ax.hist(values, 10, density=True, edgecolor='grey')
    distributions = compute_skewed_normal_distributions(values, bins)
    multiple_line_chart(ax, values, distributions, 'Best fit for %s' % var, var, 'probability')
'''
C    0
A    1
B    2
dtype: int64
'''

print(ser1.sort_index())
'''
A    1
B    2
C    0
dtype: int64
'''

print(ser1.sort_values())
'''
C    0
A    1
B    2
dtype: int64
'''

# ranking
from numpy.random import randn

ser2 = Series(randn(10))
print(ser2)
'''
0    1.290655
1    1.383484
Пример #53
0
import pandas as pd
from pandas import Series, DataFrame

ser1 = Series(range(3), index = ['C','A','B'])
ser1

# use sort index to sort by index
ser1.sort_index()

# use order to sort by values
ser1.order()

from numpy.random import randn
ser2 = Series(randn(10))
ser2

# ranking
ser2.sort_values()

ser2.rank()

ser2.sort_values(ascending = False)

ser3 = Series(randn(10))
ser3

ser3.rank()
ser3 = ser3.sort_values()
ser3.rank()

Пример #54
0
def plot_cdf(s: pd.Series):
    s_sorted = s.sort_values()
    n = s.shape[0]
    f = np.arange(n, dtype=float) / n
    plt.plot(s_sorted, f, label=s.name)
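
A short usage sketch for plot_cdf above; the figure handling here is an assumption, since plot_cdf draws on the current matplotlib axes:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

s = pd.Series(np.random.randn(500), name='sample')
plot_cdf(s)   # sorted values against their cumulative fraction
plt.legend()
plt.show()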