Example #1
def eod_position_pd_run(pricing_environment, valuation_date):
    r = utils.get_redis_conn(redis_ip)
    position_result = r.get(EOD_BASIC_POSITIONS)
    position = pd.read_msgpack(position_result)

    risk_result = r.get(EOD_BASIC_RISKS_ + pricing_environment.lower())
    risk = pd.read_msgpack(risk_result)

    cash_flow_result = r.get(EOD_BASIC_CASH_FLOW)
    cash_flow = pd.read_msgpack(cash_flow_result)

    listed_option_positions_result = r.get(EOD_BASIC_LISTED_OPTION_POSITION_ +
                                           pricing_environment.lower())
    listed_option_positions = pd.read_msgpack(listed_option_positions_result)

    headers = utils.login(data_resource_ip, login_body)

    rpt = []
    if not position.empty:
        rpt = eod_position_report_pd.eod_position_report(
            position, risk, cash_flow, listed_option_positions,
            pricing_environment, data_resource_ip, headers, valuation_date)
    position_result = JSONEncoder().encode(rpt)
    r.set(EOD_CUSTOM_POSITION_ + pricing_environment.lower(),
          str(position_result))
Example #2
def msgpack_assertMeta(filename, frames=None, redo=False):
    '''Asserts that the .meta file for a given .msg file exists and returns the data in the .meta file once it exists'''
    meta_out_file = filename.replace(".msg", ".meta")
    print(meta_out_file)
    meta_frames = None
    if (os.path.exists(meta_out_file) and not redo):
        #Need to check for latin encodings due to weird pandas default
        try:
            meta_frames = pd.read_msgpack(meta_out_file)
        except UnicodeDecodeError as e:
            meta_frames = pd.read_msgpack(meta_out_file, encoding='latin-1')
    if meta_frames is None:
        if frames is None:
            print(
                "Bulk reading .msg for metaData assertion. Be patient, reading in slices not supported."
            )
            print(filename)
            #Need to check for latin encodings due to weird pandas default
            try:
                frames = pd.read_msgpack(filename)
            except UnicodeDecodeError as e:
                frames = pd.read_msgpack(filename, encoding='latin-1')
        meta_frames = {"NumValues": frames["NumValues"]}

    if (not os.path.exists(meta_out_file) or redo):
        pd.to_msgpack(meta_out_file, meta_frames)

    return meta_frames
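A minimal usage sketch of the caching pattern above, assuming a hypothetical data.msg file written with DataFrame.to_msgpack and a pandas version that still ships to_msgpack/read_msgpack (both were removed in pandas 1.0):

import pandas as pd

# Hypothetical input: a .msg file containing a frame with a NumValues column.
pd.DataFrame({'NumValues': [3, 5, 2]}).to_msgpack('data.msg')

# The first call bulk-reads data.msg and writes data.meta;
# later calls only read the small .meta file.
meta = msgpack_assertMeta('data.msg')
print(meta['NumValues'])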
Example #3
    def recommend_by_shop(self, start_date, end_date):
        start_date = DateStrToDate(start_date)
        end_date = DateStrToDate(end_date, hour=23, minute=59, seconds=59)
        case_data = pd.read_msgpack(red.get('case_data'))
        payment_logs = pd.read_msgpack(red.get('payment_logs'))
        shop_reflect = json.loads(red.get('shop_data').decode())
        result_list = []

        data = case_data[case_data['apply_date'] <= end_date]
        payment_limit = payment_logs[(payment_logs['date'] >= start_date)
                                     & (payment_logs['date'] < end_date)]

        data_excute = Data_Execute()
        date_index = "%s/%s" % (start_date.strftime('%Y-%m-%d'),
                                end_date.strftime('%m-%d'))

        for shop_id in shop_reflect:
            shop_data = data_excute.recommend_by_shop(
                table=data[data['shop_id'] == int(shop_id)],
                payment_logs=payment_limit[payment_limit['shop_id'] == int(
                    shop_id)],
                date_index=date_index,
                data_name=shop_reflect[str(shop_id)],
                start_date=start_date)
            result_list.append(shop_data)
        return result_list
Example #4
def learn(basepath, features_file, labels_file):
    # Load the data
    print 'Loading data...'
    features_data = pd.read_msgpack(load_data(basepath, features_file)['data'])
    labels_data = pd.read_msgpack(load_data(basepath, labels_file)['data'])
    df = pd.concat([features_data, labels_data], axis=1)

    # Process features
    samples, labels = preprocess.process_data(df)

    # How many samples are we going to leave out for the test set?
    nb_test = int(len(labels) * 0.2)
    split = len(labels) - nb_test

    # Prepare training and test sets
    X_train = np.array(samples[:split])
    y_train = labels[:split]
    X_test = np.array(samples[split + 1:])
    y_test = labels[split + 1:]
    print len(X_train), 'train sequences'
    print len(X_test), 'test sequences'

    # How many classes?
    num_classes = np.max(labels) + 1
    print num_classes, 'classes'

    # Train Model
    train_and_save(X_train, X_test, y_train, y_test, num_classes, basepath)
Example #6
def concat_data():
    df_ohlc = pd.read_msgpack(f'{DATASET_PATH}/twse_ohlc.msgpack').rename(
        columns={
            '日期': 'date',
            '開盤指數': 'open',
            '最高指數': 'high',
            '最低指數': 'low',
            '收盤指數': 'close'
        }).reset_index(drop=True)

    df_ob = pd.read_msgpack(f'{DATASET_PATH}/twse_orderbook.msgpack')

    df_ob_open = df_ob[df_ob['時間'].map(
        lambda x: x.time() == dt.time(9, 0, 0))].copy()
    df_ob_close = df_ob[df_ob['時間'].map(
        lambda x: x.time() == dt.time(13, 30, 0))].copy()

    df_ob_open['date'] = df_ob_open['時間'].map(
        lambda x: x.date())  #.strftime('%Y%m%d')
    df_ob_open = df_ob_open.rename(columns={
        '累積委託買進筆數': 'order_buy',
        '累積委託賣出筆數': 'order_sell'
    })[['date', 'order_buy', 'order_sell']]

    df_ob_close['date'] = df_ob_close['時間'].map(lambda x: x.date())
    df_ob_close = df_ob_close.rename(columns={'累積成交金額': 'volume'})[[
        'date', 'volume'
    ]]

    df_twse = df_ohlc.set_index('date').join(
        df_ob_open.set_index('date')).join(
            df_ob_close.set_index('date')).dropna()
    print('data concat done.')
    return df_twse
Example #7
    def load(self, fn):
        self.xs_not_decomposed = pd.read_msgpack(fn + '.X.msg')
        self.xs_not_decomposed = self.xs_not_decomposed.to_dict()
        self.ys_not_decomposed = pd.read_msgpack(fn + '.Y.msg')
        self.idx = self.ys_not_decomposed.index.max()
        self.ys_not_decomposed = self.ys_not_decomposed.to_dict()

        self.start()
Example #8
 def main():
     with TLOG('read data'):
         df = pd.read_msgpack('data/z6_ts_events.msgpack')
         df_discount = pd.read_msgpack('data/z6_ts_discount.msgpack')
         ids = np.load('data/z6_ts_user_id_merchant_id.npy')
     feature = UserMerchantFeature(df, df_discount=df_discount, keys=ids)
     df = feature.process()
     df.to_msgpack('data/z6_ts_feature_user_merchant.msgpack')
Example #9
def Files_Read_Qo_Q190(link, listEvents=False, umbral=0.5, MinDays=15):
    #Read observed
    try:
        Qo = pd.read_msgpack('/Users/nicolas/LambdaExp/BaseData/USGS/' + link +
                             '.msg')
    except:
        #Read information from the links
        LinksData = pd.read_msgpack('LinkData.msg')
        USGS_id = LinksData.index[LinksData['Link'] == int(link)].values[0]
        #Read from the web.
        print('Warning: reading from the web...')
        Qo = db.WEB_Get_USGS(USGS_id, '2008-01-01', '2018-12-30')
        Qo.to_msgpack('/Users/nicolas/LambdaExp/BaseData/USGS/' + link +
                      '.msg')
        print('Message: Streamflow saved as a msgpack as link:' + link)
    Qo = Qo.resample('H').mean()
    #Read simulated
    Qs = pd.read_msgpack('/Users/nicolas/LambdaExp/BaseData/HLM190/' + link +
                         '.msg')
    Qs = Qs.resample('H').mean()
    #Find events
    shared = Qo.index.intersection(Qs.index)
    Qs = Qs[shared]
    Qo = Qo[shared]
    pos1, pos2 = ser.Runoff_FindEvents(Qo, Qs, umbral=umbral)
    #Estimate the annual maximum streamflow
    QmaxA = np.median(Qo.resample('A').max())
    Qbase = Qo.resample('A').apply(
        lambda x: np.percentile(x[np.isfinite(x)], 50)).mean()
    #Selects only the good events
    pos1V2 = []
    pos2V2 = []
    for i, j in zip(pos1, pos2):
        M = Qo[i:j].max()
        if M > QmaxA * umbral:
            #Check amount of nans.
            NaNPercent = Qo[i:j][np.isnan(Qo[i:j])].size / Qo[i:j].size
            if NaNPercent < 0.2:
                #Check the time between the peak and the start of the event
                while Qo[i] < Qbase * 2.5:
                    i = i + pd.Timedelta('5h')
            Td = j - i
            if Td.days < MinDays:
                pos1V2.append(i)
                pos2V2.append(j)
    #List the events
    if listEvents:
        c = 0
        for p1, p2 in zip(pos1V2, pos2V2):
            qp = '%.2f ' % np.nanmax(Qo[p1:p2])
            print(c, qp, p1)
            c += 1
    #Updates the class Evento
    Evento.Qobs = Qo
    Evento.Qsim = Qs
    Evento.pos1 = pos1V2
    Evento.pos2 = pos2V2
Example #10
 def load(self, only_get=None):
     train_data = pandas.read_msgpack('cache/dataset/train.msg')
     merge_data = pandas.read_msgpack('cache/dataset/merge.msg')
     valid_data = pandas.read_msgpack('cache/dataset/valid.msg')
     if only_get is not None:
         return train_data.head(only_get), merge_data.head(
             only_get), valid_data.head(only_get)
     else:
         return train_data, merge_data, valid_data
Example #11
    def _load(self, as_df):
        with gzip.open('rf_cache/dataset/train.msg.gz', 'r') as f:
            if as_df:
                td = pandas.read_msgpack(f)
            else:
                td = pandas.read_msgpack(f).values

        f = np.load('rf_cache/folds.npz')['data']

        return td, f
Example #12
 def read_events(self):
     split = self.split.name
     with TLOG('read events'):
         df_events = pd.read_msgpack(f'data/z6_ts_{split}_events.msgpack')
         df_discount = pd.read_msgpack(
             f'data/z6_ts_{split}_discount.msgpack')
     df = with_discount(df_events, df_discount)
     with TLOG('build index events'):
         events = IndexedEvents(df, key_column=self.key_column)
     return events
Example #13
 def main():
     with TLOG('read data'):
         df = pd.read_msgpack('data/z6_ts_events.msgpack')
         df_discount = pd.read_msgpack('data/z6_ts_discount.msgpack')
         coupon_ids = np.load('data/z6_ts_coupon_id.npy')
     coupon_feature = CouponFeature(df,
                                    df_discount=df_discount,
                                    keys=coupon_ids)
     df = coupon_feature.process()
     df.to_msgpack('data/z6_ts_feature_coupon.msgpack')
Example #14
 def read_input_to_pandas(self, columnList=[], indexCol="Sample"):
     if self.isGzipped:
         with gzip.open(self.filePath) as path:
             df = pd.read_msgpack(path)
     else:
         df = pd.read_msgpack(self.filePath)
     df = df.reset_index()
     if len(columnList) > 0:
         df = df[columnList]
     return df
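For context, a sketch of the write side this reader expects when isGzipped is set; calling to_msgpack with no path returns the packed bytes, which are then gzip-compressed on disk (assumes a pandas version that still provides to_msgpack, and a hypothetical input.msg.gz path):

import gzip
import pandas as pd

# Hypothetical input file for the gzipped branch above.
df = pd.DataFrame({'Sample': ['A', 'B'], 'value': [1.0, 2.0]}).set_index('Sample')
with gzip.open('input.msg.gz', 'wb') as f:
    f.write(df.to_msgpack())  # to_msgpack() without a path returns the serialized bytes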
Example #15
def get_human_22_fake_genome():
    from mbf_genomics.testing import MockGenome
    import gzip

    genes = pd.read_msgpack(
        gzip.GzipFile(get_sample_data(Path("mbf_align/hs_22_genes.msgpack.gz")))
    ).reset_index()
    tr = pd.read_msgpack(
        gzip.GzipFile(get_sample_data(Path("mbf_align/hs_22_transcripts.msgpack.gz")))
    ).reset_index()
    return MockGenome(df_genes=genes, df_transcripts=tr, chr_lengths={"22": 50_818_468})
Example #16
def get_translational_efficiency(project_id):
    from main import get_db
    rdb = get_db()
    rp = rdb.get("{}_rpkm_rp".format(project_id))
    rna = rdb.get("{}_rpkm_rna".format(project_id))
    list_of_samples = []
    if rp is None or rna is None:
        return render_template("translational_efficiency.html", samples=list_of_samples,
                               error="No data for project: {}".format(project_id))
    rp_df = pd.read_msgpack(rp)
    rna_df = pd.read_msgpack(rna)
    samples = list(rp_df.columns)
    samples.remove('gene_name')
    list_of_samples = samples
    if request.method == "GET":
        return render_template("translational_efficiency.html", samples=list_of_samples)

    selected_samples = request.form.getlist('selected_samples')
    if not selected_samples:
        return render_template("translational_efficiency.html", samples=list_of_samples, error="No samples selected")

    apply_filter = request.form.get('apply_filter') == "True"
    min_y = request.form.get('min_y', -100)
    max_y = request.form.get('max_y', 100)
    min_y = int(min_y)
    max_y = int(max_y)
    plot_series = []
    for sample in selected_samples:
        gene_names = rp_df['gene_name'].tolist()
        rp = rp_df[sample].astype(float).tolist()
        rna = rna_df[sample].astype(float).tolist()
        df = pd.DataFrame(columns=['gene_name', 'x', 'y'])
        df['gene_name'] = gene_names
        df['rpkm_rna'] = rna
        df['rpkm_rp'] = rp
        df['log2(rp)'] = np.log2(df['rpkm_rp'])
        df['log2(rna)'] = np.log2(df['rpkm_rna'])
        df['x'] = df['log2(rna)']
        df['y'] = df['log2(rna)'] / df['log2(rp)']

        if apply_filter:
            df = df.loc[df['y'] >= min_y]
            df = df.loc[df['y'] <= max_y]
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.dropna()
        series = {
            'name': sample,
            'data': df.to_dict('records')
        }
        plot_series.append(series)
    return render_template("translational_efficiency.html", samples=list_of_samples, selected_samples=selected_samples,
                           apply_filter=apply_filter, min_y=min_y, max_y=max_y, plot_series=plot_series)
Example #17
def FeedToRedis(r=r, source_dir="intermediate-results/", mode="batch"):
    if not mode in ["batch", "recent"]:
        print "[Error] the mode is not correct!"
        exit()

    result_filename = "mirror-news-ann-distance-20.result"
    msg_filename = "news-id-tfidf50-topic-category.msg"
    if mode == "recent":
        result_filename = "recent-" + result_filename
        msg_filename = "recent-" + msg_filename

    if os.path.exists(source_dir + result_filename) == True:
        f = open(source_dir + result_filename, 'r')
    else:
        print(
            "[Warning] Cannot find the latest list of related news. Use the fallback list now. Please run daily_batch.sh to get the latest related news"
        )
        f = open('fallback/fallback.result', 'r')

    if os.path.exists(source_dir + msg_filename) == True:
        df = pd.read_msgpack(source_dir + msg_filename)
    else:
        print(
            "[Warning] Cannot find the latest metadata of related news. Use the fallback metadata now. Please run daily_batch.sh to get the latest metadata"
        )
        df = pd.read_msgpack('fallback/fallback.msg')

    print("Loading the KNN list...")

    news_dict = dict()
    for line in f:
        news_id, knn_raw = line.replace("\n", "").split("\t")
        knn_list = json.loads(knn_raw)

        r_news = []
        for (r_id, _) in knn_list:
            r_dict = get_facets(df, r_id)
            r_news.append(r_dict)

        n_dict = get_facets(df, news_id)
        n_dict['knn_list'] = knn_list
        n_dict['related_news'] = r_news

        news_dict["related-news-v2-" + news_id] = json.dumps(n_dict)
    """
    if you find error msg: MISCONF Redis is configured to save RDB snapshots, 
    try this on redis-cli: config set stop-writes-on-bgsave-error no
    """
    print "Total: " + str(len(news_dict))
    print "Feed all to Redis..."
    r.mset(news_dict)
    print "Done!"
Example #18
def histogram(wild_card='*thrown', key='energy', bins=10):
    paths = glob(wild_card)
    df = pd.read_msgpack(paths[0])
    df.dropna(inplace=True)
    counts, bins = np.histogram(df[key], bins=bins)
    total_bincounts = np.zeros(len(bins) - 1)

    for path in paths:
        df = pd.read_msgpack(path)
        df.dropna(inplace=True)
        bincounts = np.histogram(df[key], bins=bins)[0]
        total_bincounts += bincounts
    return total_bincounts, bins
Example #19
def load(file_name, mode=None):
    if mode == 'HD5':
        df = pd.read_hdf(file_name, 'dataframe')
    elif mode == 'msgpack':
        df = pd.read_msgpack(file_name)
    elif mode == 'parquet':
        df = pd.read_parquet(file_name,
                             engine='pyarrow',
                             use_pandas_metadata=True)
    elif mode == 'pickle.gzip':
        df = pd.read_pickle(file_name, compression='gzip')
    elif mode == 'feather':
        df = read_feather(file_name)
    return df
Example #20
 def msgpack_revert(self, val, cls=dict, checker=None, **kw):
     if cls == pandas_df_type.__name__ or cls == pandas_df_type:
         import pandas
         if isinstance(val, requests.Response):
             return pandas.read_msgpack(val.content, **keep(kw, []))
         else:
             return pandas.read_msgpack(io.BytesIO(val), **keep(kw, []))
     if isinstance(val, requests.Response):
         val = val.content
     import msgpack
     val = msgpack.loads(val, **keep(kw, []))
     val = self.schema_revert(val, cls, checker=checker)
     return val
Example #21
    def Statistic_index(self, statistic_date, statistic_date_end, compare_date,
                        compare_date_end, shop_get):
        statistic_date = DateStrToDate(statistic_date)
        statistic_end_date = DateStrToDate(statistic_date_end,
                                           hour=23,
                                           minute=59,
                                           seconds=59)
        compare_date = DateStrToDate(compare_date)
        compare_end_date = DateStrToDate(compare_date_end,
                                         hour=23,
                                         minute=59,
                                         seconds=59)
        if int(shop_get) == 0 or not shop_get:
            shop_ids = [i for i in range(2, 24)]
        else:
            shop_ids = [int(shop_get)]
        case_data = pd.read_msgpack(red.get('case_data'))
        payment_logs = pd.read_msgpack(red.get('payment_logs'))
        data_execute = Data_Execute()

        statistic_data = case_data[
            (case_data['apply_date'] >= statistic_date)
            & (case_data['apply_date'] < statistic_end_date) &
            (case_data['shop_id'].isin(shop_ids))]
        statistic_compare = case_data[
            (case_data['apply_date'] >= compare_date)
            & (case_data['apply_date'] < compare_end_date) &
            (case_data['shop_id'].isin(shop_ids))]

        payment_data = payment_logs[(payment_logs['date'] >= statistic_date) &
                                    (payment_logs['date'] < statistic_end_date)
                                    & (payment_logs['shop_id'].isin(shop_ids))]
        payment_compare = payment_logs[
            (payment_logs['date'] >= compare_date)
            & (payment_logs['date'] < compare_end_date) &
            (payment_logs['shop_id'].isin(shop_ids))]

        statistic = data_execute.index_stactic(
            statistic_data,
            1,
            statistic_date.strftime('%Y-%m-%d'),
            index=True,
            payment_logs=payment_data)
        compare = data_execute.index_stactic(statistic_compare,
                                             2,
                                             compare_date.strftime('%Y-%m-%d'),
                                             index=True,
                                             payment_logs=payment_compare)
        result_list = [statistic, compare]
        return result_list
Example #22
    def main(cls, split):
        df_raw_offline = pd.read_msgpack(f'data/z1_raw_offline.msgpack')
        df_raw_test = pd.read_msgpack(f'data/z1_raw_test.msgpack')
        user_id_index = np.unique(
            np.concatenate([
                df_raw_offline['user_id'].unique(),
                df_raw_test['user_id'].unique()
            ]))
        events = cls(user_id_index)
        LOG.info('events feed_offline')
        events.feed_offline(df_raw_offline)
        df_online_coupon = pd.read_msgpack(
            f'data/z1_raw_online_coupon.msgpack')
        LOG.info('events feed_online_coupon')
        events.feed_online_coupon(df_online_coupon)
        df_online_click = pd.read_msgpack(f'data/z1_raw_online_click.msgpack')
        LOG.info('events feed_online_click')
        events.feed_online_click(df_online_click)
        LOG.info('events feed_test')
        events.feed_test(df_raw_test)
        LOG.info('events to_frame')
        df = events.to_frame(split)
        df.to_msgpack(f'data/z6_ts_{split.name}_events.msgpack')

        LOG.info('build_discount_table')
        df_discount = cls.build_discount_table(df)
        df_discount.to_msgpack(f'data/z6_ts_{split.name}_discount.msgpack')

        df_offline_events = df[df['event_type'].isin([
            'offline_receive_coupon',
            'offline_buy_with_coupon',
            'offline_buy_without_coupon',
        ])]
        LOG.info('build_index_of user_id')
        user_id_index = cls.build_index_of(df_offline_events, 'user_id')
        np.save(f'data/z6_ts_{split.name}_user_id.npy', user_id_index)

        for key in ['merchant_id', 'coupon_id']:
            LOG.info('build_index_of {}', key)
            arr = cls.build_index_of(df_offline_events, key)
            np.save('data/z6_ts_{}_{}.npy'.format(split.name, key), arr)

        LOG.info('build_index_of user_id_merchant_id')
        arr = cls.build_index_of(df_offline_events, ['user_id', 'merchant_id'])
        np.save(f'data/z6_ts_{split.name}_user_id_merchant_id.npy', arr)

        LOG.info('build_index_of user_id_coupon_id')
        arr = cls.build_index_of(df_offline_events, ['user_id', 'coupon_id'])
        np.save(f'data/z6_ts_{split.name}_user_id_coupon_id.npy', arr)
Example #23
def histogram2d(wild_card='*thrown', x='energy', y='number_photons', bins=10):
    paths = glob(wild_card)
    df = pd.read_msgpack(paths[0])
    df.dropna(inplace=True)
    h = np.histogram2d(x=df[x], y=df[y], bins=bins)
    total_counts = h[0]
    bins = h[1]
    total_counts = 0

    for path in paths:
        df = pd.read_msgpack(path)
        df.dropna(inplace=True)
        bincounts = np.histogram2d(x=df[x], y=df[y], bins=bins)[0]
        total_counts += bincounts
    return total_counts, bins
Example #24
def _getStore(f, storeType):
    '''Helper Function - Gets the HDFStore or frames for the file and storeType'''
    store = None
    frames = None
    if (storeType == "hdf5"):
        store = pd.HDFStore(f)
    elif (storeType == "msgpack"):
        print(
            "Bulk reading .msg. Be patient, reading in slices not supported.")
        sys.stdout.flush()
        #Need to check for latin encodings due to weird pandas default
        try:
            frames = pd.read_msgpack(f)
        except UnicodeDecodeError as e:
            frames = pd.read_msgpack(f, encoding='latin-1')
    return store, frames
Example #25
def read_dataset():
    with TLOG('read dataframes'):
        df_test_full = pd.read_msgpack(
            f'data/z6_ts_{SPLIT.name}_merged_test.msgpack')
        df_train_full = pd.read_msgpack(
            f'data/z6_ts_{SPLIT.name}_merged_train.msgpack')
    # columns
    features = list(
        df_test_full.columns.difference([
            'user_id',
            'merchant_id',
            'coupon_id',
            'discount_name',
            'date',
            'label',
        ]))
    features = GOOD_FEATURES
    # features = list(
    #     set(itertools.chain(*FEATULE_LEVELS.values()))
    #     - set(BAD_FEATURES)
    # )
    print(pretty(features))
    test_submit_cols = ['user_id', 'coupon_id', 'date']
    if TEST_HAS_LABEL:
        test_submit_cols += ['label']
    train_submit_cols = ['user_id', 'coupon_id', 'date', 'label']
    # test submit
    df_test = df_test_full[features]
    df_submit = format_date(df_test_full.loc[df_test.index, test_submit_cols])
    LOG.info('df_test {}', df_test.shape)
    LOG.info('df_submit {}', df_submit.shape)
    # split train validate
    mask = np.random.rand(len(df_train_full)) < 0.05
    df_validate = df_train_full.loc[mask, features + ['label']]
    df_validate_submit = format_date(df_train_full.loc[mask,
                                                       train_submit_cols])
    df_train = df_train_full.loc[~mask, features + ['label']]
    df_train_submit = format_date(df_train_full.loc[~mask, train_submit_cols])
    LOG.info('df_train {}', df_train.shape)
    LOG.info('df_train_submit {}', df_train_submit.shape)
    LOG.info('df_validate {}', df_validate.shape)
    LOG.info('df_validate_submit {}', df_validate_submit.shape)

    df_train_x, df_train_y = split_feature_label(df_train)
    df_validate_x, df_validate_y = split_feature_label(df_validate)

    ret = df_train_x, df_train_y, df_validate_x, df_validate_y, df_test, df_submit, df_validate_submit, df_train_submit
    return [x.copy() for x in ret]
Example #26
    def read_all(self, freq, **kwargs):
        """
            Read the entire timeseries record for all matching timeseries instances.
            Optionally exclude timeseries from the final DataFrame by specifying IDs in the exclude argument.

            :param freq: Timeseries data frequency.
            :type freq: string
            :param excludes: IDs of timeseries to exclude from final DataFrame.
            :type excludes: array[string]
            :param kwargs: Attributes to match against timeseries instances (e.g. source, measurand).
            :type kwargs: kwargs

            :returns: pandas.DataFrame -- Timeseries data.
        """
        url = self.__attach_kwargs_to_url(
                self.server + '/read_all/{0}.{1}'.format(freq, self.format),
                kwargs
            )

        if self.format == 'msgpack':
            return pd.read_msgpack(urlopen(url))
        elif self.format == 'json':
            return pd.read_json(urlopen(url))
        else:
            raise NotImplementedError('Unsupported format: {0}'.format(self.format))
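A minimal client-side sketch of the pattern read_all relies on: pandas.read_msgpack accepts any file-like object, so the HTTP response can be deserialized directly. The URL below is hypothetical, and the server is assumed to return bytes produced by DataFrame.to_msgpack:

import pandas as pd
from urllib.request import urlopen

# Hypothetical endpoint serving a msgpack-serialized DataFrame.
url = 'http://timeseries.example.com/read_all/daily.msgpack'
df = pd.read_msgpack(urlopen(url))
print(df.head())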
Example #27
 def _load_chunks(self):
     for chunk in pandas.read_msgpack(self.buf, iterator=True):
         for col in self.schema.cols:
             if isinstance(col, big_dt):
                 # converting big_dt column
                 chunk[col.name] = chunk[col.name].map(datetime.datetime.fromtimestamp, na_action='ignore')
         yield chunk
Example #28
def testrun(gname):

    method = 'tepitope'  #'iedbmhc1'#'netmhciipan'
    path = 'test'
    gfile = os.path.join(genomespath, '%s.gb' % gname)
    df = sequtils.genbank2Dataframe(gfile, cds=True)
    #names = list(df.locus_tag[:1])
    names = ['VP24']
    alleles1 = [
        "HLA-A*02:02", "HLA-A*11:01", "HLA-A*32:07", "HLA-B*15:17",
        "HLA-B*51:01", "HLA-C*04:01", "HLA-E*01:03"
    ]
    alleles2 = [
        "HLA-DRB1*0101", "HLA-DRB1*0305", "HLA-DRB1*0812", "HLA-DRB1*1196",
        "HLA-DRB1*1346", "HLA-DRB1*1455", "HLA-DRB1*1457", "HLA-DRB1*1612",
        "HLA-DRB4*0107", "HLA-DRB5*0203"
    ]
    P = base.getPredictor(method)
    P.iedbmethod = 'IEDB_recommended'  #'netmhcpan'
    P.predictProteins(df,
                      length=11,
                      alleles=alleles2,
                      names=names,
                      save=True,
                      path=path)
    f = os.path.join('test', names[0] + '.mpk')
    df = pd.read_msgpack(f)
    P.data = df
    #b = P.get_binders(data=df)
    #print b[:20]
    base.getScoreDistributions(method, path)
    return
Example #29
def read_python2_hdf5_dataframe(h5_filepath, key):
    h5_filepath = os.path.realpath(h5_filepath)

    msgpack_filepath = h5_filepath + '.' + key.replace('/', '_') + '.msgpack'

    filepath_time = os.path.getmtime(h5_filepath)

    if not os.path.exists(
            msgpack_filepath) or filepath_time > os.path.getmtime(
                msgpack_filepath):
        logging.info(
            'msgpack file {} doesnt exists, creating'.format(msgpack_filepath))
        convert_python2_hdf5_to_msgpack(h5_filepath, key, msgpack_filepath)

    else:
        logging.info('msgpack file {} exists'.format(msgpack_filepath))

    data = pd.read_msgpack(msgpack_filepath)

    # Fix columns names and string columns that are bytes
    data.columns = data.columns.astype(str)
    for col in data:
        try:
            newcol = data[col].str.decode('utf-8')
        except AttributeError:
            continue
        if not newcol.isnull().any():
            data[col] = newcol

    return data
Example #30
def getScoreDistributions(method, path):
    """Get global score distributions and save quantile values for each allele
       Assumes all the files in path represent related proteins"""

    files = glob.glob(os.path.join(path, '*.mpk'))
    results = []
    P = getPredictor(method)
    key = P.scorekey
    #if method == 'iedbmhc1':
    #    P.data = pd.read_msgpack(files[0])
    #    key = P.getScoreKey()
    print key
    for f in files[:200]:
        df = pd.read_msgpack(f)
        #df = df.dropna()
        x = df.pivot_table(index='peptide', columns='allele', values=key)
        #print x[:5]
        results.append(x)
    result = pd.concat(results)
    percs = np.arange(0.01,1,0.01)
    bins = result.quantile(percs)
    #reverse is best values are lower
    if P.operator == '<':
        bins.index = pd.Series(bins.index).apply(lambda x: 1-x)
    outfile = os.path.join(path,'quantiles.csv')
    print outfile
    bins.to_csv(outfile,float_format='%.3f')
    df= pd.read_csv(outfile,index_col=0)
    print df.ix[0.96]
    return
Example #31
def load_pandas(file_name='review.json', use_cache=True):
    cache_path = os.path.join(CACHE_PATH, f'load_pandas.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        ratings, user_counts, active_users = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        line_count = len(
            open(os.path.join(EXCEL_PATH, file_name),
                 encoding='utf8').readlines())
        user_ids, business_ids, stars, dates, text = [], [], [], [], []
        with open(os.path.join(EXCEL_PATH, file_name), encoding='utf8') as f:
            for line in tqdm(f, total=line_count):
                blob = json.loads(line)
                user_ids += [blob["user_id"]]
                business_ids += [blob["business_id"]]
                stars += [blob["stars"]]
                dates += [blob["date"]]
                text += [blob["text"]]

        ratings = pd.DataFrame({
            "user_id": user_ids,
            "business_id": business_ids,
            "rating": stars,
            "text": text,
            "date": dates
        })
        user_counts = ratings["user_id"].value_counts()
        active_users = user_counts.loc[user_counts >= 5].index.tolist()

        pd.to_msgpack(cache_path, (ratings, user_counts, active_users))
        print(f'Dumping to {cache_path}')
    return ratings, user_counts, active_users
Example #32
 def read_df_from_redis(self, redisConn, key):
   try:
     return pd.read_msgpack(redisConn.get(key))
   except:
     if DEBUG:
       traceback.print_exc()
     return -1
Example #33
def test_mse(neighborhood_size=5, filtertype="collaborative filtering"):
    """Tests the mse of predictions based on a given number of neighborhood sizes

    neighborhood_size -- the largest neighborhood size to test; sizes 1 through this number are evaluated (so 5 tests neighborhoods of length 1, 2, 3, 4 and 5)
    filtertype -- the type of similarity to compute the MSE for
    """
    # init variables
    all_df = helpers.json_to_df()
    df = helpers.split_data(all_df)
    ut = helpers.create_utility_matrix(df[0])

    if filtertype == "collaborative filtering":
        print("Creating needed variables...")
        sim = helpers.similarity_matrix_cosine(ut)
    elif filtertype == "content based":
        print("Creating needed variables...")
        cats = helpers.json_to_df_categories()
        fancy_cats = helpers.extract_genres(cats)
        ut_cats = helpers.pivot_genres(fancy_cats)
        sim = helpers.create_similarity_matrix_categories(ut_cats)
    elif filtertype == "spacy":
        print("Creating needed variables...")
        sim = pd.read_msgpack("spacy_similarity.msgpack")
    else:
        print("Please enter a valid filtertype")
        return

    print("Starting calculations...")
    mses = {}
    # test the mse based on the length of the neighborhood
    for i in range(1, neighborhood_size + 1):
        predictions = helpers.predict_ratings(sim, ut, df[1], i).dropna()
        amount = len(predictions)
        mses[i] = helpers.mse(predictions)
    return mses, amount
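A hypothetical call of test_mse above, using the precomputed spaCy similarity matrix that is loaded with pd.read_msgpack:

# Evaluate the MSE for neighborhood sizes 1..5 with the spaCy-based similarity matrix.
mses, n_predictions = test_mse(neighborhood_size=5, filtertype="spacy")
print(mses)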
Example #34
    def loadProject(self, filename=None, asksave=False):
        """Open project file"""

        w=True
        if asksave == True:
            w = self.closeProject()
        if w is None:
            return
        if filename is None:
            filename = filedialog.askopenfilename(defaultextension='.dexpl',
                                                    initialdir=os.getcwd(),
                                                    filetypes=[("project","*.dexpl"),
                                                               ("All files","*.*")],
                                                    parent=self.main)
        if not filename:
            return
        if os.path.isfile(filename):
            #pb = self.progressDialog()
            #t = threading.Thread()
            #t.__init__(target=pd.read_msgpack, args=(filename))
            #t.start()
            data = pd.read_msgpack(filename)

        self.newProject(data)
        self.filename = filename
        self.main.title('%s - DataExplore' %filename)
        self.projopen = True
        return
Example #35
def read_output(path,layer_dims):
    """ Input: layer_dims, length 3 list of dimensions of the output of bottleneck layer
        Returns ndarray of shape (num_batches, layer_dims) """
    data = pd.read_msgpack(path)
    batch_size = data.index.size
    dims = [batch_size] + layer_dims
    return data.as_matrix().reshape(dims)
Example #36
    def _load(self):
        df = pandas.read_msgpack(self.buf)
        for col in self.schema.cols:
            if isinstance(col, big_dt):
                # converting big_dt column
                df[col.name] = df[col.name].map(datetime.datetime.fromtimestamp, na_action='ignore')

        return df
Example #37
    def testLoad(self):
        """Test re-loading predictions"""

        infile = os.path.join(self.testdir, 'ZEBOVgp1.mpk')
        pred = pd.read_msgpack(infile)
        P = base.getPredictor('iedbmhc1')
        P.data = pred
        return
Example #38
def data():
    # Set CSS properties for th elements in dataframe
    try:
        rdb.get("data")
        data = pd.read_msgpack(rdb.get("data"))
        return render_template('index.html', data=data.to_html(index=False, justify='center', classes="table table-striped"))
    except Exception as e:
        return render_template('index.html', data="<p>no data found</p><br/>Exception: " + str(e) + "<p></br></br>run /timereport-fetch in slack first</p></br>")
Example #39
    def load(self, filename, filetype=None):
        """Load file, if no filetype given assume it's msgpack format"""

        if filetype == '.pickle':
            self.df = pd.read_pickle(filename)
        else:
            self.df = pd.read_msgpack(filename)
        return
Example #40
    def load_msgpack(self, filename):
        """Load a msgpack file"""

        size = round((os.path.getsize(filename)/1.0485e6),2)
        print (size)
        df = pd.read_msgpack(filename)
        name = os.path.splitext(os.path.basename(filename))[0]
        self.load_dataframe(df, name)
        return
Example #41
 def get_val(self, key):
     ## All data used by mined is a pandas DataFrame unless it is TICKERS data
     ## TICKERS data is a plain list
     if 'TICKERS' in key:
         data = self.redis_client.lrange(key, 0, -1)
         data = list(map(lambda x: x.decode('utf-8'), data))
     else:
         data = pd.read_msgpack(self.redis_client.get(key))  # how to fetch DataFrame-typed data from Redis
         ### Note: the DataFrame is stored in Redis with something like redis.set(key, df.to_msgpack(compress='zlib'))
     return data
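A round-trip sketch of the storage convention the comments describe, assuming a reachable local Redis instance and a pandas version that still ships to_msgpack/read_msgpack; the key name is only illustrative:

import pandas as pd
import redis

r = redis.Redis(host='localhost', port=6379)  # assumed local instance
df = pd.DataFrame({'close': [100.5, 101.2], 'volume': [1200, 900]})

# Write side: store the zlib-compressed msgpack bytes under a key.
r.set('SAMPLE_OHLCV_DF', df.to_msgpack(compress='zlib'))

# Read side mirrors get_val above; read_msgpack detects and undoes the compression.
restored = pd.read_msgpack(r.get('SAMPLE_OHLCV_DF'))
print(restored.equals(df))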
Example #42
 def msg_io(self, name, func, **kwargs):
     """Read data from msgpack. If not available, calculate and store."""
     cd = self.cache_dir()
     msgpath = os.path.join(cd, name + MSGTLD)
     if os.path.isfile(msgpath):
         data = pd.read_msgpack(msgpath)
     else:
         ensure_dir(cd)
         data = func(**kwargs)
         data.to_msgpack(msgpath)
     return data
Example #43
    def loadmsgpack(self, filename):
        """Load a msgpack file"""

        df = pd.read_msgpack(filename)
        name = os.path.splitext(os.path.basename(filename))[0]
        if hasattr(self,'sheets'):
            self.addSheet(sheetname=name, df=df)
        else:
            data = {name:df}
            self.newProject(data)
        return
Example #44
File: io.py Project: adgirish/ray
def read_msgpack(path_or_buf,
                 encoding='utf-8',
                 iterator=False):

    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pd.read_msgpack(path_or_buf, encoding, iterator)
    ray_frame = from_pandas(port_frame, get_npartitions())

    return ray_frame
Example #45
    def loadmsgpack(self, filename):
        """Load a msgpack file"""

        size = round((os.path.getsize(filename)/1.0485e6),2)
        print (size)
        df = pd.read_msgpack(filename)
        name = os.path.splitext(os.path.basename(filename))[0]
        if hasattr(self,'sheets'):
            self.addSheet(sheetname=name, df=df)
        else:
            data = {name:{'table':df}}
            self.newProject(data)
        return
Example #46
    def __get_list(self, list_name, kwargs):

        url = self.__attach_kwargs_to_url(
                self.server + '/list/{0}.{1}'.format(list_name, self.format),
                kwargs
            )

        if self.format == 'msgpack':
            return pd.read_msgpack(urlopen(url)).values.tolist()
        elif self.format == 'json':
            return pd.read_json(urlopen(url)).values.tolist()
        else:
            raise NotImplementedError('Unsupported format: {0}'.format(self.format))
Example #47
 def pull_df(self, md5):
     """Wrapper for the Workbench get_dataframe method
         Args:
             md5: pull the dataframe identified by this md5
         Returns:
             The uncompressed/unserialized dataframe
     """
     try:
         _packed_df = self.workbench.get_dataframe(md5)
         _df = pd.read_msgpack(lz4.loads(_packed_df))
         return _df
     except zerorpc.exceptions.RemoteError as e:
         return repr_to_str_decorator.r_to_s(self._data_not_found)(e)
Example #48
def initialize_mission_control(memory):

    # Get subscription sockets
    subscription_sender, subscription_receiver = subscription_sockets()

    # Initialize subscriptions
    initialize_subscriptions(subscription_sender)

    # Initialize Feed Handler
    while True:
        message = subscription_receiver.recv()
        ticker = message.split('_', 1)[0]
        message = pd.read_msgpack(message.split('_', 1)[1])
        print message
Example #49
def getPredictions(path,tag,method='tepitope',q=0.96):
    """Get predictions from file system"""

    q=round(q,2)
    #preds = OrderedDict()
    cutoffs = {}
    filename = os.path.join(path, tag+'.mpk')
    if not os.path.exists(filename):
        return
    df = pd.read_msgpack(filename)
    pred = base.getPredictor(name=method, data=df)
    cutoffs = pred.allelecutoffs = getCutoffs(path, method, q)
    pred = pred
    return pred
Example #50
    def openProject(self, filename=None):
        """Open project file"""

        if filename is None:
            filename = filedialog.askopenfilename(defaultextension='.dexpl',
                                                    initialdir=os.getcwd(),
                                                    filetypes=[("project","*.dexpl"),
                                                               ("All files","*.*")],
                                                    parent=self.main)
        if not filename:
            return
        if os.path.isfile(filename):
            data = pd.read_msgpack(filename)
        self.newProject(data)
        self.filename = filename
        return
Example #51
def read(site_loc, sheet=0, verbose=True):
    "Read saved excel sheet into dataframe"
    fn = join("data", site_loc, "all_{}.msg".format(sheet))
    exfile = glob(join("data", site_loc, "*.xlsx"))[0]

    xl_workbook = xlrd.open_workbook(exfile)
    sheet_names = xl_workbook.sheet_names()
    del xl_workbook

    df = pd.read_msgpack(fn)

    if verbose:
        print("{} => {}".format(sheet_names, sheet_names[sheet]))
        with open(join("data", site_loc, "description.txt")) as f:
            print(f.read())
        print("Nulls: {} / {}".format(df["2"].isnull().sum(), len(df)))
    return df
Example #52
def param_table(e=None, query_str=QSTR, debug=False, rho_limits=None,
                use_cache=True):
    cached_table = files['params_cache']
    if path.isfile(cached_table) and use_cache:
        return pd.read_msgpack(cached_table)
    if e is None:
        if debug:
            e = test_events()
        else:
            e = pip2015events()
    if rho_limits is None:
        rho_limits = rholimits
    data = e.summary(col='paper', split_date=pd.datetime(2014,7,1))
    del(e)
    gc.collect()
    data = sf.apply_rho_intervals(data, rho_limits)
    if len(query_str)<1:
        return data
    return data.query(query_str)
Example #53
def param_table(e=None, cond=cond, debug=False, rho_limits=None,
                use_cache=True, split_date=pd.datetime(2014,7,1), **kws):
    cached_table = files['params_cache']
    if path.isfile(cached_table) and use_cache:
        return pd.read_msgpack(cached_table)
    if e is None:
        if debug:
            e = test_events(**kws)
        else:
            e = events(**kws)
    if rho_limits is None:
        rho_limits = RHO_LIMITS
    data = e.summary(col='paper', split_date=split_date)
    del(e)
    gc.collect()
    data = apply_rho_intervals(data, rho_limits)
    if cond is None:
        return data
    return data.where(cond)
Example #54
File: io.py Project: leeong05/orca
def read_frame(fname, ftype=None, return_ftype=False):
    if ftype is None:
        with magic.Magic() as m:
            ftype = m.id_filename(fname)
            if ftype[:4] == 'data':
                ftype = 'msgpack'
            elif ftype.find('ASCII') != -1 or ftype.find('Image') != -1:
                ftype = 'csv'
            elif ftype[:4] == '8086':
                ftype = 'pickle'
            else:
                ftype = None
    if ftype == 'msgpack':
        df = pd.read_msgpack(fname)
    elif ftype == 'csv':
        df = pd.read_csv(fname, header=0, parse_dates=[0], index_col=0)
    elif ftype == 'pickle':
        df = pd.read_pickle(fname)
    if ftype is not None:
        return (df, ftype) if return_ftype else df
    raise Exception('File type not recognized for {}'.format(fname))
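A hypothetical call of read_frame above: when ftype is omitted, the file type is sniffed with python-magic, so the same entry point covers msgpack, CSV and pickle inputs:

# 'universe.msg' is a placeholder path to a DataFrame previously saved with to_msgpack.
df, ftype = read_frame('universe.msg', return_ftype=True)
print(ftype)  # expected: 'msgpack'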
Example #55
def testrun(gname):

    method = 'tepitope'#'iedbmhc1'#'netmhciipan'
    path='test'
    gfile = os.path.join(genomespath,'%s.gb' %gname)
    df = sequtils.genbank2Dataframe(gfile, cds=True)
    #names = list(df.locus_tag[:1])
    names=['VP24']
    alleles1 = ["HLA-A*02:02", "HLA-A*11:01", "HLA-A*32:07", "HLA-B*15:17", "HLA-B*51:01",
              "HLA-C*04:01", "HLA-E*01:03"]
    alleles2 = ["HLA-DRB1*0101", "HLA-DRB1*0305", "HLA-DRB1*0812", "HLA-DRB1*1196", "HLA-DRB1*1346",
            "HLA-DRB1*1455", "HLA-DRB1*1457", "HLA-DRB1*1612", "HLA-DRB4*0107", "HLA-DRB5*0203"]
    P = base.getPredictor(method)
    P.iedbmethod='IEDB_recommended' #'netmhcpan'
    P.predictProteins(df,length=11,alleles=alleles2,names=names,
                        save=True,path=path)
    f = os.path.join('test', names[0]+'.mpk')
    df = pd.read_msgpack(f)
    P.data=df
    #b = P.getBinders(data=df)
    #print b[:20]
    base.getScoreDistributions(method, path)
    return
Example #56
def getPredictions(label,genome,tag,q=0.96):
    """Get predictions from file system"""

    q=round(q,2)
    path = os.path.join(datapath, label)
    print path
    genomename = os.path.splitext(genome)[0]
    preds = OrderedDict()
    cutoffs = {}
    bcell = None
    for m in methods:
        rpath = os.path.join(path, '%s/%s' %(genomename,m))
        filename = os.path.join(rpath, tag+'.mpk')
        if not os.path.exists(filename):
            continue
        df = pd.read_msgpack(filename)
        pred = base.getPredictor(name=m, data=df)
        if m == 'bcell':
            bcell = pred
            continue
        cutoffs[m] = pred.allelecutoffs = analysis.getCutoffs(rpath, m, q)
        preds[m] = pred
    return preds, bcell, cutoffs
Example #57
def getAllBinders(path, method='tepitope', n=3, cutoff=0.95, promiscuous=True):
    """Get all promiscuous binders from a set of proteins in path"""

    print 'getting binders..'
    binders = []
    m=method
    if m=='bcell': return #not applicable
    l=9
    P = base.getPredictor(m)
    files = glob.glob(os.path.join(path, '*.mpk'))
    #get allele specific cutoffs
    P.allelecutoffs = getCutoffs(path, method, cutoff, overwrite=True)
    for f in files:
        df = pd.read_msgpack(f)
        if promiscuous== True:
            b = P.getPromiscuousBinders(data=df,n=n)
        else:
            b = P.getBinders(data=df)
        #print b[:5]
        binders.append(b)
    result = pd.concat(binders)
    result['start'] = result.pos
    result['end'] = result.pos+result.peptide.str.len()
    return result
Example #58
    def read(self, identifier, freq, **kwargs):
        """
            Read the entire timeseries record for the requested timeseries instance.

            :param identifier: Identifier of the timeseries.
            :type identifier: string
            :param freq: Timeseries data frequency.
            :type freq: string
            :param kwargs: Attributes to match against timeseries instances (e.g. source, measurand).
            :type kwargs: kwargs

            :returns: pandas.DataFrame -- Timeseries data.
        """
        url = self.__attach_kwargs_to_url(
                self.server + '/{0}/{1}.{2}'.format(identifier, freq, self.format),
                kwargs
            )

        if self.format == 'msgpack':
            return pd.read_msgpack(urlopen(url))
        elif self.format == 'json':
            return pd.read_json(urlopen(url))
        else:
            raise NotImplementedError('Unsupported format: {0}'.format(self.format))
Example #59
 def time_packers_read_pack(self):
     pd.read_msgpack(self.f)