Example No. 1
def getArrivalRates(infile1,
                    infile2,
                    infile3,
                    Origin,
                    slotInMinutes=5,
                    windowInMinutes=20,
                    numWindows=9,
                    Destination=None):
    '''
    ------------------------------------------------------------------------------------------------------
    returns the arrival rates using data from past days
    infile1, infile2, and infile3 are CSV files containing trip info on similar days
    For NYC data: Feb 07, Feb 14, and Feb 21, same hour intervals
    ------------------------------------------------------------------------------------------------------
    '''
    dDict1, head1 = readCSV(infile1)  # read data on previous days
    dDict2, head2 = readCSV(infile2)
    dDict3, head3 = readCSV(infile3)
    lamSlots1, lamMint1 = getLamPerRegionMLE(dDict1, Origin, slotInMinutes,
                                             windowInMinutes, numWindows,
                                             Destination)
    lamSlots2, lamMint2 = getLamPerRegionMLE(dDict2, Origin, slotInMinutes,
                                             windowInMinutes, numWindows,
                                             Destination)
    lamSlots3, lamMint3 = getLamPerRegionMLE(dDict3, Origin, slotInMinutes,
                                             windowInMinutes, numWindows,
                                             Destination)
    lamSlots = dict()
    lamMint = dict()
    for elem in lamSlots1:
        lamSlots[elem] = np.mean(
            [lamSlots1[elem], lamSlots2[elem], lamSlots3[elem]])
        lamMint[elem] = np.mean(
            [lamMint1[elem], lamMint2[elem], lamMint3[elem]])
    return lamSlots, lamMint, lamSlots1, lamSlots2, lamSlots3
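A minimal usage sketch, assuming the module defining getArrivalRates and readCSV is importable; the file names and the Origin region id below are hypothetical:

lamSlots, lamMint, lam1, lam2, lam3 = getArrivalRates('trips_feb07.csv',
                                                      'trips_feb14.csv',
                                                      'trips_feb21.csv',
                                                      Origin=2,
                                                      windowInMinutes=20,
                                                      numWindows=9)
# lamSlots / lamMint hold the rates averaged over the three days;
# lam1..lam3 keep each day's per-slot rates for inspection.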
Example No. 2
def prepare_data():

    datas = np.asarray(readCSV(args.train_feature_file))
    # print(np.array(datas).shape)
    # feat2idx = {}
    # idx2feat = {}
    # fid2idx = {}
    # idx2fid = {}
    # for idx, line in enumerate(datas):
    #     if idx == 0:
    #         feats = line.split(',')[1:]
    #         for feat_idx in range(len(feats)):
    #             feat2idx[feats[feat_idx]] = feat_idx
    #             idx2feat[feat_idx] = feats[feat_idx]
    #     else:
    #         fid = line.split(',', 1)[0]
    #         fid2idx[fid] = idx - 1
    #         idx2fid[idx - 1] = fid
    trainX = datas[1:, 1:]
    logger.info('Selected train features shape = ( %d , %d )', trainX.shape[0],
                trainX.shape[1])
    datas = np.asarray(readCSV(args.train_ans_file))
    trainY = datas[1:, 1:]
    logger.info('Selected train ans shape = ( %d )', trainY.shape[0])
    datas = np.asarray(readCSV(args.test_feature_file))
    testX = datas[1:, 1:]
    testFid = datas[1:, 0]
    logger.info('Selected test features shape = ( %d , %d )', testX.shape[0],
                testX.shape[1])
    return trainX, trainY, testX, testFid  #, feat2idx, idx2feat, fid2idx, idx2fid
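The datas[1:, 1:] slicing above drops the CSV header row and the leading id column. A self-contained sketch of that convention, with a stand-in readCSV built on the csv module (the real helper may differ):

import csv

import numpy as np


def readCSV(path):  # stand-in: returns the file as a list of rows
    with open(path, newline='') as f:
        return list(csv.reader(f))


datas = np.asarray(readCSV('features.csv'))  # hypothetical file
X = datas[1:, 1:]   # row 0 is the header, column 0 is the record id
ids = datas[1:, 0]  # ids kept separately, as testFid is kept above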
Example No. 3
    def __init__(self,
                 config_dir,
                 train_close_pricesDF=None,
                 trainDF=None,
                 val_close_pricesDF=None,
                 valDF=None,
                 load_val=True,
                 nrows=None):
        """Data class. You have the option of loading all four dataframes to avoid reloading them."""
        self.train_close_pricesDF = None
        self.trainDF = None
        self.val_close_pricesDF = None
        self.valDF = None
        self.train_minutes = None
        self.val_minutes = None
        self.config_dir = config_dir
        self.load_val = load_val

        if (train_close_pricesDF is not None and trainDF is not None
                and val_close_pricesDF is not None and valDF is not None):
            self.train_close_pricesDF = train_close_pricesDF.copy()
            self.trainDF = trainDF.copy()
            self.val_close_pricesDF = val_close_pricesDF.copy()
            self.valDF = valDF.copy()
            if nrows is not None:
                print('presupplied data with nrows =', nrows,
                      '... subsetting the DFs.')
                self.train_close_pricesDF = self.train_close_pricesDF[:nrows]
                self.trainDF = self.trainDF[:nrows]
                self.val_close_pricesDF = self.val_close_pricesDF[:nrows]
                self.valDF = self.valDF[:nrows]

        else:
            print('loading train_close_pricesDF')
            self.train_close_pricesDF = readCSV(
                config_dir + 'Postprocessed_Data/train_close_prices.csv',
                nrows=nrows)
            print('loading trainDF')
            self.trainDF = readCSV(config_dir +
                                   'Postprocessed_Data/train_minutesDF.csv',
                                   nrows=nrows)
            print('load train complete')

            if load_val:
                print('loading val_close_pricesDF')
                self.val_close_pricesDF = readCSV(
                    config_dir + 'Postprocessed_Data/val_close_prices.csv',
                    nrows=nrows)
                print('loading valDF')
                self.valDF = readCSV(config_dir +
                                     'Postprocessed_Data/val_minutesDF.csv',
                                     nrows=nrows)
                print('load val complete')

        self.splitMinutes()
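Only __init__ is shown, so the class name is not visible; a hedged usage sketch (calling the class MarketData purely for illustration) might look like:

data = MarketData('configs/run_01/', nrows=50000)  # hypothetical class name and config_dir

# A later instance can reuse the already-loaded frames and skip the readCSV calls.
data2 = MarketData('configs/run_01/',
                   train_close_pricesDF=data.train_close_pricesDF,
                   trainDF=data.trainDF,
                   val_close_pricesDF=data.val_close_pricesDF,
                   valDF=data.valDF)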
Example No. 4
def main():
    attendFileName = sys.argv[1]
    statsFileName = sys.argv[2]
    attendData = utils.readCSV(attendFileName)
    statsData = utils.readCSV(statsFileName)

    people = {}
    for row in statsData:
        p = Person(row)
        people[p.name] = p

    utils.getGitHubStats('asdf')
Example No. 5
    def globalBouncedEmails(self, csv):
        # csv is a file path here; it is re-bound to the parsed rows below
        csv = utils.readCSV(csv)
        if csv[0][4] == "Global Bounce" and csv[0][1] == "Portal Bounce":
            emails = [row[0] for row in csv if row[1] == "TRUE" or row[4] == "TRUE"]
            return emails
        else:
            print("4th column of the CSV file is not Global Bounce")
Example No. 6
def main():

    # List of sentence objects
    sentences = readCSV(STSS_131_DATA)
    STSS_values = []
    scores = []
    final_data = []
    init = 66
    for i, j in enumerate(sentences):
        # for i in range(0,3):
        result = measureSimilarity(sentences[i].first_sentence,
                                   sentences[i].second_sentence)
        # scores.append((init, result))
        print(f" CASE {init}")
        # if init == 127:
        #     break
        if isinstance(result, str):
            pass
        else:
            scores.append(result)
            final_data.append((init, result))
            STSS_values.append(j.human_SS)
        init += 1

    # print(final_data)
    p = stats.pearsonr(scores, STSS_values)
    print(f"Pearson's Correlation Coefficient against Human Judgement: {p}")
    with open('new_method_no_punc_stopword.csv', 'w') as f:
        writer = csv.writer(f)
        for row in final_data:
            writer.writerow(row)
Example No. 7
def main():
    args = processCommandline()
    occupancyData = utils.readCSV(args['input'])
    chargesData = json.loads(open(args['charges']).read())

    print(chargesData)

    month = args['month']
    year = args['year']
    chargeDay = args['chargeday']
    dueDay = args['dueday']
    dueMonth = args['duemonth']

    if (dueMonth == '-'): dueMonth = month

    transformedRecords = []
    for ocd in occupancyData:
        if (ocd['Residing'] == ''): continue  # refugee flat, ignore
        trec = transformToApnaComplexFormat(ocd, chargesData, month, year,
                                            chargeDay, dueDay, dueMonth)
        print(ocd, trec)

        for rec in trec:
            transformedRecords.append(rec)

    writeOutput(transformedRecords, args['output'])
Example No. 8
def getQuantiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quanti/' + month):
        df = readCSV(os.path.join('../../data/quanti/' + month, f), dtype=str)
        all_df.append(df)

    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        if col == "contentswatched":
            # print(getContentsUnique(all_df, col))
            contents = getContentsUnique(all_df, col)
            new_df = pd.merge(new_df,
                              contents,
                              left_index=True,
                              right_on='gigyaid')
            new_df.drop("contentswatched_x", axis=1, inplace=True)
            new_df.rename({"contentswatched_y": "contentswatched"},
                          axis=1,
                          inplace=True)
        else:
            all_df[col] = all_df[col].astype(float)
            new_df[col] = getSum(all_df, col)[col].values
    toCSV(new_df, "../../data/aggregated/quanti" + month + ".csv")
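The core pattern above is: concatenate one DataFrame per file, then collapse to one row per gigyaid by summing each numeric column (contentswatched aside). A self-contained sketch of that step, using groupby as a stand-in for getSum:

import pandas as pd

frames = [
    pd.DataFrame({'gigyaid': ['a', 'b'], 'minuteswatched': ['10', '5']}),
    pd.DataFrame({'gigyaid': ['a', 'c'], 'minuteswatched': ['3', '7']}),
]
all_df = pd.concat(frames)
all_df['minuteswatched'] = all_df['minuteswatched'].astype(float)
new_df = all_df.groupby('gigyaid').sum()  # one row per user
print(new_df)  # a -> 13.0, b -> 5.0, c -> 7.0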
Example No. 9
def main():
    random.seed(23333)
    # imgs, other_infos = utils.deserialize("../data", fontList)
    # imgs_shuffled = []
    # for i in imgs:
    #     indices = [j for j in range(i.shape[0])]
    #     random.shuffle(indices)
    #     imgs_shuffled.append(i[indices])
    # train = np.array([utils.hog(img) for i in imgs_shuffled for img in i[:len(i) * 4 // 5]])
    # train = np.reshape(train, [train.shape[0], -1])
    # train_label = np.array([i for i, _ in enumerate(imgs) for j in _[:len(_) * 4 // 5]])
    # test = np.array([utils.hog(img) for i in imgs_shuffled for img in i[len(i) * 4 // 5:]])
    # test = np.reshape(test, [test.shape[0], -1])
    # test_label = np.array([i for i, _ in enumerate(imgs) for j in _[len(_) * 4 // 5:]])

    imgs, other_infos = utils.readCSV(
        "../fonts",
        True,
        lambda x: (int(x['m_label']) < 128 and
                   int(x['m_label']) not in [83, 84, 85, 115, 116, 117]),
        fontList=fontList)
    train = np.array([utils.daisy(img) for i in imgs for img in i])
    train = np.reshape(train, [train.shape[0], -1])
    train_label = np.array([i for i, _ in enumerate(imgs) for j in _])

    imgs, other_infos = utils.readCSV(
        "../fonts",
        True,
        lambda x: int(x['m_label']) in [83, 84, 85, 115, 116, 117],
        fontList=fontList)
    test = np.array([utils.daisy(img) for i in imgs for img in i])
    test = np.reshape(test, [test.shape[0], -1])
    test_label = np.array([i for i, _ in enumerate(imgs) for j in _])

    for k in [1, 4, 8, 16, 24, 32]:
        print('k =', k, end=' ')
        tp = 0
        for idx, t in enumerate(test):
            lst, dist = utils.nearest_neighbour(t, train)
            cnt = [0 for j in range(len(fontList))]
            for i in lst[:k]:
                cnt[train_label[i]] += 1
            if max(cnt) == cnt[test_label[idx]]:
                tp += 1
        print('acc =', tp / test.shape[0])
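A self-contained sketch of the same k-NN voting loop with plain NumPy in place of utils.nearest_neighbour (Euclidean distances on random toy data); as above, a tie that includes the true class counts as correct:

import numpy as np

rng = np.random.default_rng(0)
train = rng.normal(size=(100, 16))
train_label = rng.integers(0, 4, size=100)
test = rng.normal(size=(20, 16))
test_label = rng.integers(0, 4, size=20)

k = 8
tp = 0
for idx, t in enumerate(test):
    dist = np.linalg.norm(train - t, axis=1)  # distance to every training sample
    lst = np.argsort(dist)                    # training indices, nearest first
    cnt = np.bincount(train_label[lst[:k]], minlength=4)
    if cnt.max() == cnt[test_label[idx]]:     # majority vote, ties favour the true class
        tp += 1
print('acc =', tp / test.shape[0])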
Example No. 10
def populateDB(filename,
               csv_delimiter,
               header,
               language='EN',
               dbname='TwitterDB',
               mode=0,
               serialized=False):
    start = time.time()
    h, lines = utils.readCSV(filename, csv_delimiter, header)
    populateDatabase(lines, language, dbname, mode, serialized)
    end = time.time()
    print "time_populate.append(", (end - start), ")"
Example No. 11
def main():
    args = processCommandline()
    contactsData = utils.readCSV(args['input'])

    db, cursor = dbutils.dbConnect()
    dbutils.createTable(
        db, cursor, "Owner",
        ("building text", "flat_number text", "primary_first_name text",
         "primary_last_name text", "primary_email text", "primary_mobile text",
         "secondary_email text", "secondary_mobile text", "flat_type text"),
        True)

    importData(db, cursor, contactsData, "Owner")
Example No. 12
def getOrderedServiceSingle(infile1,
                            region,
                            slotInMinutes=5,
                            windowInMinutes=20,
                            numWindows=9):
    '''
    ------------------------------------------------------------------------------------------------------
    gets the service distribution from past data
    ------------------------------------------------------------------------------------------------------
    '''
    dDictSer, headSer = readCSV(infile1)
    dictofLists = getOrderedService(dDictSer, region, slotInMinutes,
                                    windowInMinutes, numWindows)
    return dictofLists
Example No. 13
def main():
  csvName = 'data/original.csv'
  csv_lines = readCSV(csvName)
  spec_num = len(csv_lines) - 1
  shuffle_seed_name = 'data/shuffle_seed.npy'
  shuffle_seed = np.random.permutation(spec_num)
  with open(shuffle_seed_name, 'wb') as frs:
    np.save(frs, shuffle_seed)

  shuffle_csv(csv_lines, 'data/shuffle.csv', shuffle_seed)

  spec_length = 1473
  spec_name = 'data/original.bin'
  spec_shuffle_name = 'data/shuffle.bin'
  shuffle_bin(spec_name, spec_shuffle_name, shuffle_seed, spec_length=spec_length)
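A minimal sketch of the idea behind the shuffle: draw one permutation, persist it, and apply it to every parallel structure so CSV rows and binary records stay aligned (toy data, not the project's files):

import numpy as np

records = ['r0', 'r1', 'r2', 'r3', 'r4']  # stands in for the CSV data rows
spectra = np.arange(5 * 3).reshape(5, 3)  # stands in for the binary spectra

perm = np.random.permutation(len(records))
np.save('shuffle_seed.npy', perm)         # persisted for later reuse, as above

shuffled_records = [records[i] for i in perm]
shuffled_spectra = spectra[perm]          # same order as shuffled_records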
Example No. 14
def getQualiMonth(month):
    all_df = []
    for f in os.listdir('../../data/quali/' + month):
        df = readCSV(os.path.join('../../data/quali/' + month, f),
                     converters=converters)
        all_df.append(df)

    all_df = pd.concat(all_df)
    all_df.set_index("gigyaid", inplace=True)
    cols = all_df.columns
    new_df = pd.DataFrame(index=all_df.index.unique(), columns=cols)
    for col in cols:
        all_df[col] = all_df[col].apply(lambda x: [i.upper() for i in x])
        new_df[col] = getUnique(all_df, col)[col].values
    new_df.index.name = "gigyaid"

    toCSV(new_df, "../../data/aggregated/quali" + month + ".csv")
Example No. 15
def getArrivalRatesSingle(infile1,
                          Origin,
                          slotInMinutes=5,
                          windowInMinutes=20,
                          numWindows=9,
                          Destination=None):
    '''
    ------------------------------------------------------------------------------------------------------
    gets the arrival rates using data from a single past day's observations
    NYC data: use Feb 07, 2018 data to predict Feb 14, 2018 rates
    ------------------------------------------------------------------------------------------------------
    '''
    dDictArate, headArate = readCSV(infile1)
    lamSlots, lamMint = getLamPerRegionMLE(dDictArate, Origin, slotInMinutes,
                                           windowInMinutes, numWindows,
                                           Destination)
    return lamSlots, lamMint
Example No. 16
def split_train_val():
    bin_name = 'data/train_tmp.bin'
    csv_name = 'data/train_tmp.csv'

    csv_train = 'data/train.csv'
    csv_val = 'data/val.csv'
    bin_train = 'data/train.bin'
    bin_val = 'data/val.bin'

    length = (10 + 1473) * 4
    csv_lines = readCSV(csv_name)
    csv_header = csv_lines[0]
    csv_lines = csv_lines[1:]
    train_length = int(len(csv_lines) * 0.9)

    with open(csv_train, 'wb') as fcsvtrain, open(bin_train, 'wb') as fbintrain,\
            open(csv_val, 'wb') as fcsvval, open(bin_val, 'wb') as fbinval,\
            open(bin_name, 'rb') as fbin:
        trainwriter = csv.writer(fcsvtrain)
        valwriter = csv.writer(fcsvval)
        trainwriter.writerow(csv_header)
        valwriter.writerow(csv_header)
        buf_train = buf_val = ''
        for i, line in enumerate(tqdm(csv_lines)):
            buf = fbin.read(length)
            if i < train_length:
                trainwriter.writerow(line)
                buf_train += buf
                if i % 100 == 0 and i != 0:
                    fbintrain.write(buf_train)
                    buf_train = ''
            else:
                valwriter.writerow(line)
                buf_val += buf
                if i % 100 == 0 and i != 0:
                    fbinval.write(buf_val)
                    buf_val = ''
        else:
            fbintrain.write(buf_train)
            fbinval.write(buf_val)
    bin_names = [bin_name, csv_name]
    bin_names.insert(0, 'rm')
    os.system(' '.join(bin_names))
Example No. 17
def split_train_test():
    bin_name = 'data/shuffle.bin'
    csv_name = 'data/shuffle_new.csv'
    test_no = ['3', '9', '19', '24']

    csv_train = 'data/train_tmp.csv'
    csv_test = 'data/test.csv'
    bin_train = 'data/train_tmp.bin'
    bin_test = 'data/test.bin'

    length = (10 + 1473) * 4
    csv_lines = readCSV(csv_name)
    csv_header = csv_lines[0]
    csv_lines = csv_lines[1:]

    with open(csv_train, 'wb') as fcsvtrain, open(bin_train, 'wb') as fbintrain,\
            open(csv_test, 'wb') as fcsvtest, open(bin_test, 'wb') as fbintest,\
            open(bin_name, 'rb') as fbin:
        trainwriter = csv.writer(fcsvtrain)
        testwriter = csv.writer(fcsvtest)
        trainwriter.writerow(csv_header)
        testwriter.writerow(csv_header)
        buf_train = buf_test = ''
        for i, line in enumerate(tqdm(csv_lines)):
            buf = fbin.read(length)
            if line[0] in test_no:
                testwriter.writerow(line)
                buf_test += buf
                if i % 100 == 0 and i != 0:
                    fbintest.write(buf_test)
                    buf_test = ''
            else:
                trainwriter.writerow(line)
                buf_train += buf
                if i % 100 == 0 and i != 0:
                    fbintrain.write(buf_train)
                    buf_train = ''
        else:
            fbintrain.write(buf_train)
            fbintest.write(buf_test)
Example No. 18
    return similarity


# S1 = "The president was assinated in his car"
# S2 = "The president was assinated and driver could do nothing"
# soc_sts(S1, S2)

# S1 = "Would you like to go out to drink with me tonight?"
# S2 = "I really don't know what to eat tonight so I might go out somewhere."

# soc_sts(S1, S2)

from scipy import stats

STSS_131_DATA = "data/STSS-131.csv"
sentences = readCSV(STSS_131_DATA)

sim_values, STSS_values = [], []
for s in sentences:
    # soc_sts(s.first_sentence, s.second_sentence)
    sim_values.append(soc_sts(s.first_sentence, s.second_sentence) * 4)
    STSS_values.append(s.human_SS)

print("******************************************************")
p = stats.pearsonr(sim_values, STSS_values)
print("soc_sts correlation with STSS: ", p)
"""
S1 = "Would you like to go out to drink with me tonight?"
S2 = "I really don't know what to eat tonight so I might go out somewhere."
soc_sts(S1, S2)
"""
Example No. 19
def createPnlVisPlot(config_dir, desired_vis_plot_fn, ts_window_size=5000):
    """
    Creates a pnl visualization plot and saves it in the Plots/ directory.

    ts_window_size determines how many timesteps to consider to determine the <worst> and <best> time windows.
    """
    # [x] find and load the trade_log
    #     filenames are pnl_vis_<model_name>_<sess_type>_<episode>_<total, best, worst>.png
    print('creating pnl vis plot:', desired_vis_plot_fn)

    #we need to first remove the model_str, then we can split by '_'
    before_model_str = desired_vis_plot_fn.split('[')[0]
    after_model_str = desired_vis_plot_fn.split(']')[-1]
    model_str = desired_vis_plot_fn.split(before_model_str)[1].split(after_model_str)[0]

    sess_type = after_model_str.split('_')[1]
    episode = after_model_str.split('_')[2]
    plot_type = after_model_str.split('_')[3].split('.')[0]
    sess_type = '' if sess_type == 'train' else sess_type #trade_log files are saved with the sesstype only if it != 'train'

    trade_log_fn = config_dir + 'Deep_Models/'+ model_str +'/Trade_Logs/'+sess_type+episode+'.csv'
    trade_logDF = readCSV(trade_log_fn)

    # [x] create plot
    full_logDF = trade_logDF.copy()
    full_logDF['Midpt'] = (full_logDF.C_B + full_logDF.C_A)/2
    full_logDF['DayPNL'] = (full_logDF.ActualAction!=0)*(full_logDF.C_B - full_logDF.Midpt) #it's always negative half the bid-ask spread
    full_logDF['OpenPNL'] = full_logDF.NewPosition.shift() * (full_logDF.Midpt - full_logDF.Midpt.shift())
    full_logDF.loc[0, 'OpenPNL'] = 0  # first row has no prior position; avoid chained assignment
    full_logDF['TotalPNL'] = full_logDF.DayPNL + full_logDF.OpenPNL

    #subset the log if plotting best or worst window
    if plot_type == 'best':
        rolling_sum = full_logDF.TotalPNL.rolling(window=ts_window_size).sum()
        idx_max = rolling_sum[::-1].idxmax() # the [::-1] selects the last occurrence of idxmax
        full_logDF = full_logDF[(idx_max-ts_window_size):idx_max]
    if plot_type == 'worst':
        rolling_sum = full_logDF.TotalPNL.rolling(window=ts_window_size).sum()
        idx_min = rolling_sum[::-1].idxmin() # the [::-1] selects the last occurrence of idxmin
        full_logDF = full_logDF[(idx_min-ts_window_size):idx_min]

    full_logDF['CumPNL'] = full_logDF.TotalPNL.cumsum()
    # [x] graph (from http://kitchingroup.cheme.cmu.edu/blog/2013/09/13/Plotting-two-datasets-with-very-different-scales/)
    buys = full_logDF.loc[full_logDF.ActualAction == 1][['Minute', 'C_A']]
    sells = full_logDF.loc[full_logDF.ActualAction == 2][['Minute', 'C_B']]

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax1.plot(full_logDF.Minute, full_logDF.Midpt, 'k-', linewidth=0.5)
    ax1.plot(buys.Minute, buys.C_A, 'g+')
    ax1.plot(sells.Minute, sells.C_B, 'r+')
    ax1.set_ylabel('Price')
    # [x] add plot title
    ax1.set_title(plot_type)

    ax2 = ax1.twinx()
    ax2.plot(full_logDF.Minute, full_logDF.CumPNL, 'b-')
    ax2.set_ylabel('PNL')

    # [x] Add pnl=0 line
    # ax2.hlines(y=0, xmin=full_logDF.Minute.iloc[0] , xmax=full_logDF.Minute.iloc[-1], colors='0.75', linestyles='dashed')
    ax2.axhline(y=0, color='0.75', linestyle='dashed') # using this instead of hlines frees us from supplying xmin and xmax

    # save plot
    plt.savefig(config_dir+'Plots/'+desired_vis_plot_fn, dpi=256)
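The best/worst selection above hinges on a rolling sum of TotalPNL; a self-contained sketch of picking the window whose summed PNL is largest (toy series, window of 3):

import pandas as pd

pnl = pd.Series([1.0, -2.0, 4.0, 3.0, -1.0, 5.0, -3.0, 2.0])
ts_window_size = 3
rolling_sum = pnl.rolling(window=ts_window_size).sum()
idx_max = rolling_sum[::-1].idxmax()  # last occurrence of the maximum, as in the function
best_window = pnl[(idx_max - ts_window_size):idx_max]  # same end-exclusive slice as above
print('best window ends at index', idx_max)  # 5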
Example No. 20
def processData(comb_row, config_dir, day_chg_incs, minute_incs):
    """
    1. Locates the Postprocessed_Data/*. If no Postprocessed_Data/* exists, then...
    2. Locates the Data/* and generates Postprocessed_Data/*. If no Data/* exists, then...
    3. Generate Data/* then generate Postprocessed_Data/*.
    """
    # [x] Locates the PostprocessedData/*
    # [x] check that it is in the correct format.
    sec_guide_fn = config_dir + 'Data/sec_guide.csv'
    if os.path.exists(sec_guide_fn):
        sec_guideDF = pd.read_csv(sec_guide_fn)
        num_secs = len(sec_guideDF)
        postprocessed_data_dir = config_dir + 'Postprocessed_Data/'
        if (os.path.exists(postprocessed_data_dir + 'train_close_prices.csv')
                and os.path.exists(postprocessed_data_dir +
                                   'train_minutesDF.csv') and
                os.path.exists(postprocessed_data_dir + 'val_close_prices.csv')
                and
                os.path.exists(postprocessed_data_dir + 'val_minutesDF.csv')):
            print('All Postprocessed_Data files exist. Checking columns')

            cols_set_actual = set(
                pd.read_csv(postprocessed_data_dir + 'train_minutesDF.csv',
                            nrows=0).columns.tolist())
            cols_set_expected = set(
                getExpectedPostprocessedCols(num_secs, day_chg_incs,
                                             minute_incs))

            assert cols_set_actual == cols_set_expected, (
                'Problem with the columns.'
                '\nCols in Postprocessed actual but not in expected: ' +
                str(cols_set_actual.difference(cols_set_expected)) +
                '\nCols in expected but not in Postprocessed actual: ' +
                str(cols_set_expected.difference(cols_set_actual)))

            print('Columns passed the check. Loading postprocessed data.')
            print('loading train_close_pricesDF')
            train_close_pricesDF = readCSV(postprocessed_data_dir +
                                           'train_close_prices.csv')
            print('loading train_minutesDF')
            train_minutesDF = readCSV(postprocessed_data_dir +
                                      'train_minutesDF.csv')
            print('loading val_close_pricesDF')
            val_close_pricesDF = readCSV(postprocessed_data_dir +
                                         'val_close_prices.csv')
            print('loading val_minutesDF')
            val_minutesDF = readCSV(postprocessed_data_dir +
                                    'val_minutesDF.csv')
            print('loading complete.')
            return train_close_pricesDF, train_minutesDF, val_close_pricesDF, val_minutesDF
        else:
            print(
                'There are Postprocessed_Data/* missing. Here are the files there currently:'
            )
            print(os.listdir(postprocessed_data_dir))
    else:
        print(sec_guide_fn, 'not found. Continuing...')

    print('\n')
    # [x] Locates or creates the Data/*
    data_dir = config_dir + 'Data/'
    daily_fn = data_dir + 'daily_summary.csv'
    all_minutes_fn = data_dir + 'all_minutes.csv'
    sec_guide_fn = data_dir + 'sec_guide.csv'

    all_minutesDF, dailyDF, sec_guideDF = 'not set yet', 'not set yet', 'not set yet'

    if not (os.path.exists(daily_fn) and os.path.exists(all_minutes_fn)
            and os.path.exists(sec_guide_fn)):
        print('Data/ has not been loaded and pre-processed yet. Doing so now.')
        all_minutesDF, dailyDF, sec_guideDF = loadAndProcessData(
            comb_row, config_dir, day_chg_incs, minute_incs)
    else:
        print('Data found! Loading data...')
        # [x] check the order of other_secs
        sec_guideDF = readCSV(sec_guide_fn)
        assert (comb_row.Sec1 == sec_guideDF.iloc[0].Sec)
        other_secs_comb_row = comb_row[[
            col for col in comb_row.index
            if (col[:3] == 'Sec' and col != 'Sec1')
        ]].values
        other_secs_comb_row = [
            i for i in other_secs_comb_row if type(i) == str
        ]
        other_secs_sec_guide = sec_guideDF.iloc[1:].Sec.values
        assert (np.all(other_secs_comb_row == other_secs_sec_guide))
        # [x] check the dates
        dailyDF = readCSV(daily_fn)
        first_date = dailyDF.Date.iloc[0]
        last_date = dailyDF.Date.iloc[-1]
        assert abs((pd.to_datetime(first_date) -
                    pd.to_datetime(comb_row.TrainStartDate)
                    ).days) < 20, str(first_date) + '   ' + str(
                        comb_row.TrainStartDate)  # may want to relax these
        assert abs(
            (pd.to_datetime(last_date) - pd.to_datetime(comb_row.ValEndDate)
             ).days) < 20, str(last_date) + '   ' + str(comb_row.ValEndDate)
        all_minutesDF = readCSV(all_minutes_fn)
        print('Data/ load complete.')
    return postprocessData(all_minutesDF, dailyDF, sec_guideDF, config_dir,
                           day_chg_incs, minute_incs)
Example No. 21
def loadAndProcessData(comb_row,
                       config_dir,
                       day_chg_incs,
                       minute_incs,
                       minute_dir=tdm_dir + 'Minute_Files/'):
    """
    1. Loads the minute files from the external hard drive
    2. Creates, saves, and returns all_minutesDF, sec_guideDF, and dailyDF
    """
    other_secs = comb_row[[
        col for col in comb_row.index if (col[:3] == 'Sec' and col != 'Sec1')
    ]].values
    other_secs = [i for i in other_secs if type(i) == str]
    print('No pre-loaded data found. Loading data for ' + comb_row.Sec1 +
          ' and ' + ','.join(other_secs))
    sec1_minuteDF = readCSV(minute_dir + comb_row.Sec1 + '.csv')
    sec1_minuteDF = sec1_minuteDF.loc[
        (sec1_minuteDF.Date >= comb_row.TrainStartDate)
        & (sec1_minuteDF.Date <= comb_row.ValEndDate)].reset_index(drop=True)

    print('Loading minuteDF for ' + comb_row.Sec2)
    other_secs_minuteDF = readCSV(minute_dir + comb_row.Sec2 + '.csv')[[
        'Product', 'Date', 'Minute', 'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A',
        'C_B', 'C_A', 'Count', 'B_TickImb', 'A_TickImb', 'M_TickImb'
    ]]
    for sec in other_secs[1:]:
        print('Loading minuteDF for ' + sec)
        other_secs_minuteDF = other_secs_minuteDF.append(
            readCSV(minute_dir + sec + '.csv')[[
                'Product', 'Date', 'Minute', 'O_B', 'O_A', 'H_B', 'H_A', 'L_B',
                'L_A', 'C_B', 'C_A', 'Count', 'B_TickImb', 'A_TickImb',
                'M_TickImb'
            ]],
            ignore_index=True)
    other_secs_minuteDF = other_secs_minuteDF.loc[
        (other_secs_minuteDF.Date >= comb_row.TrainStartDate)
        & (other_secs_minuteDF.Date <= comb_row.ValEndDate)].reset_index(
            drop=True)
    print('other_secs_minuteDF has ' + str(len(other_secs_minuteDF)) +
          ' rows.')
    print('sec1_minuteDF has ' + str(len(sec1_minuteDF)) + ' rows.')
    print("readCSVs complete. Subsetting dates...")

    # [x] subset for dates
    dates_in_common = set(sec1_minuteDF.Date.unique())
    for sec in other_secs:
        print(sec)
        other_sec_dates = set(other_secs_minuteDF.loc[
            other_secs_minuteDF.Product == sec].Date.unique())
        print('removing', [
            str(d)
            for d in sorted(list(dates_in_common.difference(other_sec_dates)))
        ])
        dates_in_common = dates_in_common.intersection(other_sec_dates)
        print(len(dates_in_common), 'dates_in_common')
    sec1_dates_to_remove = set(
        sec1_minuteDF.Date.unique()).difference(dates_in_common)
    print(str(len(dates_in_common)) + ' dates_in_common')

    if len(sec1_dates_to_remove) > 0:
        print('- removing ' + str(len(sec1_dates_to_remove)) + ' dates from ' +
              comb_row.Sec1)
        sec1_minuteDF = sec1_minuteDF.loc[sec1_minuteDF.Date.isin(
            dates_in_common)].reset_index(drop=True)
    for sec in other_secs:
        sec_dates_to_remove = set(
            other_secs_minuteDF.loc[other_secs_minuteDF.Product == sec].Date.
            unique()).difference(dates_in_common)
        if len(sec_dates_to_remove) > 0:
            print('- removing ' + str(len(sec_dates_to_remove)) +
                  ' dates from ' + sec)
    other_secs_minuteDF = other_secs_minuteDF.loc[
        other_secs_minuteDF.Date.isin(dates_in_common)].reset_index(drop=True)
    print('other_secs_minuteDF has ' + str(len(other_secs_minuteDF)) +
          ' rows.')
    print('sec1_minuteDF has ' + str(len(sec1_minuteDF)) + ' rows.')
    print("Date subset complete. Determining each day's Open/Closes...")

    # [x] determine each day's open and close
    dailyDF = pd.DataFrame(columns=['Date', 'Open', 'Close'])
    dailyDF['Date'] = sec1_minuteDF.Date.unique()
    for i in tqdm(range(len(dailyDF))):
        date = dailyDF.loc[i].Date
        lastOpen = sec1_minuteDF.loc[sec1_minuteDF.Date == date].Minute.min()
        firstClose = sec1_minuteDF.loc[sec1_minuteDF.Date == date].Minute.max()
        other_sec_date_subDF = other_secs_minuteDF.loc[other_secs_minuteDF.Date
                                                       == date]
        for sec in other_secs:
            lastOpen = max(
                lastOpen, other_sec_date_subDF.loc[other_sec_date_subDF.Product
                                                   == sec].Minute.min())
            firstClose = min(
                firstClose, other_sec_date_subDF.loc[
                    other_sec_date_subDF.Product == sec].Minute.max())
        dailyDF.loc[i, 'Open'] = lastOpen
        dailyDF.loc[i, 'Close'] = firstClose
    dailyDF.Open = dailyDF.Open.dt.strftime(date_format='%H:%M')
    dailyDF.Close = dailyDF.Close.dt.strftime(date_format='%H:%M')
    dailyDF.to_csv(config_dir + 'Data/daily_summary.csv', index=False)
    print(
        "Each day's Open/Closes determination complete. Creating all_minutesDF..."
    )

    # [x] create all_minutesDF
    all_minutesDF = pd.DataFrame(columns=['Date', 'Minute'])
    # enumerate minutes
    for i in range(len(dailyDF)):
        open_dt = pd.to_datetime(
            dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
            dailyDF.loc[i].Open,
            format='%Y-%m-%d %H:%M')
        close_dt = pd.to_datetime(
            dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
            dailyDF.loc[i].Close,
            format='%Y-%m-%d %H:%M')
        minute_range = pd.date_range(start=open_dt, end=close_dt, freq='T')
        day_minutesDF = pd.DataFrame({
            'Date': minute_range.date,
            'Minute': minute_range.values
        })
        all_minutesDF = all_minutesDF.append(day_minutesDF, ignore_index=True)

    #populate minute data
    col_stems = [
        'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A', 'C_B', 'C_A', 'Count',
        'B_TickImb', 'A_TickImb', 'M_TickImb'
    ]
    first_minute_populate_stems = [
        'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A', 'C_B', 'C_A'
    ]
    for sec_num in range(1, len(other_secs) + 2):
        sec_cols = [col_stem + str(sec_num) for col_stem in col_stems]
        for sec_col in sec_cols:
            all_minutesDF[sec_col] = np.nan
    all_minutesDF[[c + '1' for c in col_stems
                   ]] = pd.merge(all_minutesDF[['Minute']],
                                 sec1_minuteDF[['Minute'] + col_stems],
                                 on='Minute',
                                 how='left')[col_stems]
    print('Merging into all_minutesDF...')
    for sec_num in range(2, len(other_secs) + 2):
        other_sec = other_secs[sec_num - 2]
        all_minutesDF[[c + str(sec_num) for c in col_stems]] = pd.merge(
            all_minutesDF[['Minute']],
            other_secs_minuteDF[['Minute'] + col_stems].loc[
                other_secs_minuteDF.Product == other_sec],
            on='Minute',
            how='left')[col_stems]
    print('Getting the first datapoint of each day...')

    #get first datapoint of each day
    for i in tqdm(range(len(dailyDF))):  # iterate by row index so i matches the date below
        date = dailyDF.loc[i].Date
        if date in dates_in_common:
            open_dt = pd.to_datetime(
                dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
                dailyDF.loc[i].Open,
                format='%Y-%m-%d %H:%M')
            sec1_last_row = sec1_minuteDF.loc[(sec1_minuteDF.Date == date) & (
                sec1_minuteDF.Minute <= open_dt)].iloc[-1]

            if sec1_last_row.Minute < open_dt:
                if (open_dt - sec1_last_row.Minute).seconds / 60 > 20:
                    raise ValueError(
                        'Too much time has elapsed. ' + comb_row.Sec1 +
                        ' open quote is stale at ' +
                        open_dt.strftime(format='%Y-%m-%d %H:%M') + ' by ' +
                        str((open_dt - sec1_last_row.Minute).seconds / 60) +
                        ' minutes.')
                else:
                    all_minutesDF.loc[all_minutesDF.Minute == open_dt,
                                      [c + '1' for c in col_stems]] = 0
                    all_minutesDF.loc[
                        all_minutesDF.Minute == open_dt,
                        [c + '1' for c in first_minute_populate_stems
                         ]] = sec1_last_row[first_minute_populate_stems]

            other_secs_subDF = other_secs_minuteDF.loc[
                (other_secs_minuteDF.Date == date)
                & (other_secs_minuteDF.Minute <= open_dt)]
            for sec_num in range(2, len(other_secs) + 2):
                other_sec = other_secs[sec_num - 2]
                other_sec_last_row = other_secs_subDF.loc[
                    other_secs_subDF.Product == other_sec].iloc[-1]
                if other_sec_last_row.Minute < open_dt:
                    if (open_dt - other_sec_last_row.Minute).seconds / 60 > 20:
                        raise ValueError(
                            "Too much time has elapsed. " + other_sec +
                            " open quote is stale at " +
                            open_dt.strftime(format='%Y-%m-%d %H:%M') + ' by ' +
                            str((open_dt - other_sec_last_row.Minute).seconds /
                                60) + ' minutes.')
                    else:
                        all_minutesDF.loc[
                            all_minutesDF.Minute == open_dt,
                            [c + str(sec_num) for c in col_stems]] = 0
                        all_minutesDF.loc[all_minutesDF.Minute == open_dt, [
                            c + str(sec_num)
                            for c in first_minute_populate_stems
                        ]] = other_sec_last_row[first_minute_populate_stems]

    print('Saving all_minutesDF...')
    all_minutesDF.to_csv(config_dir + 'Data/all_minutes.csv', index=False)
    print('Save complete.')
    sec_guideDF = pd.DataFrame({'Sec': [comb_row.Sec1] + list(other_secs)})
    sec_guideDF.to_csv(config_dir + 'Data/sec_guide.csv', index=False)
    return all_minutesDF, dailyDF, sec_guideDF
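The date alignment in the middle of the function reduces to intersecting the trading dates of every product; a self-contained sketch of that step with toy date sets:

dates_by_product = {
    'Sec1': {'2020-01-02', '2020-01-03', '2020-01-06'},
    'SecA': {'2020-01-02', '2020-01-06'},
    'SecB': {'2020-01-02', '2020-01-03', '2020-01-06', '2020-01-07'},
}
dates_in_common = dates_by_product['Sec1']
for sec in ('SecA', 'SecB'):
    removed = sorted(dates_in_common.difference(dates_by_product[sec]))
    print('removing', removed, 'for', sec)
    dates_in_common = dates_in_common.intersection(dates_by_product[sec])
print(len(dates_in_common), 'dates_in_common')  # 2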
Example No. 22
        numIdle['avg'].append(
            np.mean([
                numIdle[1][key], numIdle[2][key], numIdle[3][key],
                numIdle[4][key]
            ]))
        numBlocked['avg'].append(
            np.mean([
                numBlocked[1][key], numBlocked[2][key], numBlocked[3][key],
                numBlocked[4][key]
            ]))

    return efficiency, percentBlocked, numIdle, numBlocked, totaltime


if __name__ == '__main__':
    dDict, head = readCSV('data/ridesLyftMHTN14.csv')
    dDict = addTimeStamp(dDict, slotInMinutes=5)
    dictNum = computeNumberOfUsers(dDict)
    adjacency = {1: [2], 2: [1, 3, 4], 3: [2, 4], 4: [2, 3]}
    initSupply = {
        1: {
            'idle': 0,
            'active': 0,
            'arrived': 0,
            'departed': 0
        },
        2: {
            'idle': 0,
            'active': 0,
            'arrived': 0,
            'departed': 0
Example No. 23
def populateDB(filename, csv_delimiter, header, language='EN', dbname='TwitterDB', host='localhost', port=27017, mode=0, serialized=False):
    start = time.time() 
    h, lines = utils.readCSV(filename, csv_delimiter, header)
    populateDatabase(lines, language, dbname, host, port, mode, serialized)
    end = time.time() 
    print "time_populate.append(", (end - start), ")"
def populateDB(filename, csv_delimiter, header, language='EN', dbname='TwitterDB', mode=0):
    start = time.time() 
    h, lines = utils.readCSV(filename, csv_delimiter, header)
    populateDatabase(lines, dbname)
    end = time.time() 
    print "time_populate.append(", (end - start), ")"
Example No. 25
def main():
    args = processCommandline()
    contactsData = utils.readCSV(args['input'])
    transformedContactsData = transformData(contactsData)
    writeOutput(transformedContactsData, args['output'])
Example No. 26
                            words = []
                            for w in lemmas.wordList:
                                word = dict()
                                word['word'] = w.word
                                word['tf'] = w.tf
                                word['count'] = w.count
                                word['pos'] = w.wtype
                                words.append(word)

                            document['cleanText'] = cleanText#.encode('latin-1').encode('string_escape').replace('\r', '').replace('\n', '')
                            document['lemmaText'] = lemmaText
                            document['words'] = words
                except Exception as e:
                    print(e, 'sunt in lemmaText')
        except Exception as e:
            print(e, 'aici try 1', elem)
    else:
        print('aici in else', elem)
    return document

if __name__ == "__main__":
    dbname = 'EGCDB'
    filename = 'RNTI_articles_export.csv'
    header = False
    csv_delimiter = '\t'
    corpus = utils.readCSV(filename)
    # createDB = CreateDB_EGC(dbname=dbname)
    # createDB.insert_data(corpus=corpus)
    print(len(corpus))
    insert_data(dbname, corpus)
Example No. 27
    for i in range(len(folder_length)):
        dct[str(i + 1)] = [str(j) for j in range(1, folder_length[i] + 1)]
    label_dict = {}
    for i in range(1, len(folder_length) + 1):
        label_dict[i] = label_dict.get(i, {})
        for j in dct[str(i)]:
            label_dict[i][int(j)] = i
    return label_dict


if __name__ == '__main__':
    folder_length = [6, 10, 5, 6]
    # gen_path_dict(path)
    path_dict = get_path_dict('path_dict.pkl')
    label_dict = gen_label_dict()
    elements = [l[1:] for l in readCSV('data/elements.csv')[1:]]
    with open('data/original.bin', 'wb') as f, open('data/original.csv',
                                                    'wb') as fcsv:
        cw = csv.writer(fcsv)
        cw.writerow(['No', 'Class', 'SubNO', 'Category'] +
                    'Mn Si Ni Cr V Mo Ti Cu Fe'.split())
        for cls, subfolders in path_dict.items():
            for subno, filenames in subfolders.items():
                no = sum(folder_length[:cls - 1]) + subno
                label = label_dict[cls][subno]
                csv_list = [no, cls, subno, label] + elements[no - 1]
                for fn in filenames:
                    cw.writerow(csv_list)
                    spa_array = read_spa(fn)
                    f.write(struct.pack('<f', label))
                    np.array(elements[no - 1], np.float32).tofile(f)
def get_size():
    return len(readCSV('data/train.csv')[1:]),\
           len(readCSV('data/test.csv')[1:]),\
           len(readCSV('data/val.csv')[1:])
        'angel1k_o', 'angel3k_o', 'angel5k_o', 'angel10k_o', 'angel25k_o',
        'angel50k'
    ]
    models['Armadillo'] = [
        'Armadillo1k_o', 'Armadillo3k_o', 'Armadillo5k_o', 'Armadillo10k_o',
        'Armadillo25k_o', 'Armadillo35k'
    ]
    models['bunny'] = [
        'bunny1k_o', 'bunny3k_o', 'bunny5k_o', 'bunny10k_o', 'bunny25k_o',
        'bunny70k'
    ]
    numResolution = 6

    rotatePeriod = np.array([20, 40, 60, 80, 100, 200, 300, 400, 500, 600],
                            dtype=float)  # np.float was removed from NumPy
    numRotates = rotatePeriod.shape[0]
    qualityOffset = [0, 1]
    qualityOpt = ['PSNR', 'SSIM']

    for modelId in range(4):
        # modelId = 0
        coarseResId = 2
        renderMode = '25'
        rootDir = "/Users/sfhan/Dropbox/stereoRepoj/Results"
        datafile = join(
            rootDir, modelName[modelId],
            models[modelName[modelId]][coarseResId] +
            "_rot1_{}_2.csv".format(renderMode))
        data = readCSV(datafile)
        plot_rotatePeriod_quality()
Example No. 30
def STSS_tests():
    """Some tests for wordNetSimilarity using STSS dataset"""
    import matplotlib.pyplot as plt

    sentences = readCSV(STSS_131_DATA)
    sim_values, STSS_values = [], []
    n = 0
    print("Using path_similarity")
    for s in sentences:
        val = wordNetSimilarity(s.first_sentence, s.second_sentence)
        sim_values.append(val)
        STSS_values.append(s.human_SS)
    print("******************************************************")
    print(stats.pearsonr(sim_values, STSS_values))

    sim_values, STSS_values = [], []
    n = 0
    for s in sentences:
        sim_values.append(
            wordNetSimilarity(s.first_sentence,
                              s.second_sentence,
                              use_wup=True))
        STSS_values.append(s.human_SS)

    print("******************************************************")
    print("Using wup")
    p = stats.pearsonr(sim_values, STSS_values)
    print(p)

    sim_values, STSS_values = [], []
    n = 0
    for s in sentences:
        sim_values.append(
            wordNetSimilarity(s.first_sentence,
                              s.second_sentence,
                              perform_stemming=True))
        STSS_values.append(s.human_SS)

    print("******************************************************")
    print("Preprocessing: Stemming")
    p = stats.pearsonr(sim_values, STSS_values)
    print(p)

    sim_values, STSS_values = [], []
    for s in sentences:
        sim_values.append(
            wordNetSimilarity(s.first_sentence,
                              s.second_sentence,
                              perform_lemmatization=True))
        STSS_values.append(s.human_SS)

    print("******************************************************")
    print("Preprocessing: lemmatization")
    p = stats.pearsonr(sim_values, STSS_values)
    print(p)

    sim_values, STSS_values = [], []
    n = 0
    for s in sentences:
        sim_values.append(
            wordNetSimilarity(s.first_sentence,
                              s.second_sentence,
                              use_idf=True))
        STSS_values.append(s.human_SS)

    print("******************************************************")
    print("Preprocessing: use tf_idf")
    p = stats.pearsonr(sim_values, STSS_values)
    print(p)
Example No. 31
import utils
from operator import itemgetter

ACHIEVEMENTS = utils.readCSV("./static/achievements.csv")
TITLES = utils.readJSON("./static/titles.json")


def get_max_level(achievement_list):
    max_points = 0
    max_level = 0
    for achv in achievement_list:
        if int(achv['points']) > 0:
            max_points += int(achv['points'])

    tmp_max_points = max_points

    while tmp_max_points / 2 > 0:
        max_level += 1
        tmp_max_points = int(tmp_max_points / 2)

    return max_level, max_points


def get_levelup_requirements():
    max_level, max_points = get_max_level(ACHIEVEMENTS)
    level_info = list()
    while max_points / 2 > 0:
        level_info.append(max_points)
        max_points = int(max_points / 2)

    return list(reversed(level_info))
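A hedged worked example of the halving logic above, with a hypothetical total of 100 achievement points:

max_points = 100  # hypothetical total returned by get_max_level
level_info = []
while max_points / 2 > 0:
    level_info.append(max_points)
    max_points = int(max_points / 2)
print(list(reversed(level_info)))  # [1, 3, 6, 12, 25, 50, 100]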