Example #1
 def get_coords_info(self):
     ds = xr.open_dataset(self.infile)
     for c in ds.coords:
         filename = os.path.join(self.coords_info, c.strip() + '.tabular')
         # write each coordinate as its own tab-separated file, re-indexed from 0
         series = ds.coords[c].to_pandas()
         series.index = range(len(series))
         series.to_csv(filename, header=False, sep='\t')
Example #2
def mc_distribution_to_distribution(mc_distribution,
                                    bins=10**3 + 1,
                                    to_csv=False,
                                    csv_file_name=None):

    count, binEdges = np.histogram(mc_distribution, bins)
    bincenters = .5 * (binEdges[1:] + binEdges[:-1])

    probs = [i / sum(count) for i in count]
    relative_moves = [bincenter / mean(bincenters) for bincenter in bincenters]
    pct_moves = [relative_move - 1 for relative_move in relative_moves]

    distribution_info = {
        'State': np.array(range(len(count))),
        'Prob': probs,
        'Pct_Move': pct_moves,
        'Relative_Move': relative_moves
    }

    if to_csv is True:
        distribution_df = distribution_info_to_distribution(
            distribution_info).distribution_df
        distribution_df.to_csv(csv_file_name)

    return distribution_info_to_distribution(distribution_info)
Example #3
 def run(self):
     global funds_df
     while not self.ajax_queue.empty():
         fundcode = self.ajax_queue.get()
         url = '%s%s' % (BASE_URL, fundcode)
         response = self.session.get(url=url, headers=headers)
         fund_json = response.json()
         success_flag = fund_json['success']
         print(success_flag)
         print('=' * 30)
         fund_data = fund_json['data']
         fund_df = pd.DataFrame()
         if fund_data:
             fund_df = self.parse_data(fund_data)
             values = fund_df.iloc[0].values
             gLock.acquire()
             # pd.concat([funds_df, fund_df], axis=0, ignore_index=True)
             # Option 1: write the row with the csv module
             with open(FILE_PATH, 'a', encoding='utf-8', newline='') as f:
                 writer = csv.writer(f)
                 writer.writerow(values)
             # Option 2: append the row with pandas
             fund_df.to_csv(FILE_PATH, mode='a', header=False, index=False)
             gLock.release()
Example #4
def save_sh():
    # A-share stock list
    stock_code_list = ts.get_stock_basics().index.tolist()
    stock_list = []
    count = 0

    # traverse all stock code in list
    for code in stock_code_list:
        # get a single stock's data

        data_temp = ts.get_h_data(code, start = '2008-01-01', end = '2017-01-01')
        date_list = data_temp.index.tolist()

        print(data_temp)

        # handle the stock's daily data: pull each field out and insert it into the list
        for date in date_list:
            data_sinday = data_temp.loc[date]
            open_price = data_sinday['open']
            high_price = data_sinday['high']
            close_price = data_sinday['close']
            low_price   = data_sinday['low']
            price_change = (close_price - open_price)/open_price

            #add data to list
            stock_list.append({'code': code, 'date' : date, 'open' : open_price, 'high' : high_price, \
                           'low' : low_price, 'close' : close_price, 'change':price_change})

    data_all_sh = pd.DataFrame(stock_list)
    print(data_all_sh)


    #! save data to file

    data_all_sh.to_csv(path)
Example #5
def write_to_csv(pd):

    if os.path.exists('E:\python\物联网云端数据处理实验-5-2018119161-廖瑞金\output.csv'):
        try:
            pd.to_csv('E:\python\物联网云端数据处理实验-5-2018119161-廖瑞金\output.csv',
                      mode='a',
                      index=False,
                      encoding="gbk",
                      header=False)
            #print(pd)
            print('Successful writing!')
            time.sleep(3)
        except:
            print('error')
    else:
        try:
            pd.to_csv('E:\python\物联网云端数据处理实验-5-2018119161-廖瑞金\output.csv',
                      mode='a',
                      encoding="gbk",
                      index=False)
            #print(pd)
            print('Successful writing!')
            time.sleep(3)
        except:
            print('error!')
Example #6
def get_table_and_cols(TABLE, saveFile):
#--------------------------------------
    
    try:
        
        if TABLE.startswith( '$' ):

            df = C.export_to_pandas('SELECT * FROM {table_name!q}', {'table_name': TABLE})

            df.to_csv(TABLE + '.csv', na_rep=0)

            log_and_print(f'EXPORTED {df.shape[0]} rows from table {TABLE}')

        else:

            myFile = C.export_to_file(saveFile, TABLE, export_params={"with_column_names":True})

            stmt = C.last_statement()

            log_and_print("EXPORTED " + str(stmt.rowcount()) + " rows from table " + TABLE + " in " + str(stmt.execution_time) + " sec")


    except Exception as e:

        log_and_print("#######################################")

        log_and_print("ERROR: unable to READ table " + TABLE +  " Aborting with no action taken!")

        log_and_print("#######################################")

        log_and_print(e)

        sys.exit(12)  
Example #7
def create_result(task1_csv_path='results.csv'):
    # -------------------------
    # Argument
    # task1_csv_path: path of task 1 csv
    # ------------------------- 
    dtf = pd.read_csv(task1_csv_path)
    field_rank = {
        'SELLER' : 1,
        'ADDRESS' : 2,
        'TIMESTAMP' : 3,
        'TOTAL_COST' : 4
    }

    res = []
    for key, row in dtf.iterrows():
        name = row['img_id']
        name = name.split(".")[0]
        annot_path = os.path.join('result_txt', name+".txt")
        image_path = os.path.join('upload', name+".jpg")

        output_dict = get_submit_image(image_path, annot_path)
        field_value, field_name = get_output(output_dict)
        print(field_name)
        print(field_value)
        field = list(zip(field_value, field_name))
        field.sort(key = lambda x: field_rank[x[1]])
        res.append('|||'.join([str(x[0]) for x in field]))

    dtf['anno_texts'] = res
    if not os.path.exists("submit"):
        os.mkdir("submit")

    dtf.to_csv(os.path.join('submit', 'results.csv'))
Example #8
def generate_challenge_run(model, test_data, run_file=None):
    """Generate the test predictions in the format of the runs for the challenge.
       Optionally write the run in a csv file

        EXAMPLE:
        with open('experiments/random_forest_model.pkl', 'rb') as f:
            model = pickle.load(f)

        test_data = '../data/test_size1_noclc_scaled_pca.csv'
        run_file = 'runs/random_forest_run.csv'

        run_df = generate_challenge_run(model, test_data, run_file)
        display(run_df)
    """
    # loading the test environmental data
    test_df = pd.read_csv(test_data, sep=';', header='infer', quotechar='"')

    # the resulting run dataframe
    run_df = pd.DataFrame(
        columns=['glc19TestOccId', 'glc19SpId', 'Rank', 'Probability'])

    # TO DO: FINISH IT
    # ...

    # save the run in a csv file
    if run_file is not None:
        run_df.to_csv(run_file, sep=';', index=False, quotechar='"')

    return run_df
Example #9
def stock_data(
    code=None,
    start_time=None,
    end_time=None,
):
    path = ct.PATH % (os.getcwd(), code, time, ktype)
    pd.to_csv(os.getcwd())
Example #10
def remove_stops_file(filepath):
    datafile = pd.read_csv(filepath)
    tweets_list = datafile['Tweet']
    filtered_tweets_list = []
    for tweet in tweets_list:
        filtered_tweet = remove_stops(tweet, stop_words_std)
        filtered_tweets_list.append(filtered_tweet)
    new_tweet_list_column_values = pd.Series(filtered_tweets_list)
    datafile.insert(loc=len(datafile.columns), column='filtered_tweet', value=new_tweet_list_column_values)
    datafile.to_csv(filepath + 'filtered', index=False)
Example #11
def main():
    with open('futmondo.html', 'r', encoding="utf8") as f:
        transfers = f.readlines()
        transfers = [x.strip() for x in transfers]

    transfers = [transform_data(t) for t in transfers]

    df = pd.DataFrame(transfers,
                      columns=['Player', 'Date', 'Time', 'Amount', 'From', 'To'])
    df.to_csv('test.csv')
Example #12
def getEntropyData(startIndex, type, termList, workList, year, meal):
    """
    Fetch the entropy data.
    """
    stype, entropyDict = figureEntropy(startIndex, type, termList, workList,
                                       year, meal)
    df = DataFrame(entropyDict)
    print(df)
    result = 'D:/GraduationThesis/Data/' + stype + '_entropy.csv'
    df.to_csv(result)
Example #13
def stringify_pandas(df):
    """
    Print a DataFrame as a pretty table.

    :param df: A DataFrame
    :return:
    """
    output = StringIO()
    df.to_csv(output)
    output.seek(0)  # rewind the buffer before prettytable reads it

    pt = prettytable.from_csv(output)
    print(pt)
Example #15
def store_best_attributes_for_label(nb, df, class_feature, cor_score=0.01):
    '''
    Store the best attributes in the corresponding file.
    Parameters
    ==========
    nb: int
        Label to use.
    + Same parameters as best_cor_attributes().
    '''
    relevant_features = best_cor_attributes(nb, df, class_feature, cor_score)
    relevant_features.to_csv(path_best_features(nb))
Example #16
    def fetchTimeTable(self, refresh=False):

        logging.debug("Fetching Time Table...")

        if not refresh:
            try:
                logging.debug("Loading time table.")
                self.timeTable = pd.read_csv("data/" + self.username +
                                             "_TimeTable.csv")
                return
            except Exception as e:
                logging.error("No time table found.")
                self.timeTable = None

        try:
            logging.debug("Loading driver...")
            driver = webdriver.Chrome()
            logging.info("Driver loaded.")

            driver.get(self.url)

            logging.debug("Finding elements...")
            user = driver.find_element_by_id("tbUserName")
            pswd = driver.find_element_by_id("tbPassword")

            logging.debug("Filling creds...")
            user.send_keys(self.username)
            pswd.send_keys(self.password)
            pswd.send_keys(Keys.RETURN)
            logging.info("Logged in.")

            logging.debug("Closing pop-ups...")
            pop_up = driver.find_element_by_id("popup_ok")
            pop_up.send_keys(Keys.RETURN)

            logging.debug("Fetching Time Table text...")
            tt = driver.find_element_by_id("divMytimetable")

            logging.info("Time Table fetched!")

            self.timeTable = self.parseRawTimeTable(tt)
            pd.to_csv("data/" + self.username + "_" + "TimeTable.csv",
                      index=False)

        except Exception as e:
            logging.exception(e)
            logging.error("Unable to fetch time table!")

        finally:
            driver.close()
Example #17
 def write(self, obj, filename):
     '''
     Writes ebLink object to file.
     '''
     if filename is None:
         filename = 'links_' + datetime.today().strftime('%y%m%d-%H%M%S') + '.ebout'
     try:
         obj.to_csv(filename)
     except:
         try:
             pd.DataFrame(obj).to_csv(filename)
         except:
             with open(filename, 'w') as f:
                 f.write(str(obj))
Example #18
 def write_csv(self, obj, name, mode="None", index=False, header=False, check_suffix=True):
     """
     write csv
     """
     if mode == "data":
         name = osp.join(self.rootdir, self.inputdir, name)
     elif mode == "model":
         name = osp.join(self.rootdir, self.pretraindir, name)
     elif mode == "result":
         name = osp.join(self.rootdir, self.outputdir, name)
     if check_suffix and name.split('.')[-1] != "csv":
         name+=".csv"
     obj.to_csv(name, index=index, header=header)
     if self.notify:
         self.notify.notify(text=f"Export : {name}")
Example #19
def export_data(dt_from,
                dt_to,
                states=None,
                districts=None,
                crops=None,
                water_sources=None):
    def parse_string(s):
        r = '('
        for i in s:
            r += i + ' ,'
        r = r[:-2]
        r += ')'
        return r

    import pandas as pd
    import sqlalchemy as sql
    conn = sql.create_engine(
        'postgresql://*****:*****@ec2-54-221-220-59.compute-1.amazonaws.com/db70oouohkh4bj'
    )
    query = 'SELECT * FROM fact_table, weather_dim, crop_dim, location_dim ' + ' WHERE fact_table.location_id=location_dim.location_id ' + ' AND irrigation_date >= ' + '\'' + str(
        dt_from) + '\' AND irrigation_date <= ' + '\'' + str(dt_to) + '\''
    if states != None:
        query += ' AND state IN ' + parse_string(states)
    if districts != None:
        query += ' AND district IN ' + parse_string(districts)
    if crops != None:
        query += ' AND crop_name IN ' + parse_string(crops)
    if water_sources != None:
        query += ' AND water_source IN ' + parse_string(water_sources)
    df = pd.read_sql(query + ';', conn)
    conn.dispose()  # release the engine's connection pool
    return df.to_csv()
Example #20
def get_submission(model, loader, test_ids, device):
    all_preds = []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            score = model(x)
            prediction = score.float()

            all_preds += prediction.tolist()
    model.train()

    df = pd.DataFrame({
        "ID_code": test_ids.values,
        "target": np.array(all_preds)
    })

    pd.to_csv("../data/santander/sub.csv", index=False)
Example #21
def main():
    logging.getLogger().setLevel(logging.DEBUG)
    if not os.path.isdir(f'{NFLSTATS_DIR}/data'):
        os.mkdir(f'{NFLSTATS_DIR}/data')
    if not os.path.isdir(f'{NFLSTATS_DIR}/data/draft'):
        os.mkdir(f'{NFLSTATS_DIR}/data/draft')
    
    years = 1992,2019+1 # 1992 is first year that targets were recorded
    for year in range(*years):
        logging.info('scraping for year %s', year)
        url = f'https://www.pro-football-reference.com/years/{year}/draft.htm'
        page = urlopen(url)
        soup = BeautifulSoup(page, 'lxml')
        # select() instead of find() returns a list
        table_rows = soup.select('#drafts tr')
        pd = get_players(table_rows, ignore_cols=['career_av', 'draft_av', 'college_id', 'college_link'])
        pd['year'] = year
        fout = f'{NFLSTATS_DIR}/data/draft/class_{year}.csv'
        pd.to_csv(fout, index=False)
Example #22
def r1_ensemble():
    result_files = ['results_testb_vgg19.csv', 'results_testb_bcnn.csv', 'results_testb_vgg19_2.csv']
    results = []
    for result in result_files:
        results.append(pd.read_csv(os.path.join(output_dir, result)))

    img_names = []
    preds = []
    for i in range(len(results[0])):
        filename = results[0].loc[i]['filename']
        prob = 0
        for x in range(3):
            prob += weight[x] * results[x].loc[i]['probability']
        img_names.append(filename)
        preds.append(prob)

    preds = [round(x-1e-4, 4) for x in preds]
    data = {'filename':img_names, 'probability':preds}
    df = pd.DataFrame(data, columns=['filename', 'probability'])
    df.to_csv('../output/results_testb.csv', index=False)
Example #23
    def add_new_data(JH_master, last_pull, JH_data_dir, root_dir):

        # identify csv's to add to master
        JH_files = os.listdir()[1:-1]

        new_JH_files = JH_files[JH_files.index(last_pull):]
        new_JH_files = [JH_data_dir + '/' + x for x in new_JH_files]

        for i in new_JH_files:

            new_data = pd.read_csv(i)

            new_data['Last_Update'] = pd.to_datetime(
                new_data['Last_Update'], format='%Y-%m-%d %H:%M:%S')

            new_data['Last_Update'] = new_data['Last_Update'].dt.date

            JH_master = pd.concat([JH_master, new_data], ignore_index=True)

        JH_master.to_csv(root_dir + '/' + 'JH_master.csv')  # output file name is assumed here
Example #24
def create_model():
    print("Begin Classificaton....")
    feature_csv = 'D:\\My Source Codes\\Projects-Python' \
                  '\\TextBaseEmotionDetectionWithEnsembleMethod\\NewDataset\\features6clL.csv'
    RFmodel_save_csv = 'D:\\My Source Codes\\Projects-Python\\TextBaseEmotionDetectionWithEnsembleMethod\\Models\\RF\\'
    DTmodel_save_csv = 'D:\\My Source Codes\\Projects-Python\\TextBaseEmotionDetectionWithEnsembleMethod\\Models\\DT\\'
    MLPmodel_save_csv = 'D:\\My Source Codes\\Projects-Python\\TextBaseEmotionDetectionWithEnsembleMethod\\' \
                        'Models\\MLP\\'
    pd = DataFrame(columns=('ModelType', 'ModelName', 'Score', 'F1-Score', 'ErrorRate', 'Feature-Count', 'Train-Size'))
    x, y = loaddata(feature_csv, 100)
    for i in range(1, 500):
        np.random.seed(42)
        indices = sample(range(1, x.shape[0]), 6000)
        test_size = int(0.1 * len(indices))
        X_train = x[indices[:-test_size]]
        Y_train = y[indices[:-test_size]]
        X_test = x[indices[-test_size:]]
        Y_test = y[indices[-test_size:]]

        ModelName = "Model_KNN_" + str(i) + ".pkl"
        F1_Score, Score, ErrorRate = rf_model(RFmodel_save_csv + ModelName, X_train, Y_train
                                              , X_test, Y_test)
        pd.loc[len(pd)] = ["KNN ", ModelName , Score, F1_Score, ErrorRate, 0, 0]
        print(ModelName + ", Model Type=KNN , With Score Result " + str(Score) + " and Feature Count="
              + str(100))

        ModelName = "Model_RF_" + str(i) + ".pkl"
        F1_Score, Score, ErrorRate = dt_model(DTmodel_save_csv + ModelName, X_train, Y_train, X_test, Y_test)
        pd.loc[len(pd)] = ["Random Forest", ModelName, Score, F1_Score, ErrorRate, 0, 0]
        print(ModelName + ", Model Type=Random Forest , With Score Result " + str(Score) + " and Feature Count="
              + str(100))

        ModelName = "Model_MLP_" + str(i) + ".pkl"
        F1_Score, Score, ErrorRate = mlp_model(MLPmodel_save_csv + ModelName, X_train, Y_train, X_test, Y_test)
        pd.loc[len(pd)] = ["MLP Neural Network", ModelName, Score, F1_Score, ErrorRate, 0, 0]
        print(ModelName + ", Model Type=Neural Network , With Score Result " + str(Score) + " and Feature Count="
              + str(100))

    pd.to_csv("D:\\My Source Codes\\Projects-Python\\TextBaseEmotionDetectionWithEnsembleMethod\\Models\dataset.csv",
              mode='a', header=True, index=False)
    print("End Classification...")
Example #25
def write_to_csv(pd):
    if os.path.exists('output.csv'):
        try:
            pd.to_csv('output.csv',
                      mode='a',
                      index=False,
                      encoding="gbk",
                      header=False)
            # print(pd)
            print('Successful writing!')
            time.sleep(3)
        except:
            print('error')
    else:
        try:
            pd.to_csv('output.csv', mode='a', encoding="gbk", index=False)
            # print(pd)
            print('Successful writing!')
            time.sleep(3)
        except:
            print('error!')
Example #26
def measure_board(input_filename):
    tuner_set = pd.read_csv(input_filename)
    output = pd.DataFrame()
    for i in range(len(tuner_set)):
        this_tuner = measure_tuner(tuner_set.iloc[i])
        local_pcb = local_pcb_height(tuner_set.iloc[i])
        tuner_depth = this_tuner[1] - local_pcb - pcb_thickness
        tuner_length = this_tuner[1] - this_tuner[0]
        # add the x, y, depth, and length values to the output data frame
        output.loc[len(output)] = [tuner_set.iloc[i, 0], tuner_set.iloc[i, 1], tuner_depth, tuner_length]
    output_csv = output.to_csv()
    return output_csv
Example #27
def main():

    dir = '../data'
    data = pandas.read_csv(join(dir, 'train.csv'))

    data = impute(data)
    data = normalize(data)
    print('-------- entire data set after normalize -------------')
    print(data.head(20))
    print(data.isnull().sum())

    train, validate, test = numpy.split(
        data.sample(frac=1),
        [int(.6 * len(data)), int(.8 * len(data))])

    X = train.values[:, 1:]
    y = train.values[:, 0:1]

    test_X = test.values[:, 1:]
    test_y = test.values[:, 0:1]

    # rt = None
    print('-------- test data ---------')
    rt = pandas.read_csv(join(dir, 'test.csv'))
    passenger_ids = rt[['PassengerId']].values[:, 0]

    print(rt.head(20))
    print(rt.isnull().sum())
    rt = impute(rt)
    rt = normalize(rt)
    print('-------- test data after processing ---------')
    print(rt.head(20))
    print(rt.isnull().sum())

    prediction = build_model(X, y, test_X, test_y, rt.values)
    pd = pandas.DataFrame()
    pd['PassengerId'] = passenger_ids
    pd['Survived'] = prediction
    pd.to_csv(join(dir, 'submission.csv'), index=False)
Example #28
def main():
    from_http = bool(int(sys.argv[1]))
    file_name = str(sys.argv[2])
    data = read_data(file_name=file_name, from_http=from_http)
    data = process_data(data)
    data.to_csv(get_dataset_directory() / "2018_processed_data.zip",
                compression="zip")
    save_name = str(sys.argv[3])
    if save_name == '2018_harmonized_shipper_sym':
        bag_of_words = create_BoW_harmonized_shipper(data)
        alpha = 'symmetric'
    elif save_name == '2018_harmonized_shipper_asym':
        bag_of_words = create_BoW_harmonized_shipper(data)
        alpha = 'asymmetric'
    elif save_name == '2018_shipper_harmonized_sym':
        bag_of_words = create_BoW_shipper_harmonized(data)
        alpha = 'symmetric'
    elif save_name == '2018_shipper_harmonized_asym':
        bag_of_words = create_BoW_shipper_harmonized(data)
        alpha = 'asymmetric'
    else:
        print('not recognized')
    corpus = create_corpus(bag_of_words, save_name, save=True)
    id2word = create_id2word(bag_of_words, save_name, save=True)
    num_topics = int(sys.argv[4])
    model = compute_lda(save_name, corpus, num_topics, id2word, alpha=alpha)

    # For visualization
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(model, corpus, dictionary, save_name, num_topics)
    # For document_topic_distribution
    document_topic_distribution(corpus,
                                bag_of_words,
                                model,
                                save_name,
                                num_topics,
                                minimum_probability=0.10)
Example #29
def two_layer_stack(train_x, train_y, test, mode = "full", save_path = "", save_name = ""):
    if mode == "full":
        clf_names = ["XGBRegressor", "LGBMRegressor", "Lasso", "Ridge"]
        classifiers = [
            XGBRegressor(max_depth = 3, learning_rate = 0.1, n_estimators = 100),
            LGBMRegressor(num_leaves = 31, max_depth = 4, learning_rate = 0.1, n_estimators = 100),
            Lasso(alpha = 0.1),
            Ridge(alpha = 0.5)
        ]
        layer_1_train, layer_1_test = stack_layer(clf_names, classifiers, train_x, train_y, test)

    if mode == "save":
        clf_names = ["XGBRegressor", "LGBMRegressor", "Lasso", "Ridge"]
        classifiers = [
            XGBRegressor(max_depth = 3, learning_rate = 0.1, n_estimators = 100, n_jobs = -1),
            LGBMRegressor(num_leaves = 31, max_depth = 4, learning_rate = 0.1, n_estimators = 100, n_jobs = -1),
            Lasso(alpha = 0.1),
            Ridge(alpha = 0.5)
        ]
        layer_1_train, layer_1_test = stack_layer(clf_names, classifiers, train_x, train_y, test)
        layer_1_train.to_csv(save_path + save_name)
        layer_1_test.to_csv(save_path + save_name)

    if mode == "read":
        layer_1_train = pd.read_csv(save_path + save_name)
        layer_1_test = pd.read_csv(save_path + save_name)

    clf_names = ["XGBRegressor", "LGBMRegressor", "Lasso", "Ridge"]
    classifiers = [
        XGBRegressor(max_depth = 3, learning_rate = 0.1, n_eatimators = 100),
        Lasso(alpha = 0.1)]
    _, layer_2_test = stack_layer(clf_names, classifiers, layer_1_train, train_y, layer_1_test)

    layer_2_test = np.average(layer_2_test,axis = 1)

    return layer_2_test
Example #30
def get_game_play_by_play(game_id, write_to_file_path):
    if os.path.isfile(write_to_file_path):
        return pd.read_csv(write_to_file_path)

    pbp_url = f'{NBA_BASE_URL}/playbyplayv2'
    headers = {'User-Agent': 'test',}
    params = {
        'EndPeriod': '4',
        'EndRange': '55800',
        'GameID': game_id,
        'RangeType': '2',
        'Season': '2019-20',
        'SeasonType': 'Regular Season',
        'StartPeriod': '1',
        'StartRange': '0',
    }
    response = requests.get(pbp_url, headers=headers, params=params)
    results = response.json()['resultSets']
    headers = results[0]['headers']
    plays = results[0]['rowSet']
    df = pd.DataFrame(plays, columns=headers)
    df.to_csv(write_to_file_path, index=False)

    return df
Example #31
def prep_germany(cached=False):
    '''
    
    This function pulls and preps the Germany Energy Consumption dataframe for exploration
    
    if cached == False: collects the csv from the url
    if cached == True: pulls the already saved dataframe
    
    '''

    if cached == False:
        # url to opsd_germany_daily.csv
        url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'
        # uses pull_csv function from acquire.py to collect the dataset
        df = a.pull_csv(url)
        # caches the dataset as a csv
        df.to_csv('opsd_germany_daily.csv')

    # cached == True
    else:
        # pulls csv as data from
        df = pd.read_csv('opsd_germany_daily.csv')

    # Lowercases the columns and renames 'wind+solar' columns to 'wind_and_solar'
    df.columns = df.columns.str.lower()
    df.rename(columns={'wind+solar': 'wind_and_solar'}, inplace=True)

    # Convert date to datetime and set date as index
    df.date = pd.to_datetime(df.date)
    df.set_index(df.date, inplace=True)

    # Creates the month and year columns
    df['month'] = df.index.month
    df['year'] = df.index.year

    # Fills nulls with 0
    df.fillna(0, inplace=True)

    return df
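
A minimal usage sketch for the function above (it assumes acquire.py is importable as a, as in the original snippet; the columns shown are ones the function itself creates or renames):

# first call downloads and caches the CSV; later calls reuse the cached copy
germany = prep_germany(cached=False)
germany = prep_germany(cached=True)
print(germany[['wind_and_solar', 'month', 'year']].head())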
Example #32
    # fields to ignore in the output
    eliminate = [] if keep_original else list(fields_to_bucket)
    if ignore:
        eliminate.extend(ignore)
    columns = [column for column in columns if column not in eliminate]
    # check if we have the necessary fields for complete PUMAs
    if full_pumas:
        if ('STATEFIP' not in columns or 'PUMA' not in columns) and 'PUMASUPR' not in columns:
            raise ValueError("For the --full_pumas option to work, you need to have both 'PUMA' and 'STATEFIP' in your original data request.")
    reader = pd.read_fwf(input_file, encoding='utf-8', dtype=str, header=None, names=names, colspecs=colspecs, chunksize=chunksize)
    for dataframe in reader:
        if full_pumas:
            dataframe['PUMA'] = dataframe['PUMASUPR'] if 'PUMASUPR' in dataframe.columns else dataframe['STATEFIP'] + dataframe['PUMA']
        mode = 'a' if os.path.exists(output_file) else 'w'
        header = (mode == 'w')
        dataframe.to_csv(output_file, encoding='utf-8', sep='\t', mode=mode, header=header, index=False, quoting=QUOTE_NONE)
        total += len(dataframe)
        print('Finished parsing {total} lines in {delta} seconds.'.format(total=total, delta=delta))
    print('Wrote output to {output_file}.'.format(output_file=output_file))


def create_bucketer(bucket):
    def _bucketer(value):
        for definition in bucket:
            if definition['lower'] <= float(value) <= definition['upper']:
                return definition['label']
        return value
    return lru_cache(_bucketer)
Example #33
def save_table(tab, pwd):
    tab.to_pickle(pwd)
    tab.to_csv(pwd)
Example #34
url = "https://pypi.python.org/pypi/%s/json"

# Load functions
folder = "/home/vanessa/Documents/Dropbox/Code/Python/repofish/analysis/pypi"
packages = pandas.read_csv("%s/pypi.tsv" %folder,sep="\t",index_col=0)

meta_folder = "%s/packages" %(folder)

if not os.path.exists(meta_folder):
    os.mkdir(meta_folder)

# We will keep track of rows to drop
drop = []

for row in packages.iterrows():
    package_name = row[1].package
    output_file = "%s/%s.json" %(meta_folder,package_name)
    if not os.path.exists(output_file):
        time.sleep(1)
        print "parsing %s of %s" %(row[0],packages.shape[0])
        response = requests.get(url %package_name)
        if response.status_code == 200:
            save_json(response.json(),output_file)
        else:
            print "Error getting meta data for package %s" %(package_name)
            drop.append(row[0])

# Remove the functions we don't have meta data for
packages = packages.drop(drop)
packages.to_csv("%s/pypi_filtered.tsv" %folder, sep="\t")
Example #35
"""
Appends bad data that has tripped up my algorithm before.
"""
import pandas as pd

df = pd.read_csv('good_data.csv')

# Have some categories be missing.
df = df[df.purpose != 'credit_card']
df = df[df.home_ownership != 'RENT']
df = df[df.grade != 'A']

# Populate with 0 values
df.loc[df.index[[2, 3, 45, 12]], 'annual_inc'] = 0

df.to_csv('bad_data.csv', index=False)
Example #36
import os, sys; from pdb import set_trace
from dectree import *
import pandas as pd 
walk = os.walk

def explore(dir):
 datasets = []
 for (dirpath, dirnames, filenames) in walk(dir):
    datasets.append(dirpath)

 training = []
 testing = []
 for k in datasets[1:]:
  train = [[dirPath, fname] for dirPath, _, fname in walk(k)]
  test = [train[0][0] + '/' + train[0][1].pop(-1)]
  training.append([train[0][0] + '/' + p for p in train[0][1] if not p == '.DS_Store']);
  testing.append(test)
 return training, testing


train, test = explore('./')
data = [train[i]+test[i] for i in xrange(1,len(test))]
template=pd.read_csv(data[0][0], header = 0).columns.get_values().tolist();
for i in data[1:]:
 for k in i:
  tmp = pd.read_csv(k)
  tmp.to_csv(path_or_buf = k ,  header = template, index=False)
set_trace()
pd.read_csv(data[1][0]).to_csv(data[1][0], header=template, index=False)
Example #37
'''
Builds a model (TO BE DETERMINED).

Authors:
Paul Hendricks

Date:
2015-12-15

inputs:
train.h5
test.h5
sample_submission.csv

outputs:
01-model.h5
'''

# Load libraries
import pandas as pd

# Load data
prepped_path = './data/prepped/'
train = pd.read_hdf(prepped_path + 'train.h5', 'table')
test = pd.read_hdf(prepped_path + 'test.h5', 'table')
submission = pd.read_csv('./data/submission/sample_submission.csv')

# Write data out
pd.to_csv('./data/models/01-model.h5')
Example #38
import codecs
import pandas as pd
from spelling.features import levenshtein_distance as dist
from spelling.jobs import DistanceToNearestStem

df = pd.read_csv('data/aspell-dict.csv.gz', sep='\t', encoding='utf8')
job = DistanceToNearestStem()
df = job.run(df.word, dist)
# TODO: Merge the two aspell-dict files.
df.to_csv('data/aspell-dict-distances.csv.gz', index=False, sep='\t',
          encoding='utf8', compression='gzip')
Example #39
# Filtering (x itself is not modified)
x.drop(1)  # drop by index label; x is not modified
x.drop(x.index[1])  # drop by position; x is not modified
x[0.1 != x.values]

'''
# Data import, export and processing
'''

# Data import
pd.read_csv(file, encoding)
pd.read_table(file, names=[,], sep="", encoding)
pd.read_excel(file, sheetname, names, engine)  # engine='python' can work around non-ASCII (e.g. Chinese) file paths

# Data export
df.to_csv(filepath, sep=",", index=True, header=True)  # index: write the row index; header: write the column names

# Removing duplicates
df.duplicated()  # flag duplicated rows (boolean values)
df.duplicated(index, key)  # flag duplicates by column, based on the key values; key may be omitted
df.drop_duplicates()  # drop duplicate rows
df.drop_duplicates(index)  # drop duplicates by column

# Missing data
pd.read_csv(file, na_values)  # treat the given value(s) as NA while reading the file
df.isnull()  # boolean DataFrame marking missing values
df[df.isnull()]  # filter to locate the rows or columns containing NA
df.dropna()  # drop rows that contain NA
df.fillna()  # fill missing values

# Information extraction
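
A small self-contained sketch tying the calls above together (the file name and sample values are invented for illustration):

import pandas as pd

# build a tiny frame, export it, then read it back treating -1 as missing
df = pd.DataFrame({'city': ['Beijing', 'Beijing', 'Shanghai'], 'sales': [10, 10, -1]})
df.to_csv('demo.csv', sep=',', index=False, header=True)

df2 = pd.read_csv('demo.csv', na_values=[-1])  # -1 becomes NaN on import
df2 = df2.drop_duplicates()                    # drops the repeated Beijing row
df2 = df2.fillna(0)                            # fills the NaN introduced above
print(df2)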
Example #40
#!/usr/bin/env python
import os
os.chdir("../UCS/")
import pandas as pd
UCS_mappings = pd.read_csv("UCS.mappings.csv", index_col=0, usecols=['barcode','TCGA.ID','histology'])
a = UCS_mappings
UCS_patient = pd.read_csv('../Minfi_Output/UCS_mset.csv', index_col=0)
UCS =UCS_patient.head().stack().head()
UCS
b = UCS_patient
c= b.transpose()
c.index.name = 'barcode'
d = c
final = pd.merge(a,d, on=a.index, how='outer')
finals = final.head()
finals.rename(columns={'key_0':'barcode'}, inplace=True)
final['Sample_Type'] = ["Normal" if '-20A-' in col else "Tumor" for col in final['TCGA.ID']] 
final.to_csv('UCS_df.csv')

###															###
###															###
###			   			IMPORTING DATA						###
###															###
###															###
###############################################################

'''
read csv
'''

dframe = pd.read_csv('file_name.csv', header = None)
dframe = pd.read_csv('file_name.csv', header = None, nrows = 20)
dframe = pd.read_table('file_name.txt', sep = ';' , header = None)

dframe.to_csv('output_file_name.csv')

'''
read html
pip install beautiful-soup
pip install html5lib
'''

url = 'https://www.fdic.gov/bank/individual/failed/banklist.html'
from pandas import read_html

dframe_list = pd.io.html.read_html(url) # read from url and puts the data into a list of dataframe objects
dframe = dframe_list[0]

'''
read excel