Example #1
def logistic_test_using_cosine(score_feature=False):
    logger.info('using cosine features in logistic regression')
    if score_feature:
        logger.info('also use score feature')
    Cs = [2**t for t in range(0, 10, 1)]
    Cs.extend([3**t for t in range(1, 10, 1)])
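    # The grid combines powers of 2 and powers of 3, giving LogisticRegressionCV a coarse,
    # roughly log-spaced set of inverse-regularization strengths to search over.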
    snli2cosine = SNLI2Cosine('/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin')
    logger.info('loading snli data ...')
    train_df = pd.read_csv('./snli/snli_1.0/snli_1.0_train.txt', delimiter='\t')
    train_df = train_df[pd.notnull(train_df.sentence2)]
    train_df = train_df[train_df.gold_label != '-']
    train_df = train_df[:len(train_df) // 3]
    train_df.reset_index(inplace=True)
    test_df = pd.read_csv('./snli/snli_1.0/snli_1.0_test.txt', delimiter='\t')
    test_df = test_df[pd.notnull(test_df.sentence2)]
    test_df = test_df[test_df.gold_label != '-']
    test_df.reset_index(inplace=True)
    X_train, train_labels, X_test, test_labels = snli2cosine.calculate_cosine_features(train_df, test_df)
    if score_feature:
        y_train_proba, y_test_proba = joblib.load('./snli/logistic_score_snli.pkl')
        # y_train_proba = y_train_proba.flatten()
        # y_test_proba = y_test_proba.flatten()
        X_train = np.concatenate([X_train, y_train_proba.reshape((-1, 1))], axis=1)
        X_test = np.concatenate([X_test, y_test_proba.reshape((-1, 1))], axis=1)
    logger.info('X_train.shape: {0}'.format(X_train.shape))
    logger.info('X_test.shape: {0}'.format(X_test.shape))

    logreg = LogisticRegressionCV(Cs=Cs, cv=3, n_jobs=10, random_state=919)
    logreg.fit(X_train, train_labels)
    logger.info('best C is {0}'.format(logreg.C_))
    y_test_predicted = logreg.predict(X_test)
    acc = accuracy_score(test_labels, y_test_predicted)
    logger.info('test data predicted accuracy: {0}'.format(acc))
Example #2
def validate_label_generation():
    mals1_df = pd.read_csv('data/sorted-train-labels-vs251-252.csv')
    mals2_df = pd.read_csv('data/sorted-train-labels-vs263-264-apt.csv')

    counter = 0
    m1_x = np.array(mals1_df['malware_type_x'])
    m1_f = np.array(mals1_df['family_name'])
    m1_sl = np.array(mals1_df['sample_label'])
    m1_fl = np.array(mals1_df['family_label'])
    m2_x = np.array(mals2_df['malware_type_x'])
    m2_f = np.array(mals2_df['family_name'])
    m2_sl = np.array(mals2_df['sample_label'])
    m2_fl = np.array(mals2_df['family_label'])
    
    for idx1, mname1 in enumerate(m1_x):
        for idx2, mname2 in enumerate(m2_x):
            if mname1 == mname2:
                if m1_sl[idx1] != m2_sl[idx2]:
                    print("Sample label incongruence: {:d} {:d}".format(m1_sl[idx1], m2_sl[idx2]))
                    counter += 1
                    
                if (m1_fl[idx1] != m2_fl[idx2]):
                    print("Family label incongruence: {:d} {:d}".format(m1_fl[idx1], m2_fl[idx2]))
                    counter += 1            
        
        if (idx1 % 1000) == 0:
            print("Processed {:d} malware names.".format(idx1))


    print("Total Incongruence Errors: {:d}".format(counter))
    
    return
Example #3
    def __init__(self, path_to_file,
                 batch_size=32,
                 skip_header=False,
                 column_id=0,
                 column_label=1,
                 column_path=2):
        """
        Constructor. Just reads the file and creates the two lists to be used.
        :param path_to_file: Where the file resides
        :param batch_size: how many image to return as per minibatch
        :param skip_header: Does the file have a header?
        :param column_id: Column number on where the item ID resides
        :param column_label: Column number on where the label is stored
        :param column_path: Column number to get the relative path
        """
        try:
            if skip_header:
                corpus_df = pd.read_csv(path_to_file)
            else:
                corpus_df = pd.read_csv(path_to_file, header=None)
        except OSError:
            raise TK1CorpusBuilderError("{} not found".format(path_to_file))

        # let's shuffle the corpus
        corpus_df = corpus_df.sample(frac=1).reset_index(drop=True)

        # alright, let's store the lists then
        self.batch_size = batch_size
        self.ids = corpus_df[corpus_df.columns[column_id]].values
        self.labels = corpus_df[corpus_df.columns[column_label]].values
        self.image_path = corpus_df[corpus_df.columns[column_path]].values

        # we're done here, deleting stuff
        del corpus_df
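
# A minimal usage sketch for the constructor above. The class name TK1CorpusBuilder and the
# CSV layout (id, label, relative path) are assumptions and are not shown in the snippet.
# builder = TK1CorpusBuilder('corpus.csv', batch_size=16, skip_header=True)
# print(builder.ids[:3], builder.labels[:3], builder.image_path[:3])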
Example #4
def file2dataframe():
    dir = "C:\\Users\\wyq\\Desktop\\WikiDataAnalyse\\data\\target_prediction\\"
    links = pd.read_csv(dir + 'links.tsv', sep='\t', header=None)
    paths = pd.read_csv(dir + 'paths_finished.tsv', sep='\t')
    paths["path"] = paths["path"].apply(lambda x: x.split(';'))
    vectors = normalize()
    return links, paths, vectors
Example #5
def generate_sample_labels(av_report_file, out_report_file, label_file):
    mals = pd.read_csv(av_report_file)
    labels = pd.read_csv(label_file)
    # Now generate a unique scalar label map. We use WinDefender as the default classification;
    # if WinDefender reports OK and ClamAV does not, use the ClamAV classification. If both
    # report OK, default to a label value of 0 for now.
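    # For illustration (hypothetical values): if WinDefender reports 'Trojan.Gen' while the
    # ClamAV column says 'OK', the loop below copies 'Trojan.Gen' into column 1 and, if that
    # name is not yet in scalar_label_map, assigns it the next unused scalar label.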
    type_x = np.array(mals['malware_type_x'])
    type_y = np.array(mals['malware_type_y'])
    scalar_labels = [0] * mals.shape[0]
    counter, scalar_label_map = get_sample_labels(mals, labels) # Get the malware label dict.
    
    for idx, y_val in enumerate(type_y):
        if y_val != 'OK':
            mals.iloc[idx,1] = y_val # copy the defender classification to ClamAV classification

        # Now update the label map with a new scalar label values
        if mals.iloc[idx,1] not in scalar_label_map.keys():
            counter += 1
            scalar_label_map[mals.iloc[idx,1]] = counter
            
        # now get the scalar label for this malware sample
        scalar_labels[idx] = scalar_label_map[mals.iloc[idx,1]]

        if (idx % 1000) == 0: # report progress
            print("Processed label: {:d} {:s} -> {:d}.".format(idx, mals.iloc[idx,1], scalar_labels[idx]))
    
    mals['sample_label'] = scalar_labels

    mals.to_csv(out_report_file, index=False)

    save_sample_labels(scalar_label_map)
    
    return
Example #6
def main():
    res = []
    num_iterations = params['num_iterations']
    early_stopping_round = params['early_stopping_round']
    print(params)
    for i in range(cnt):
        train_fea = pd.read_csv(root_path + 'train_score_{}.csv'.format(i))
        train_lab = pd.read_csv(root_path + 'label_{}.csv'.format(i))
        train_lab = train_lab.loc[:, 'label'].values

        lgb_train = lgb.Dataset(train_fea, train_lab)

        solver = lgb.train(params, lgb_train, \
                           valid_sets=[lgb_train], \
                           valid_names=['train'], \
                           verbose_eval=True, \
                           num_boost_round=num_iterations, \
                           early_stopping_rounds=early_stopping_round)

        pred_fea = pd.read_csv(root_path + 'res_score.csv')
        pred_fea = pred_fea.drop([i], axis=1).values
        res.append(solver.predict(pred_fea, num_iteration=solver.best_iteration))
        pd.DataFrame(np.array(res).T).to_csv(root_path + \
                                             'res_score2.csv', index=False)

    res = np.mean(res, axis=0)
    pred_pair = pd.read_csv(root_path + 'test1.csv')
    pred_pair['score'] = res
    pred_pair['score'] = pred_pair['score'].apply(lambda x: '{:.6f}'.format(x))
    pred_pair.to_csv(root_path + 'submission-5000-layer2.csv', index=False)
Example #7
def gera(nome_teste, nome_pred):
    pred = pd.read_csv('dados/'+ nome_teste, delimiter=' ', usecols=[0, 1], header=None, names=['alvo', 'preco'])
    out = pd.read_csv('dados/'+ nome_pred, delimiter=' ', usecols=[0], header=None, names=['resultado'])
    
    print(len(pred))
    print(len(out))
    
    errosx = []
    errosy = []
    acertosx = []
    acertosy = []
    precosx = []
    precosy = []
    for i in range(0, len(pred)):
        precosx.append(i)
        precosy.append(float(pred['preco'][i][2:]))
        if pred['alvo'][i] == out['resultado'][i]:
            acertosx.append(i)
            acertosy.append(float(pred['preco'][i][2:]))
        else:
            errosx.append(i)
            errosy.append(float(pred['preco'][i][2:]))
            

    plt.plot(precosx, precosy)
    plt.plot(errosx, errosy, 'rx')
    plt.plot(acertosx, acertosy, 'x')
    plt.show()
Example #8
def getData(folderList, shapes, trips, stopTimes, calendar, frequencies):
    for folder in folderList:
        print('Adding data from ' + folder + '.')

        # Read the files from the data.
        readShapes = pd.read_csv('../' + folder + '/shapes.txt')[shapeData]
        readTrips = pd.read_csv('../' + folder + '/trips.txt')[routeData]
        readStopTimes = pd.read_csv('../' + folder + '/stop_times.txt')[timeData]
        readCalendar = pd.read_csv('../' + folder + '/calendar.txt')[calendarData]

        # Append it to the existing data.
        shapes = pd.concat([shapes, readShapes])
        trips = pd.concat([trips, readTrips])
        stopTimes = pd.concat([stopTimes, readStopTimes])
        calendar = pd.concat([calendar, readCalendar])

        if os.path.isfile('../' + folder + '/frequencies.txt'):
            readFrequencies = pd.read_csv('../' + folder + '/frequencies.txt')
            frequencies = pd.concat([frequencies, readFrequencies])

        # Calculate the number of missing shapes.
        num_shapes = trips.groupby('route_id').size()
        num_validshapes = trips[trips.shape_id.isin(shapes.shape_id)].groupby('route_id').size()
        num_missingshapes = num_shapes - num_validshapes
        percent_missingshapes = num_missingshapes / num_shapes * 100
        print('Missing data from ' + folder + ':')
        num_missingshapesList = num_missingshapes[num_missingshapes != 0]
        if not num_missingshapesList.empty:
            print(num_missingshapesList)
            print(percent_missingshapes[percent_missingshapes != 0])
        else:
            print('No data missing.\n')

    return lists(shapes, trips, stopTimes, calendar, frequencies)
Example #9
def main(output=RESULTS1B):
    """
    Using 1 nearest neighbor, predicts NYC Taxi trip times based on feature
    vectors (pickup latitude, pickup longitude, dropoff latitude, dropoff longitude).

    Tests on a subset of trip_data_1.csv

    Uses sklearn to implement nearest neighbors
    """
    features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 
               'dropoff_longitude', 'trip_time_in_secs']

    ## Extract necessary data into pandas dataframes
    numrows = 100000
    df_train_read = pd.read_csv(TRAIN_DATA)
    df_test_read = pd.read_csv(TRIP_DATA_1, nrows = numrows)    # first 100k rows, for speed
    df_test = df_test_read[features].dropna()
    df_train = df_train_read[features].dropna() 


    ## Use sklearn to run nearest neighbors
    k = 1 
    clf = KNeighborsClassifier(n_neighbors=k)                   # default distance metric: euclidean
    clf.fit(df_train[features[0:4]], df_train[features[-1]])
    preds = clf.predict(df_test[features[0:4]])

    # # Calculate statistics (Root Mean Squared Error, Correlation Coefficient, Mean Absolute Error)
    print "Calculating statistics"
    with open(output, "a+") as outputFile:
        outputFile.write("Ran knn with k={}".format(k) + \
            " Trained on {}. Tested on first".format(TRAIN_DATA) + \
            " {} rows of {}. Stats:".format(numrows, TRIP_DATA_1))
    calcAndLogStats( numpy.array(preds), 
                     numpy.array(df_test[features[-1]]), 
                     output=output)
Example #10
def fit_montecarlo_tree(path_index, paths = None, index_filter=None, class_filter=None,
                        feature_filter=None, folds=10):
    """A diferencia de fit tree, este metodo recibe todos los paths. Entrena solo con uno, indicado
    por path index. Pero luego por orden, voy abriendo todos los sets para clasificar.
    """
    data = pd.read_csv(paths[path_index], index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)

    results = []
    for train_index, test_index in skf:
        train_X = data.iloc[train_index]
        train_y = y.iloc[train_index]

        clf = None
        clf = tree.Tree('gain', max_depth=10, min_samples_split=20)

        clf.fit(train_X, train_y)
        # result = clf.predict_table(test_X, test_y)
        # results.append(result)

    # Now classify every dataset with this tree
    for path in paths:
        data = pd.read_csv(path, index_col=0)
        data, y = utils.filter_data(data, index_filter, class_filter, feature_filter)
        results.append(clf.predict_table(data, y))

    return pd.concat(results)
Example #11
def load_data(dev_mode=True):
    '''Loads data: dev_mode=True splits the train set in train and test'''
    # Load data
    node_info = pd.read_csv(pth('node_information.xls'), header=None)
    node_info.columns = ['id', 'date', 'og_title', 'authors', 'journal', 'og_abstract']
    
    train = pd.read_csv(pth('training_set.txt'), sep=' ', header=None)
    train.columns = ['id1', 'id2', 'link']
    
    test = pd.read_csv(pth('testing_set.txt'), sep=' ', header=None)
    test.columns = ['id1', 'id2']
    
    
    # Split train into train and test
    if dev_mode:
        prop = 0.75
        idx_perm = np.random.permutation(range(len(train)))
        test = train.iloc[idx_perm[int(len(train)*prop):]]
        train = train.iloc[idx_perm[:int(len(train)*prop)]]
    
    # pre-process node_info 
    if isinstance(node_info.authors.iloc[0], str) or isinstance(node_info.authors.iloc[0], float):
        node_info.authors = node_info.authors.str.split(', ')
        node_info.loc[node_info.authors.isnull(), 'authors'] = node_info[node_info.authors.isnull()].apply(lambda x: [], axis=1)
  
    return node_info, train, test
Example #12
def run():
    batch_size = 4000

    global signatures
    signatures = get_pickled_signatures()

    pool = avito_utils.PoolWrapper(processes=4)
    name = 'ssim'

    print('processing train data...')
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_%s_train.csv' % name)

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_%s_train.csv' % name)

    print('processing train data took %0.5fs' % (time() - t0))

    print('processing test data...')
    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_%s_test.csv' % name)

    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        features = process_batch(batch, pool)
        append_to_csv(features, 'features_%s_test.csv' % name)
        
    print('processing test data took %0.5fs' % (time() - t0))

    pool.close()
Example #13
def load_dataset(path):
    stores_df = pandas.read_csv('data/store.csv')
    stores_df = stores_df.fillna(-1)
    stores_df['StoreType'] = LabelEncoder().fit_transform(stores_df['StoreType'])
    stores_df['Assortment'] = LabelEncoder().fit_transform(stores_df['Assortment'])
    # Dropping yields a better performance than:
    # - Giving each month a boolean column
    # - Replace the string with a count of the months
    stores_df = stores_df.drop('PromoInterval', axis=1)

    annotated_df = pandas.read_csv(path, parse_dates=['Date'], dtype={'StateHoliday': object})
    # Dropping yields a better performance than:
    # - Label encoding
    annotated_df = annotated_df.drop('StateHoliday', axis=1)
    # Ugly but fast way to convert the Date column into useful, separate columns
    (
        annotated_df['DayOfWeek'],
        annotated_df['IsWeekend'],
        annotated_df['DayOfMonth'],
        annotated_df['Month'],
        annotated_df['Year']
    ) = zip(*annotated_df['Date'].map(split_date))
    annotated_df = annotated_df.drop('Date', axis=1)
    annotated_df = annotated_df.fillna(-1)

    # Merging dataset and stores
    return pandas.merge(annotated_df, stores_df, on='Store', how='inner', sort=False)
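
# For reference, a minimal split_date helper consistent with the tuple unpacked above;
# this is an assumption, since the original helper is not shown in the snippet.
def split_date(date):
    # date is a pandas Timestamp; return (day_of_week, is_weekend, day_of_month, month, year)
    return date.dayofweek, int(date.dayofweek >= 5), date.day, date.month, date.year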
Example #14
def data_collection_stats():
	print(check_output(["ls", "../input"]).decode("utf8"))
	train_images = check_output(["ls", "../input/train_photos"]).decode("utf8")
	print(train_images[:])
	print('time elapsed ' + str((time.time() - config.start_time)/60))

	print('Reading data...')
	train_photos = pd.read_csv('../input/train_photo_to_biz_ids.csv')
	train_photos.sort_values(['business_id'], inplace=True)
	train_photos.set_index(['business_id'], inplace=True)

	test_photos = pd.read_csv('../input/test_photo_to_biz.csv')
	test_photos.sort_values(['business_id'], inplace=True)
	test_photos.set_index(['business_id'], inplace=True)

	train = pd.read_csv('../input/train.csv')
	train.sort_values(['business_id'], inplace=True)
	train.reset_index(drop=True, inplace=True)

	print('Number of training samples: ', train.shape[0])
	print('Number of train samples: ', len(set(train_photos['business_id'])))
	print('Number of test samples: ', len(set(test_photos['business_id'])))
	print('Finished reading data...')
	print('Time elapsed: ' + str((time.time() - config.start_time)/60))

	print('Reading/Modifying images..')

	return (train_photos, test_photos, train)
Example #15
    def download_stock_list(self, response):
        exchange = response.meta['exchange']
        path = files_contract.get_security_list_path('stock', exchange)
        df = pd.read_csv(io.BytesIO(response.body), dtype=str)
        if df is not None:
            if os.path.exists(path):
                df_current = pd.read_csv(path, dtype=str)
                df_current = df_current.set_index('code', drop=False)
            else:
                df_current = pd.DataFrame()

            df = df.loc[:, ['Symbol', 'Name', 'IPOyear', 'Sector', 'industry']]
            df = df.dropna(subset=['Symbol', 'Name'])
            df.columns = ['code', 'name', 'listDate', 'sector', 'industry']
            df.listDate = df.listDate.apply(lambda x: to_time_str(x))
            df['exchange'] = exchange
            df['type'] = 'stock'
            df['id'] = df[['type', 'exchange', 'code']].apply(lambda x: '_'.join(x.astype(str)), axis=1)
            df['sinaIndustry'] = ''
            df['sinaConcept'] = ''
            df['sinaArea'] = ''
            df = df.set_index('code', drop=False)

            diff = set(df.index.tolist()) - set(df_current.index.tolist())
            diff = [item for item in diff if item != 'nan']

            if diff:
                df_current = df_current.append(df.loc[diff, :], ignore_index=False)
                df_current = df_current.loc[:, STOCK_META_COL]
                df_current.columns = STOCK_META_COL
                df_current.to_csv(path, index=False)
Example #16
def map_GO_to_GTEX():
    inputFilename = '../data/GO_terms_final_gene_counts.txt'
    GO_list_file = open(inputFilename)
    GO_list = np.loadtxt(GO_list_file,skiprows=2,usecols=[0],dtype='S10',delimiter='\t')

    inputFilename = '../data/Tissue_Name_Mappings.csv'
    tissue_data = pd.read_csv(inputFilename,header=None)
    map_BTO_to_GTEX = defaultdict(list)

    for index,row in tissue_data.iterrows():
        GTEX_tissue = row[0]
        BTO_tissues = row[1:]
        for tissue in BTO_tissues.dropna():
            map_BTO_to_GTEX[tissue].append(GTEX_tissue)

    inputFilename = '../data/BTO_GO.csv'
    BTO_data = pd.read_csv(inputFilename,skiprows=[0])
    map_GO_to_GTEX = defaultdict(list)

    for index,row in BTO_data.iterrows():
        tissue = row[1]
        if tissue in map_BTO_to_GTEX:
            GO_IDs = row[2:]
            for GO_ID in GO_IDs.dropna():
                if GO_ID in GO_list:
                    map_GO_to_GTEX[GO_ID] = list(set(map_GO_to_GTEX[GO_ID] + map_BTO_to_GTEX[tissue]))

    #inputFile.close()
    return map_GO_to_GTEX
Example #17
def run():
    global mongo, scaler
    mongo = MongoWrapper(avito_utils.avito_db)
    scaler = prepare_scaler()

    batch_size = 8000
    name = 'imagemagick'

    pool = avito_utils.PoolWrapper()

    t0 = time()
    df = pd.read_csv('../input/ItemPairs_train.csv')
    delete_file_if_exists('features_%s_train.csv' % name)
    print('read train set, start processing...')
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        batch = process_batch(batch, pool)
        append_to_csv(batch, 'features_%s_train.csv' % name)
    print('processing train set took %0.5fs' % (time() - t0))

    t0 = time()
    df = pd.read_csv('../input/ItemPairs_test.csv')
    delete_file_if_exists('features_%s_test.csv' % name)
    print('read test set, start processing...')
    for batch_no, batch in tqdm(list(prepare_batches(df, batch_size))):
        batch = process_batch(batch, pool)
        append_to_csv(batch, 'features_%s_test.csv' % name)
    print('processing test set took %0.5fs' % (time() - t0))

    pool.close()
Example #18
def order_hist(CreateGroupList,num,f):
    order = pd.read_csv('./B/jdata_user_order.csv', parse_dates=['o_date'])
    sku = pd.read_csv('./B/jdata_sku_basic_info.csv', )
    order = pd.merge(order, sku, on='sku_id', how='left')
    target_order = order[(order.cate == 101) | (order.cate == 30)].reset_index(drop=True)
    first_day = datetime.datetime.strptime('2016-08-31 00:00:00', '%Y-%m-%d %H:%M:%S')
    target_order['o_day_series'] = (target_order['o_date'] - first_day).apply(lambda x: x.days)

    target_order = target_order.sort_values(by=['user_id','o_day_series'], ascending=False).reset_index(drop=True)

    alld = []
    for CG in CreateGroupList:
        CreateGroup = CG
        t = target_order[target_order.o_day_series < CreateGroup]
        features =[]
        for i in range(num):
            t2 = t[['user_id',f]].groupby(['user_id']).shift(-i)
            t2.columns = t2.columns + '_{}'.format(i)
            features.append(t2.columns[0])
            t = pd.concat([t,t2],axis=1)
        x = t.drop_duplicates(subset=['user_id'])
        x = x[['user_id'] + features]
        x['CreateGroup'] = CreateGroup
        alld.append(x)
    df = pd.concat(alld).reset_index(drop=True)
#    print(np.unique(df.CreateGroup))
    return df
Example #19
def read_input(**kwargs):
    """ Read CSV-files

    Parameters
    ----------
    **kwargs : key word arguments
        Arguments passed from command line

    Returns
    -------
    nodes_flows : DataFrame
        Containing data for nodes and flows.
    nodes_flows_seq: DataFrame
        Data for sequences.
    """

    nodes_flows = pd.read_csv(kwargs['NODE_DATA'], sep=kwargs['--sep'])
    nodes_flows_seq = pd.read_csv(kwargs['SEQ_DATA'],
                                  sep=kwargs['--sep'],
                                  header=None)
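    # Reshape the sequence sheet: drop all-empty rows, drop the first column, transpose so
    # sequences run down the rows, and turn the first five original rows (columns 0-4 after
    # the transpose) into a MultiIndex before casting the values to float.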
    nodes_flows_seq.dropna(axis=0, how='all', inplace=True)
    nodes_flows_seq.drop(0, axis=1, inplace=True)
    nodes_flows_seq = nodes_flows_seq.transpose()
    nodes_flows_seq.set_index([0, 1, 2, 3, 4], inplace=True)
    nodes_flows_seq.columns = range(0, len(nodes_flows_seq.columns))
    nodes_flows_seq = nodes_flows_seq.astype(float)

    return nodes_flows, nodes_flows_seq
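
# A hedged usage sketch: the kwargs mirror the keys read above and would normally come from
# the command-line parser; the file names are placeholders.
# kwargs = {'NODE_DATA': 'nodes_flows.csv', 'SEQ_DATA': 'nodes_flows_seq.csv', '--sep': ','}
# nodes_flows, nodes_flows_seq = read_input(**kwargs)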
Example #20
def main():

    df = pd.read_csv("../OUTPUT/segmentation_results_k-means.csv", delimiter=",", skipinitialspace=True)

    df_api = pd.read_csv("../OUTPUT/usersInfoAPI.csv", delimiter=",", skipinitialspace=True)

    # aggregate male, female and null
    df_api["sesso"] = df_api["sesso"].replace("F", "f")
    df_api["sesso"] = df_api["sesso"].replace("M", "m")
    df_api["sesso"] = df_api["sesso"].replace("N", "n")
    df_api["sesso"] = df_api["sesso"].fillna('n')

    df_friends = pd.read_csv("../OUTPUT/network_degree_node.csv", delimiter=",", skipinitialspace=True)

    df_merged = pd.merge(df_api, df, left_on="user_id", right_on="user_id", how='right')

    df_merged = pd.merge(df_friends, df_merged, left_on="user_id", right_on="user_id", how='right')
    df_merged["sesso"] = df_merged["sesso"].fillna('n')
    # df_merged["data_reg"] = pd.to_datetime(df_merged['data_reg'])

    # print df_merged["degree_initial_network"].mean()
    # generi = df_merged["sesso"].values.tolist()
    # counter_sex = Counter(generi)
    # sex_dict = dict(counter_sex)
    # print sex_dict
    # # date_time = datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
    #
    # # print datetime.datetime.fromtimestamp(int(df_merged["data_reg"].mean()))
    # sys.exit()

    # plt.style.use("dark_background")

    k_means_analysis(df_merged)
Example #21
	def read_file(self):
		# get the training data
		X_train_raw = pd.read_csv(self.file_dir + self.X_train_file)
		# we're going to do some sampling to get rid of skew in data 
		# first we'll get the row nums where the relevance is in a range of values
		# <2 (group1); <2.5 & >2 (group2); >2.5 & <3 (group3); == 3 (group4)
		X_train_g1 = X_train_raw.loc[X_train_raw['relevance'] < 2]
		X_train_g2 = X_train_raw.loc[(X_train_raw['relevance'] > 1.9) & (X_train_raw['relevance'] < 2.4)]
		X_train_g3 = X_train_raw.loc[(X_train_raw['relevance'] > 2.6) & (X_train_raw['relevance'] < 3)]
		X_train_g4 = X_train_raw.loc[X_train_raw['relevance'] == 3]
		# THEN we take samples based on those (so our final train data is proportional between the ranges)
		# final samples (w/out replacement)
		X_train_g2_s = X_train_g2.sample(n = X_train_g1.shape[0], replace=False)
		X_train_g3_s = X_train_g3.sample(n = X_train_g1.shape[0], replace=False)
		X_train_g4_s = X_train_g4.sample(n = X_train_g1.shape[0], replace=False)
		# stack them up: this is our final X_train
		X_train_comp = X_train_g1.append(X_train_g2_s)
		X_train_comp = X_train_comp.append(X_train_g3_s)
		X_train_comp = X_train_comp.append(X_train_g4_s)
		self.X_train = X_train_comp.drop(['id', 'product_uid', 'relevance'], axis=1)
		self.y_train = X_train_comp['relevance']
		# get the testing data 
		X_test_raw = pd.read_csv(self.file_dir + self.X_test_file)
		self.X_test = X_test_raw.drop(['id', 'product_uid'], axis=1)
		self.fin_df = X_test_raw.drop(['product_uid', 'prod_query_raw_cosine_tfidf', 'prod_query_fixes_cosine_tfidf','des_query_raw_cosine_tfidf','des_query_fixes_cosine_tfidf','kw_matches_overall', 'kw_matches_title', 'kw_matches_des'], axis=1)
Example #22
    def _read_data(self, data):
        if isinstance(data, pd.core.frame.DataFrame):
            tax_dta = data
        elif isinstance(data, str):
            if data.endswith("gz"):
                tax_dta = pd.read_csv(data, compression='gzip')
            else:
                tax_dta = pd.read_csv(data)
        else:
            msg = ('Records.constructor data is neither a string nor '
                   'a Pandas DataFrame')
            raise ValueError(msg)
        # remove the aggregated record from 2009 PUF
        tax_dta = tax_dta[tax_dta.recid != 999999]
        self.dim = len(tax_dta)
        # create variables in NAMES list
        for attrname, varname in Records.NAMES:
            setattr(self, attrname, tax_dta[varname].values)
        for name in Records.ZEROED_NAMES:
            setattr(self, name, np.zeros((self.dim,)))
        self._num = np.ones((self.dim,))
        # specify eNNNNN aliases for several pNNNNN and sNNNNN variables
        self.e22250 = self.p22250
        self.e04470 = self.p04470
        self.e23250 = self.p23250
        self.e25470 = self.p25470
        self.e08000 = self.p08000
        self.e60100 = self.p60100
        self.e27860 = self.s27860
        # specify SOIYR
        self.SOIYR = np.repeat(Records.PUF_YEAR, self.dim)
Example #23
def create_filtered_matod(city):	
	# read nodes
	print('Reading nodes')
	fid = '/home/michael/mit/ods_and_roads/%s/%s_nodes_algbformat.txt'%(city, city)
	nodes = pd.read_csv(fid, sep=' ')
	N = nodes.nid.as_matrix()

	print('Reading MatOD')
	fid = '/home/michael/mit/ods_and_roads/%s/%s_interod_0_1.txt' %(city, city)
	matod = pd.read_csv(fid, sep=' ')

	print('Filtering')
	o = matod.o.as_matrix()
	d = matod.d.as_matrix()
	b = [False] * len(o)
	c = 0
	for k in range(len(o)):
		if o[k] in N and d[k] in N:
			b[k] = True
			c += 1
	print('Number of excluded edges %d of %d' %(len(o) - c, len(o)))
	matod = matod[b]

	print('Saving file')
	fid = '/home/michael/mit/instances/tables/%s_table_od.csv' % city
	matod.to_csv(fid, sep=' ', index=False)
	print('Done')
Example #24
  def test_spread_2(self):
    input_df = DplyFrame(pd.read_csv(StringIO("""country,year,key,value
1,Afghanistan,1999,cases,745
2,Afghanistan,1999,population,19987071
3,Afghanistan,2000,cases,2666
4,Afghanistan,2000,population,20595360
5,Brazil,1999,cases,37737
6,Brazil,1999,population,172006362
7,Brazil,2000,cases,80488
8,Brazil,2000,population,174504898
9,China,1999,cases,212258
10,China,1999,population,1272915272
11,China,2000,cases,213766
12,China,2000,population,1280428583""")))
    input_pd = DplyFrame(pd.read_csv(StringIO("""country,year,cases,population
Afghanistan,1999,745,19987071
Afghanistan,2000,2666,20595360
Brazil,1999,37737,172006362
Brazil,2000,80488,174504898
China,1999,212258,1272915272
China,2000,213766,1280428583""")))
    spread_test_df_1 = input_df >> spread(X.key, X.value)
    spread_test_df_2 = spread(input_df, X.key, X.value)
    spread_test_df_3 = input_df >> group_by(X.key) >> spread(X.key, X.value)
    self.assertTrue(input_pd.equals(spread_test_df_1))
    self.assertTrue(input_pd.equals(spread_test_df_2))
    self.assertTrue(input_pd.equals(spread_test_df_3))
Example #25
def predict():
    converters = dict(DRUNK_DR=convertDD, RAIL=convertRAIL, TWAY_ID=convertTWAYID)
    acc_train_df = pandas.read_csv('accident_train.csv', converters=converters)
    acc_train_df = acc_train_df.fillna(0)
    acc_test_df = pandas.read_csv('accident_test.csv', converters=converters)
    acc_test_df = acc_test_df.fillna(0)
    ids = acc_test_df['ID'].get_values()
    print "CSVs read in"

    columns = list(acc_train_df.columns)
    for c in columns_to_remove:
        print(c)
        columns.remove(c)
    columns.remove("YEAR") # test data doesn't have this key for some reason
    labels = acc_train_df['DRUNK_DR'].get_values()
    data_train = acc_train_df[columns]
    acc_test_df = acc_test_df[columns]

    xgtrain = xgboost.DMatrix(data_train, label=labels)
    xgtest = xgboost.DMatrix(acc_test_df)

    watchlist = [(xgtrain, 'train')]
    bst = xgboost.train(params, xgtrain, num_rounds, watchlist)
    preds = modifyPreds(bst.predict(xgtest))
    
    with open('submission.csv', 'w') as f:
        f.write("ID,DRUNK_DR\n")
        for i, id_ in enumerate(ids):
            f.write("{},{}\n".format(id_, preds[i]))
Example #26
  def test_semi_join_dplyr_2(self):
    # bivariate keys
    j_test_1 = self.c >> semi_join(self.d)
    j_test_2 = self.d >> semi_join(self.c)
    j_pd_1 = DplyFrame(pd.read_csv(StringIO("""x,y,a
1,1,1
1,1,2
2,2,3""")))
    j_pd_2 = DplyFrame(pd.read_csv(StringIO("""x,y,b
1,1,1
2,2,2
2,2,3""")))
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
    # include column names
    j_test_1 = self.c >> semi_join(self.d, by=['x', 'y'])
    j_test_2 = self.d >> semi_join(self.c, by=['x', 'y'])
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
    # use different column names
    alt_c = self.c.rename(columns={'x': 'x_2'})
    j_test_1 = alt_c >> semi_join(self.d, by=[('x_2', 'x'), 'y'])
    j_test_2 = self.d >> semi_join(alt_c, by=[('x', 'x_2'), 'y'])
    j_pd_1 = DplyFrame(pd.read_csv(StringIO("""x_2,y,a
1,1,1
1,1,2
2,2,3""")))
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
Example #27
  def test_anti_join_dplyr_2(self):
    # bivariate keys
    j_test_1 = self.c >> anti_join(self.d)
    j_test_2 = self.d >> anti_join(self.c)
    j_pd_1 = DplyFrame(pd.read_csv(StringIO("""index,x,y,a
3,3,3,4""")).set_index(['index']))
    j_pd_1.index.name = None
    j_pd_2 = DplyFrame(pd.read_csv(StringIO("""index,x,y,b
3,4,4,4""")).set_index(['index']))
    j_pd_2.index.name = None
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
    # use column names
    j_test_1 = self.c >> anti_join(self.d, by=['x', 'y'])
    j_test_2 = self.d >> anti_join(self.c, by=['x', 'y'])
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
    # use different column names
    alt_c = self.c.rename(columns={'x': 'x_2'})
    j_test_1 = alt_c >> anti_join(self.d, by=[('x_2', 'x'), 'y'])
    j_test_2 = self.d >> anti_join(alt_c, by=[('x', 'x_2'), 'y'])
    j_pd_1 = DplyFrame(pd.read_csv(StringIO("""index,x_2,y,a
3,3,3,4""")).set_index(['index']))
    j_pd_1.index.name = None
    self.assertTrue(j_test_1.equals(j_pd_1))
    self.assertTrue(j_test_2.equals(j_pd_2))
Example #28
    def get_ticks_by_date(self, symbol, begin_date, end_date, hours="regular", parse_dates=False, nrows=None):
        dates = self.parse_dates(begin_date, end_date)

        suffix = self.get_file_suffix(hours)
        filenames = [symbol + s + ".csv.gz" for s in suffix]

        if parse_dates:
            tick_data = pd.DataFrame(columns=["type", "price", "size", "exch", "cond"])
        else:
            tick_data = pd.DataFrame(columns=["datetime", "type", "price", "size", "exch", "cond"])

        for date in dates:
            for filename in filenames:
                data_path = os.path.join(self.base_dir, date, filename)
                if not os.path.exists(data_path):
                    continue
                    #print "cannot find", data_path
                    #raise IOException("Data file not found: %s" % data_path)
                if parse_dates:
                    dateparse = lambda x: pd.datetime.strptime(x+"000", '%m/%d/%Y %H:%M:%S.%f')
                    cur_ticks = pd.read_csv(data_path, compression="gzip", names=["datetime", "type", "price", "size", "exch", "cond"], parse_dates=[0], date_parser=dateparse, index_col=0, nrows=nrows)
#                    cur_ticks = pd.read_csv(data_path, compression="gzip", names=["datetime", "type", "price", "size", "exch", "cond"], parse_dates=[0], index_col=0, nrows=nrows)
                else:
                    cur_ticks = pd.read_csv(data_path, compression="gzip", names=["datetime", "type", "price", "size", "exch", "cond"], nrows=nrows)
                tick_data = tick_data.append(cur_ticks)
        return tick_data
Example #29
    def load_annotations(self):
        self.num_annotators = 0

        self.annotations = []
        self.locations = []

        self.targets = None

        targets_file_name = os.path.join(self.path, 'targets.csv')
        if os.path.exists(targets_file_name):
            self.targets = pd.read_csv(targets_file_name)

        while True:
            annotation_filename = "{}/annotations_{}.csv".format(self.path, self.num_annotators)
            location_filename = "{}/location_{}.csv".format(self.path, self.num_annotators)

            if not os.path.exists(annotation_filename):
                break

            self.annotations.append(pd.read_csv(annotation_filename))
            self.locations.append(pd.read_csv(location_filename))

            self.num_annotators += 1

        self.annotations_loaded = self.num_annotators != 0
Example #30
def main():
	# Get the data and targets
	df = pd.read_csv('train1.csv')
	df = df[df.rating != 'rating']
	corpus = [review for review in df.review]
	splitPoint = len(corpus) * 2 // 3
	trainingCorpus = corpus[:splitPoint]
	testCorpus = corpus[splitPoint:]
	target = [rating for rating in df.rating]
	trainingTarget = np.array(target[:splitPoint])
	testTarget = np.array(target[splitPoint:])

	# Train the algorithm
	train_X, vocabList = createVectorizer(trainingCorpus, 'None', True)
	NB_Bern_model = BernoulliNB().fit(train_X, trainingTarget)

	# Test the algorithm
	test_X = createVectorizer(testCorpus, vocabList, True)
	test_predict = NB_Bern_model.predict(test_X)
	print(np.mean(test_predict == testTarget))	
	print(metrics.classification_report(testTarget, test_predict, target_names=['0', '1']))

	# Make Predictions
	predict_df = pd.read_csv('test2.csv')
	predictCorpus = [review for review in predict_df.review]
	member = [memberid for memberid in predict_df.ID]
	predict_X = createVectorizer(predictCorpus, vocabList, True)
	predictions = NB_Bern_model.predict(predict_X)
	predict_df.columns = ['ID', 'Predicted']
	for i in range(len(member)):
		predict_df.loc[predict_df['ID'] == member[i], 'Predicted'] = predictions[i]
	predict_df.to_csv('submission1.csv', sep = ',', index=False)
Example #31
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score


def run_naive_bayes(X_train, y_train, X_test, y_test, _alpha=0.5):
    # clf = MultinomialNB(alpha=_alpha)
    clf = GaussianNB()
    clf.fit(X_train, y_train)

    predictions_count = clf.predict(X_test)
    print("f1 score: ", f1_score(y_test, predictions_count))
    print("accuracy score: ", accuracy_score(y_test, predictions_count))


if __name__ == "__main__":
    # read data
    df_train = pd.read_csv("../data/train_opt.csv", sep=',')
    df_train['Comment'] = df_train['Comment'].fillna(' ')
    df_test = pd.read_csv("../data/test_opt.csv", sep=',')
    df_test['Comment'] = df_test['Comment'].fillna(' ')

    # labels
    y_train = df_train['Insult']
    y_test = df_test['Insult']

    count_vectorizer = CountVectorizer(min_df=3)
    X_train = count_vectorizer.fit_transform(df_train['Comment'])
    X_test = count_vectorizer.transform(df_test['Comment'])

    run_naive_bayes(X_train.toarray(), y_train, X_test.toarray(), y_test, 1)
Example #32
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error



print("AIRFOIL SELF-NOISE")

names = ['Frequency','Angle-Attack','Chord-Length','Free-stream-velocity','Suction-thickness','SSPresure-level']
data = pd.read_csv('./datos/airfoil_self_noise.dat',names = names,sep="\t")


print("PREPROCESADO")


print("Matriz de correlación")
corr_matrix = data.corr()
k = 6 # Número de variables en el heatmap
cols = corr_matrix.nlargest(k, 'Frequency')['Frequency'].index
cm = np.corrcoef(data[cols].values.T)
plt.subplots(figsize=(9,9))
sns.set(font_scale=0.75)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
Example #33
import numpy as np 
import pandas as pd 
import tensorflow as tf
import nltk
import pickle
from nltk.corpus import stopwords

train = pd.read_csv('conversation/data/train.csv')
x_train = train.iloc[:, 0].values
y_train = train.iloc[:, 1:2].values

all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

from nltk.stem.porter import PorterStemmer
import re

stemmer = PorterStemmer()
corpus = []
for i in x_train:
	text = re.sub('[^a-zA-Z]', ' ', i)
	text = text.lower()
	text = text.split()
	text = [stemmer.stem(word) for word in text if word not in set(all_stopwords)]
	text = ' '.join(text)
	corpus.append(text)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x_train = cv.fit_transform(corpus)
pickle.dump(cv, open('conversation/save/count_vectorizer.pickle', 'wb'))
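
# A usage sketch for reloading the fitted vectorizer saved above (same path assumed):
# with open('conversation/save/count_vectorizer.pickle', 'rb') as f:
#     cv = pickle.load(f)
# x_new = cv.transform(['a new sentence to encode'])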
Example #34
import optuna
from sklearn.metrics import log_loss
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

train = pd.read_csv('../titanic/train.csv')
test = pd.read_csv('../titanic/test.csv')
sub = pd.read_csv('../titanic/gender_submission.csv')

data = pd.concat([train, test], sort=False)

data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
data['Embarked'].fillna('S', inplace=True)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)

age_avg = data['Age'].mean()
age_std = data['Age'].std()
data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)

delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

print(data.head())

train = data[:len(train)]
test = data[len(train):]
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)
Example #35
# PROCESSING
import time
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
# LOAD THE DATA
os.getcwd()

os.chdir("C://Users//Sony//Desktop//TESIS 2")

df = pd.read_csv('CIC_AWS_Filtrado.csv')
df.head(10)

# PREPARE THE DATA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
import sklearn.metrics
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics

df1 = df[['Dst_Port','Protocol','Flow_Duration','Tot_Fwd_Pkts','Tot_Bwd_Pkts',
          'TotLen_Fwd_Pkts','TotLen_Bwd_Pkts','Fwd_Pkt_Len_Mean','Fwd Pkt Len Max','Fwd Pkt Len Min',
Example #36
import numpy as np
import pandas as pd


def checksum(m):
    rows = len(m)
    cols = len(m[0])

    diff_sum = 0
    for i in range(rows):
        min = 1000000
        max = 0
        for j in range(cols):
            if m[i, j] <= min:
                min = m[i, j]
            if m[i, j] >= max:
                max = m[i, j]
        diff_sum += abs(max - min)

    return diff_sum


csvfile = pd.read_csv("day2_input.csv", sep="\t", header=None)
m = np.array(csvfile)

print("The matrix is: ")
print(m)
answer = checksum(m)
print("The checksum is: " + str(answer))
"""Simulation file used to run the model"""
import time
from spillover_model_calRA import *
from spillover_model import *
from calibration_functions import *
import pandas as pd
from stochasticprocess import *
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import math



df_inflation = pd.read_csv(r'C:\Users\jrr\Dropbox\International Spillovers\Data\inflation\CPI_96.csv')
df_interest = pd.read_excel(r'C:\Users\jrr\Dropbox\International Spillovers\Data\interest_rates\deposit_rates.xls', sheet_name='data')
df_penn = pd.read_excel(r'C:\Users\jrr\Dropbox\International Spillovers\Data\inflation\penworldtable90.xlsx', sheet_name='Sheet5')
df_penn = df_penn.drop(df_penn[df_penn.year != 2014].index)

ROW_countries = ['Argentina', 'Australia', 'Bermuda', 'Botswana', 'Brazil', 'Canada', 'Chile', 'China', 'Colombia', 'Czech Republic', 'Denmark', 'HongKong', 'Hungary', 'India', 'Indonesia', 'Israel', 'Japan', 'Kuwait', 'Lebanon', 'Liechtenstein', 'Malaysia', 'Mexico', 'Monaco', 'Namibia', 'New Zealand', 'Norway', 'Oman', 'Pakistan', 'Peru', 'Philippines', 'Puerto Rico', 'Poland', 'Russia', 'Singapore', 'South Africa', 'Korea', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'United Kingdom', 'United States', 'Venezuela', 'Vietnam']


inflation = {df_inflation.iloc[i][0]:df_inflation.iloc[i][20] for i in range(len(df_inflation))}
interest = {df_interest.iloc[i][0]:df_interest.iloc[i][3] for i in range(len(df_interest))}
rgdp = {df_penn.iloc[i][1]: df_penn.iloc[i][5]  for i in range(len(df_penn))}

inf = {}
int = {}
gdp = {}
for i in ROW_countries:
    try:
        if math.isnan(float(inflation[i])) != True:
Example #38
    def simulatedata(self):
        nr.seed(seed=79819)
        plt.close('all')

        QUIC20 = pd.read_csv(fname)
        #startTime = pd.Timestamp(dt.datetime(2014, 07, 05, 12, 00, 20))
        #endTime = pd.Timestamp(dt.datetime(2014, 07, 05, 18,  00, 00))
        startTime = pd.Timestamp(dt.datetime(2014, 11, 1, 0, 0, 0))
        endTime = pd.Timestamp(dt.datetime(2014, 11, 30, 0, 0, 0))
        TimeStamp = pd.date_range(startTime, endTime, freq='20s')
        QUIC20['dateTime'] = pd.date_range(startTime, endTime, freq='20s')
        QUIC20.BaseSim = QUIC20.BaseSim * 10**7
        QUIC20.RemoteSim = QUIC20.RemoteSim * 10**7
        QUIC20.index = QUIC20.dateTime
        QUIC = QUIC20.resample('1s', fill_method='pad')
        QUIC['dateTime'] = QUIC.index.copy()
        rawdat = importSPOD(datafolder, 1, startTime, endTime)
        QUIC['U'] = rawdat['U']
        QUIC['V'] = rawdat['V']
        QUIC['WS'] = rawdat['WS']
        QUIC['Time'] = pd.to_datetime(
            QUIC.index.copy()).astype('int').astype(float) / (10**18)

        # Simulate Data
        Num = len(QUIC)
        QUIC['BaseBM'] = genBrownianBridge(Num) + 1.5

        NewBase = QUIC.BaseSim + QUIC.BaseBM
        NewBase[NewBase > 5] = 5
        NewBase[NewBase < 0.25] = nr.randn(len(
            NewBase[NewBase < 0.25])) * .05 + 0.25
        QUIC['Base'] = NewBase

        QUIC['RemoteBM'] = genBrownianBridge(Num)
        NewRemote = QUIC.RemoteSim + QUIC.RemoteBM
        NewRemote[NewRemote > 5] = 5
        NewRemote[NewRemote < 0.25] = nr.randn(len(
            NewRemote[NewRemote < 0.25])) * .05 + 0.25
        QUIC['Remote'] = NewRemote

        # Plot Simulated Data
        font = {'weight': 'bold', 'size': 8}
        mpl.rc('font', **font)

        baseTotal = ggplot(aes(x='dateTime', y='Base'), data=QUIC) +\
            geom_line() +\
            ylim(0,5) +\
            geom_line() + xlab("") + ylab("Simulated Signal (V)")

        baseRand = ggplot(aes(x='dateTime', y='BaseBM'), data=QUIC) +\
            geom_line() + xlab("") + ylab("Stochastic Baseline")

        baseSim = ggplot(aes(x='dateTime', y='BaseSim'), data=QUIC) +\
            geom_line()+\
            ylim(0,5) + xlab("") + ylab("Simulated Signal (V)")

        remoteTotal = ggplot(aes(x='dateTime', y='Remote'), data=QUIC) +\
            geom_line() + xlab("") + ylab("Simulated Signal (V)")
        #   theme_matplotlib(mpl.rc('font', **font), matplotlib_defaults=False)

        ggsave(plot=baseTotal,
               filename=figfolder + 'BaseTotal.png',
               width=8,
               height=2)
        ggsave(plot=baseRand,
               filename=figfolder + 'BaseRand.png',
               width=8,
               height=2)
        ggsave(plot=baseSim,
               filename=figfolder + 'BaseSim.png',
               width=8,
               height=2)
        ggsave(plot=remoteTotal,
               filename=figfolder + 'remoteTotal.png',
               width=8,
               height=2)

        # Illustrate Method
        fitMinSpline(QUIC['Base'][QUIC.index.min():QUIC.index.min() +
                                  pd.Timedelta(freqT, 'h')],
                     QUIC['Time'][QUIC.index.min():QUIC.index.min() +
                                  pd.Timedelta(freqT, 'h')],
                     smoothingWindow,
                     plot=True,
                     plotVar=QUIC.dateTime)
        ggsave(filename=figfolder + 'Spline_fit.png', width=8, height=2)

        QUICFilt = applyFilters(
            QUIC[QUIC.index.min():QUIC.index.min() + pd.Timedelta(freqT, 'h')],
            thresh1, thresh2, smoothingWindow)
        butterplot = ggplot(aes(x='dateTime', y='butterBase'), data=QUICFilt) + geom_line() +\
                    ylim(0,5) +\
                    xlab('') + ylab('Sensor after Butterworth')
        ggsave(plot=butterplot,
               filename=figfolder + 'Butterworth_filt.png',
               width=8,
               height=2)

        # Apply algorithm
        QUIC['TrueBase'] = QUIC.BaseSim.apply(isSignal, args=(0.01, ))
        QUIC['TrueRemote'] = QUIC.RemoteSim.apply(isSignal, args=(0.01, ))

        FiltAvg = piecewiseImportSpod(startTime, endTime, freq, avgTime,
                                      thresh1, thresh2, smoothingWindow, QUIC,
                                      True)

        remoteTotal = ggplot(aes(x='dateTime', y='Remote'), data=QUIC) +\
            geom_line() +\
            theme_matplotlib(mpl.rc('font', **font), matplotlib_defaults=False)

        TrueVDetect = ggplot(aes(x='TrueBase', y='butterBaseSignal'), data=FiltAvg) +\
            geom_point(color = 'blue') +\
            geom_point(aes(x = 'TrueRemote', y='butterRemoteSignal'), color = 'blue') +\
            geom_point(aes(y='splineBaseSignal'), color='green') +\
            geom_point(aes(x = 'TrueRemote', y='splineRemoteSignal'), color='green') +\
            geom_abline(aes(intercept = 0, slope=1)) +\
            ylab('Detected Signal 5 min mean') +\
            xlab('True Signal 5 min mean')

        ggsave(plot=TrueVDetect,
               filename=figfolder + 'TrueVDetect.png',
               width=4.5,
               height=4)

        ButterCorrect = (len(FiltAvg[(FiltAvg.butterBaseSignal > 0.017) &
                                     (FiltAvg.TrueBase > 0.017)]) +
                         len(FiltAvg[(FiltAvg.butterRemoteSignal > 0.017) &
                                     (FiltAvg.TrueRemote > 0.017)])) / (
                                         2.0 * len(FiltAvg))
        print("Butter percent correct: " + str(ButterCorrect))

        SplineCorrect = (len(FiltAvg[(FiltAvg.splineBaseSignal > 0.017) &
                                     (FiltAvg.TrueBase > 0.017)]) +
                         len(FiltAvg[(FiltAvg.splineRemoteSignal > 0.017) &
                                     (FiltAvg.TrueRemote > 0.017)])) / (
                                         2.0 * len(FiltAvg))
        print("Spline percent correct: " + str(SplineCorrect))

        ButterFalsePos = (len(FiltAvg[(FiltAvg.butterBaseSignal > 0.017) &
                                      (FiltAvg.TrueBase < 0.017)]) +
                          len(FiltAvg[(FiltAvg.butterRemoteSignal > 0.017) &
                                      (FiltAvg.TrueRemote < 0.017)])) / (
                                          2.0 * len(FiltAvg))
        print("Butter percent false pos: " + str(ButterFalsePos))

        SplineFalsePos = (len(FiltAvg[(FiltAvg.splineBaseSignal > 0.017) &
                                      (FiltAvg.TrueBase < 0.017)]) +
                          len(FiltAvg[(FiltAvg.splineRemoteSignal > 0.017) &
                                      (FiltAvg.TrueRemote < 0.017)])) / (
                                          2.0 * len(FiltAvg))
        print("Spline percent false pos: " + str(SplineFalsePos))
Example #39
def get_precip_data():
    return pd.read_csv('precipitation.csv', parse_dates=[2])
Example #40
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

mid_path = Path(f"../mid/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
mid_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
print("start data loading")
# train = unpickle("../processed/v003/v003_098/train_compact_v003_098.pkl")
# test = unpickle("../processed/v003/v003_098/test_compact_v003_098.pkl")
train = unpickle(
    "../processed/v003/v003_104/train_compact_v003_104_compact.pkl")
test = unpickle("../processed/v003/v003_104/test_compact_v003_104_compact.pkl")

train_ = pd.read_csv("../input/train.csv")
train_id = train_.id
mol_name = train_.molecule_name
scalar_coupling_constant = train_.scalar_coupling_constant
scalar_coupling_contributions = pd.read_csv(
    f'../input/scalar_coupling_contributions.csv')
fc = scalar_coupling_contributions.fc
del train_
del scalar_coupling_contributions

# feat_train = unpickle("../processed/v003/atom_3J_substituents1_train_na.pkl")
# feat_test = unpickle("../processed/v003/atom_3J_substituents1_test_na.pkl")
# train = pd.concat([train, feat_train], axis=1)
# test  = pd.concat([test, feat_test], axis=1)
# assert len(train) == 4658147
# assert len(test)  == 2505542
Example #41
from datetime import datetime

import os

import pandas as pd
import tensorflow

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import Flatten, Dropout, Activation
import matplotlib.pyplot as plt
from tensorflow.keras.models import model_from_json

dfData = pd.read_csv('drive/My Drive/VoiceData/all_Data.csv')

dfData.head()

X = dfData.loc[:, dfData.columns != 'label']
y = dfData['label']

lb = preprocessing.LabelEncoder()
y = lb.fit_transform(y)

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

scaler.fit(X)

x = scaler.transform(X)
Example #42
import re

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde

def get_lng(x):
    
    lng = re.findall("\d+.\d+",x)[0]
    return lng
    
def get_lat(x):
    
    lat = re.findall("\d+.\d+",x)[1]
    return lat
    
    
if __name__ == "__main__":
    
    df = pd.read_csv("failure_report.csv")
    lng_ = df["经纬度"].apply(lambda x : get_lng(x))
    lat_ = df["经纬度"].apply(lambda x : get_lat(x))
    
    x = lng_.astype(np.float64).values
    y = lat_.astype(np.float64).values
    
    '''
    longitude range: 120.65961082469951 120.86540172553127
    latitude range: 31.24660722778891 31.42576791825509
    # scatter plot of the accident distribution
    plt.scatter(x,y)
    plt.xlim(120.65961082469951,120.86540172553127)
    plt.ylim(31.24660722778891,31.42576791825509)
    plt.show()
    '''
Example #43
import pandas as pd
from nilearn.datasets import fetch_haxby
from nilearn.input_data import NiftiMasker
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from kmapper import KeplerMapper, Cover
from dyneusr import DyNeuGraph
from dyneusr.tools import visualize_mapper_stages

# Fetch dataset, extract time-series from ventral temporal (VT) mask
dataset = fetch_haxby()
masker = NiftiMasker(
    dataset.mask_vt[0], 
    standardize=True, detrend=True, smoothing_fwhm=4.0,
    low_pass=0.09, high_pass=0.008, t_r=2.5,
    memory="nilearn_cache"
    )
X = masker.fit_transform(dataset.func[0])

# Encode labels as integers
df = pd.read_csv(dataset.session_target[0], sep=" ")
target, labels = pd.factorize(df.labels.values)
y = pd.DataFrame({l:(target==i).astype(int) for i,l in enumerate(labels)})

# Generate shape graph using KeplerMapper
mapper = KeplerMapper(verbose=1)
lens = mapper.fit_transform(X, projection=TSNE(2, random_state=1))
graph = mapper.map(
    lens, X=X, 
    cover=Cover(20, 0.5), 
    clusterer=DBSCAN(eps=20.)
    )

# Visualize the shape graph using DyNeuSR's DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)
dG.visualize('dyneusr4D_haxby_decoding.html', template='4D', static=True, show=True)   
Example #44
    counts.columns.name = 'month'
    
    return totals, counts


def main():
    data = get_precip_data()
    totals, counts = pivot_months_loops(data)
    totals.to_csv('totals.csv')
    counts.to_csv('counts.csv')
    np.savez('monthdata.npz', totals=totals.values, counts=counts.values)


if __name__ == '__main__':
    main()

totals = pd.read_csv('totals.csv').set_index(keys=['name'])
counts = pd.read_csv('counts.csv').set_index(keys=['name'])


# Recreating totals
data = get_precip_data()

totals2, counts2 = pivot_months_pandas(data)

print(totals2)
print(totals)

print(counts2)
print(counts)
Example #45
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from EmotionClassifer import EmotionClassifier
root_dir = "/users/imishra/workspace/EmotionDetection"
# Read the data
data_raw = pd.read_csv(root_dir+'/data/isear.csv', error_bad_lines=False, sep="|")
data = pd.DataFrame({'content': data_raw['SIT'], 'sentiment': data_raw['Field1']})

# Clean and transform the data
max_num_words = 4000
max_text_length = 1000
embed_dim = 128
lstm_units = 128
emotionClassifier = EmotionClassifier()
data['content'] = data['content'].apply(EmotionClassifier.clean_text)
data['sentiment_label'] = [emotionClassifier.emotions_labels_map[sentiment] for sentiment in data['sentiment']]

# Create and train the model
emotionClassifier.create_tokenizer(data['content'], max_num_words, max_text_length)
feature_vectors = emotionClassifier.map_features(data['content'])
labels = np.array(data['sentiment_label']).reshape(-1, 1)
emotionClassifier.fit_label_encoder(labels)
labels = emotionClassifier.encode_labels(labels)
emotionClassifier.create_model(embed_dim, lstm_units)
emotionClassifier.compile_model(loss_function='categorical_crossentropy', optimizer='rmsprop', metrics='accuracy')
X_train, X_valid, Y_train, Y_valid = train_test_split(feature_vectors, labels, test_size=0.2, random_state=42)
emotionClassifier.train(X_train, Y_train, X_valid, Y_valid, batch_size=128, epochs=30, verbose=2)
Example #46
import pandas as pd
import os
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import MultiLabelBinarizer

data = []
outputs_dir = os.path.join(os.getcwd(), "outputs")

for file in os.listdir(outputs_dir):
    df = pd.read_csv(os.path.join(outputs_dir, file))
    df = df.dropna()
    df = df.drop(["Frame Number", "time_relative"], axis=1)

    df['IMSI'] = df['IMSI'].astype(str)
    df['enb_ue_s1ap_id'] = df['enb_ue_s1ap_id'].astype(str)
    df['mme_ue_s1ap_id'] = df['mme_ue_s1ap_id'].astype(str)

    df = pd.concat(
        [df.drop('protocols', 1), df['protocols'].str.get_dummies(sep="|")], 1)
    df = pd.concat(
        [df.drop('cellidentity', 1), df['cellidentity'].str.get_dummies()], 1)
    df = pd.concat(
        [df.drop('enb_ue_s1ap_id', 1), df['enb_ue_s1ap_id'].str.get_dummies()],
        1)
    df = pd.concat(
        [df.drop('mme_ue_s1ap_id', 1), df['mme_ue_s1ap_id'].str.get_dummies()],
        1)
Пример #47
0
#####################################################
# Initial Set Up

# for dash and plotting capabilities
import dash
import dash_core_components as dcc  # for accessing interactive data visualization with plotly.js
import dash_html_components as html  # for accessing html elements h1 h2
import plotly.graph_objs as go  # for designing the choropleth map

# for reading in data
import pandas as pd
import json

# read in csv file for data analysis
df = pd.read_csv('../data_set/M_Landings_cleaned.csv')
print(df.head(10))

# Read in geojson data
with open('../data_set/coordinates.json', 'r') as json_data:
    df_coordinates = json.load(json_data)
print(type(df_coordinates))
# print(df_coordinates['features'][:])
# mapbox token for mapping choropleth map
mapbox_accesstoken = 'pk.eyJ1IjoiY3JhaWdtYXJpYW5pIiwiYSI6ImNrNTMyM2l4MDA0NHMzbHF2NTI0aHdoMzQifQ.l4cSBnBuWaV49cs1XF4MoA'

##################################################################
# Create plotly figure

# for names in our bar chart
meteors = df['name'].str.title().tolist()
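# The snippet is truncated here. A minimal sketch of how the bar chart might
# be built from `meteors` with plotly (the 'mass (g)' column name is an
# assumption about the cleaned dataset, not taken from the original):
bar_fig = go.Figure(
    data=[go.Bar(x=meteors, y=df['mass (g)'])],
    layout=go.Layout(title='Meteorite landings by mass'),
)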
Пример #48
0
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx.common.data_types import Int64TensorType

titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)
X = data.drop('survived', axis=1)
y = data['survived']
print(data.dtypes)

# SimpleImputer on string is not available for
# string in ONNX-ML specifications.
# So we do it beforehand.
for cat in ['embarked', 'sex', 'pclass']:
    X[cat].fillna('missing', inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
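    # The original snippet is truncated at this point. What follows is a
    # hedged sketch of how such a preprocessing + ONNX-conversion pipeline
    # typically continues (a sketch, not the author's exact code):
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs'))])

# Cast the categorical columns to str so they match the string inputs
# declared for the ONNX model below.
for cat in categorical_features:
    X_train[cat] = X_train[cat].astype(str)
    X_test[cat] = X_test[cat].astype(str)

used_columns = numeric_features + categorical_features
clf.fit(X_train[used_columns], y_train)
print('validation accuracy:', clf.score(X_test[used_columns], y_test))

# Declare one named input per column and convert the fitted pipeline to ONNX.
initial_types = [(c, FloatTensorType([None, 1])) for c in numeric_features] + \
                [(c, StringTensorType([None, 1])) for c in categorical_features]
onx = convert_sklearn(clf, 'pipeline_titanic', initial_types)
with open('pipeline_titanic.onnx', 'wb') as f:
    f.write(onx.SerializeToString())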
Пример #49
0
import pandas as pd
import sys
import numpy as np
import gc

def mag(df):
    return np.linalg.norm(df[['x','y','z']])

def is_active(res):
    return res['mag_diff']['var'] > 1e-07

f = sys.argv[1]
df_o = pd.read_csv(f)

df_o.set_index(pd.to_datetime(df_o['timestamp']), inplace=True)
df_o.sort_values(by='timestamp', inplace=True)

dfg = df_o.groupby(pd.Grouper(freq='D'))
not_wearing_times = []
for df in dfg:
    df = df[1]
    df = df.rolling('480s').mean()
    df.dropna(inplace=True)
    df.columns = ['x', 'y', 'z']
    df = df.resample('480s').mean()
    try:
        df['mag'] = df.apply(mag, axis=1)
        df['mag_diff'] = df['mag'].diff()
        df.dropna(inplace=True)
        res = df.groupby(pd.Grouper(freq='24Min')).agg(['var'])
        res['wearing'] = res.apply(is_active, axis=1)
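        # The snippet is truncated here; a hedged sketch of how the
        # non-wearing periods might be collected (an assumption, not the
        # author's exact code):
        not_wearing = res[res['mag_diff']['var'] <= 1e-07]
        not_wearing_times.extend(not_wearing.index.tolist())
    except Exception as e:
        print('skipping day:', e)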
Пример #50
0

from math import asin, cos, radians, sin, sqrt

import pandas as pd


def cal_dist(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    distance = 6378.137 * c
    return distance


food = '/Users/molly/Documents/NUS/2ndSemester/Projects/CS5224/Cents_trip/dataset/food.csv'
airbnb = '/Users/molly/Documents/NUS/2ndSemester/Projects/CS5224/Cents_trip/dataset/airbnb.csv'

food_df = pd.read_csv(food)
airbnb_df = pd.read_csv(airbnb)

food_data = food_df.iloc[:, [0, 6, 7]]
airbnb_data = airbnb_df.iloc[:, [0, 2, 3]]
foodid = food_data['FOODID'].values
#print(type(foodid[0]))
lat_food = food_data['LATITUDE'].values
lng_food = food_data['LONGITUDE'].values

roomid = airbnb_data['ROOMID'].values
#print(type(roomid[0]))
lat_airbnb = airbnb_data['LATITUDE'].values
lng_airbnb = airbnb_data['LONGITUDE'].values

distances = []
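# The snippet is truncated here; a hedged sketch of how the pairwise
# room-to-restaurant distances might be collected with cal_dist (the output
# format is an assumption, not the author's exact code):
for rid, room_lat, room_lng in zip(roomid, lat_airbnb, lng_airbnb):
    for fid, f_lat, f_lng in zip(foodid, lat_food, lng_food):
        distances.append((rid, fid, cal_dist(room_lng, room_lat, f_lng, f_lat)))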
Пример #51
0
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import pysal as ps

#world = gpd.read_file(gpd.datasets.get_path())

us_income = pd.read_csv(ps.examples.get_path('usjoin.csv'))
print(us_income)

us_income_shape = gpd.read_file(ps.examples.get_path('us48.shx'))

#us_income_shape.plot()
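# A hedged sketch of where this example appears to be heading: join the income
# table onto the state shapes and draw a simple choropleth. The 'STATE_NAME'
# and 'Name' join keys and the '2009' column are assumptions about the two
# pysal example datasets, not verified here:
merged = us_income_shape.merge(us_income, left_on='STATE_NAME', right_on='Name')
merged.plot(column='2009', legend=True)
plt.show()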
Пример #52
0
                print('Exception: ', ex)
                print('Total API Calls: ', count)
                break


# Get 2000 township census data
# st_cnty_fips_00 = pd.read_csv('/Users/salma/Studies/Research/Criminal_Justice/research_projects/main_census_merge/data/wip_merge_files/st_cnty_fips_2000.csv')
# get_census_data_from_api('https://api.census.gov/data/2000/sf1', st_cnty_fips_00, 'new_census_townships_00_initial') # 3141 calls

# Get 2010 township census data
# st_cnty_fips_10 = pd.read_csv('/Users/salma/Studies/Research/Criminal_Justice/research_projects/main_census_merge/data/wip_merge_files/st_cnty_fips_10_temp.csv')
# get_census_data_from_api('https://api.census.gov/data/2010/dec/sf1', st_cnty_fips_10, 'new_census_townships_10_initial_16th', 2010)

# Get 2010 township census data
fips_90 = pd.read_csv(
    '/Users/salma/Studies/Research/Criminal_Justice/research_projects/US_Crime_Analytics/data/wip_merge_files/st_cnty_fips_1990.csv'
)
#st_cnty_fips_90 = fips_90[fips_90['county'].isnull()]
#get_census_data_from_api('https://api.census.gov/data/1990/sf1', fips_90, 'new_census_for_tships_90_total_pop', 1990)
"""
    16 files for 2010 census due to the limitations on # of API calls per hour.
    Hence need to iterate over the files in township_10 folder and concatenate all to the 1st file
"""


def create_final_twnshp_file(twnshp_dir, first_file):
    # Read the initial file
    twnshp_1st_file_df = pd.read_csv(first_file)

    # Change to the twnshp cen dir
    os.chdir(twnshp_dir)
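    # The function is truncated in the original. A hedged sketch of how the
    # remaining files might be appended onto the first one (the file layout
    # is an assumption):
    for fname in sorted(os.listdir(twnshp_dir)):
        if fname.endswith('.csv') and fname != os.path.basename(first_file):
            twnshp_1st_file_df = pd.concat(
                [twnshp_1st_file_df, pd.read_csv(fname)], ignore_index=True)
    return twnshp_1st_file_df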
Пример #53
0
import pandas as pd 
import matplotlib.pyplot as plt

dataset = pd.read_csv('/Users/sledro/Desktop/LondonCrime/Datasets with access/London-street.csv')

#Drop columns indexed 0,2,3,7,8,11 as they hold no useful data
dataset.drop(dataset.columns[[0,2,3,7,8,11]], axis=1, inplace=True)

#Drop NaN's
NaNsRemovedAndColsDropped = dataset.dropna(axis=0, how='any')

#Print first 5 rows
###print(NaNsRemovedAndColsDropped.head())

#Add data frame to json file to allow Firebase upload
#NaNsRemovedAndColsDropped.to_csv('/Users/sledro/Desktop/LondonCrime/Datasets with access/Cleaned.csv')

# https://github.com/firebase/firebase-import
# firebase-import --database_url https://londoncrimepredictor.firebaseio.com/ --path / --json Cleaned.json

#res = pd.read_json('/Users/sledro/Desktop/LondonCrime/Datasets with access/Cleaned.json', orient='records')

#print(res.head())
colors = ['#105B63', '#FFFAD5','#FFD34E','#DB9E36','#BD4932']
plot1 = NaNsRemovedAndColsDropped.groupby('Month').size().reset_index(name='number of outcomes').set_index('Month')
plot1
plot1.plot(kind="line",figsize=(20,10), linestyle='--', marker='o',color=colors)
plt.show()
Пример #54
0
import matplotlib.pyplot as plt
import os
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np


configs_folder = r"/datadrive/configs"
all_exps = os.listdir(configs_folder)

for exp in tqdm(all_exps):
    if os.path.exists(os.path.join(configs_folder, exp, 'training')): #and not os.path.exists(os.path.join(configs_folder, exp, 'progress_graph.png')):
        try:    
            df = pd.read_csv(os.path.join(configs_folder, exp, 'training'))
            if any(np.isinf(df).all()):
                print(f'found a column that is all infs in {exp}')
            elif not all(np.isfinite(df).all()):
                print(f'found a column with some non-finite values in {exp}')
            fig, axs = plt.subplots(2,1, sharex=True, figsize=(15,10))
            x = list(range(len(df)))
            axs[0].plot(x, df['loss'], label='Training loss')
            axs[0].plot(x, df['val_loss'], label='Validation loss')
            axs[0].legend(prop={'size': 14})
            axs[0].tick_params(axis="x", labelsize=12)
            axs[0].tick_params(axis="y", labelsize=12)
            additional_metric = ''
            if 'dice_coefficient' in df:
                additional_metric = 'dice_coefficient'
            elif 'vod_coefficient' in df:
                additional_metric = 'vod_coefficient'
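            # The snippet is truncated here; a hedged sketch of how the second
            # subplot and the saved figure might be produced (the file name is
            # taken from the commented-out check above, the rest is an
            # assumption, not the author's exact code):
            if additional_metric:
                axs[1].plot(x, df[additional_metric], label='Training ' + additional_metric)
                if 'val_' + additional_metric in df:
                    axs[1].plot(x, df['val_' + additional_metric], label='Validation ' + additional_metric)
                axs[1].legend(prop={'size': 14})
            fig.savefig(os.path.join(configs_folder, exp, 'progress_graph.png'))
            plt.close(fig)
        except Exception as e:
            print(f'failed on {exp}: {e}')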
Пример #55
0
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.normalization import BatchNormalization as bn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import os

try:
    df = pd.read_csv(
        "/home/rishabh/Desktop/DeepLearning/keras/Nucleus/dataset_sin.csv")

    temp = df.values
    temp = temp.astype(float)
    scaler = MinMaxScaler(feature_range=(0, 1))
    temp = scaler.fit_transform(temp)
    df = pd.DataFrame(temp)

    def _load_data(data, n_prev=100):
        docX = []
        docY = []
        for i in range(len(data) - n_prev):
            docX.append(data.iloc[i:i + n_prev].values)
            docY.append(data.iloc[i + n_prev].values)
        alsX = np.array(docX)
        alsY = np.array(docY)
        return alsX, alsY

    n = 10
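    # The snippet is truncated here; a hedged sketch of the usual continuation
    # of this sin-wave example -- split the data, build a small LSTM and fit it
    # (the split ratio and layer sizes are assumptions, not the author's code):
    ntrn = int(len(df) * 0.9)
    X_train, y_train = _load_data(df.iloc[:ntrn], n_prev=n)
    X_test, y_test = _load_data(df.iloc[ntrn:], n_prev=n)

    model = Sequential()
    model.add(LSTM(50, input_shape=(n, X_train.shape[2])))
    model.add(Dense(X_train.shape[2]))
    model.add(Activation('linear'))
    model.compile(loss='mean_squared_error', optimizer='rmsprop')
    model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test))
except Exception as e:
    print('failed to train on the sin dataset:', e)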
Пример #56
0
import codecs
import math

import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier


# hidden layer
rnn_unit = 128
# feature
input_size = 40
output_size = 1
lr = 0.0006
k=4
# csv_file = 'stock3005.csv'
csv_file = 'fof基金20170731-1031.csv'
f = open(csv_file, 'r', encoding=u'utf-8', errors='ignore')
df = pd.read_csv(f)
df.dropna(inplace=True)

def addLayer(inputData, inSize, outSize, activity_function=None):
    Weights = tf.Variable(tf.random_normal([inSize, outSize]))
    basis = tf.Variable(tf.zeros([1, outSize]) + 0.1)
    weights_plus_b = tf.matmul(inputData, Weights) + basis
    if activity_function is None:
        ans = weights_plus_b
    else:
        ans = activity_function(weights_plus_b)
    return ans

x_data = preprocessing.minmax_scale(df.iloc[:, 3:43].values,feature_range=(-1,1))
y_data = preprocessing.minmax_scale(df.iloc[:, 43:44].values,feature_range=(-1,1))
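# The snippet is truncated here; a hedged sketch of how addLayer might be used
# to assemble a small TF1-style regression network on x_data/y_data (the layer
# sizes and training loop are assumptions, not the author's code):
xs = tf.placeholder(tf.float32, [None, input_size])
ys = tf.placeholder(tf.float32, [None, output_size])
hidden = addLayer(xs, input_size, rnn_unit, activity_function=tf.nn.tanh)
prediction = addLayer(hidden, rnn_unit, output_size)
loss = tf.reduce_mean(tf.square(ys - prediction))
train_step = tf.train.GradientDescentOptimizer(lr).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
        sess.run(train_step, feed_dict={xs: x_data, ys: y_data})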
Пример #57
0
def nsw74psid_a(path):
    """A Subset of the nsw74psid1 Data Set

  The `nsw74psidA` data frame has 252 rows and 10 columns. See
  `nsw74psid1` for more information.

  This data frame contains the following columns:

  trt
      a numeric vector

  age
      a numeric vector

  educ
      a numeric vector

  black
      a numeric vector

  hisp
      a numeric vector

  marr
      a numeric vector

  nodeg
      a numeric vector

  re74
      a numeric vector

  re75
      a numeric vector

  re78
      a numeric vector

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `nsw74psid_a.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 252 rows and 10 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'nsw74psid_a.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/DAAG/nsw74psidA.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='nsw74psid_a.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
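# A minimal usage sketch (the path is only an example; `os` and
# maybe_download_and_extract are assumed to be available at module level):
x_train, metadata = nsw74psid_a('~/data')
print(x_train.shape)        # expected: (252, 10)
print(metadata['columns'])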
Пример #58
0
import numpy as np
import pandas as pd
import os

print("Imports are ready")

########################################################
### combine shipment_id, phone_id, user_id, order_id ###
########################################################

# First, load the files containing information about shipments
shipments1 = pd.read_csv(
    "./ngwl-predict-customer-churn/shipments/shipments2020-03-01.csv")
shipments2 = pd.read_csv(
    "./ngwl-predict-customer-churn/shipments/shipments2020-01-01.csv")
shipments3 = pd.read_csv(
    "./ngwl-predict-customer-churn/shipments/shipments2020-04-30.csv")
shipments4 = pd.read_csv(
    "./ngwl-predict-customer-churn/shipments/shipments2020-06-29.csv")

# Put all shipments into one table
shipments = pd.concat([shipments1, shipments2, shipments3, shipments4])

# Read addresses and fix the column names
addresses = pd.read_csv("./ngwl-predict-customer-churn/misc/addresses.csv")
addresses.columns = ["ship_address_id", "phone_id"]

# Now join the shipments with the addresses on ship_address_id to obtain the phone_id
shipments_and_addresses = pd.merge(addresses, shipments, on="ship_address_id")

# We will take the phone id, user id, shipment id, order id, order state from here
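# The snippet is truncated here; a hedged sketch of the column selection the
# comment above describes (the exact column names in the shipment files are
# assumptions):
shipments_and_addresses = shipments_and_addresses[
    ['phone_id', 'user_id', 'shipment_id', 'order_id', 'order_state']]
print(shipments_and_addresses.head())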
Пример #59
0
import os
import re
import sys
import pandas as pd

df = pd.read_csv("../files/accumulo/train_data3.csv")

buggy = df.loc[df['buggy'] == 1]
clean = df.loc[df['buggy'] == 0]

print(buggy.shape)
print(clean.shape)

'''
df = pd.read_csv("../files/accumulo/train_data_test.csv")

print(df.columns.values)

df['vector'] = df['vector'].apply(lambda v : v.replace('\n','').split(' '))
df['vector'] = df['vector'].apply(lambda v : [float(i) for i in v])

buggy_vectors = df.loc[df['buggy']==1]
fixed_vectors = df.loc[df['fixed']==1]
Пример #60
0
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2, f_regression, mutual_info_regression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score
import tpot

train = pd.read_csv('mercedes_train.csv')
y_train = train['y'].values
train.drop(['ID', 'y'], axis=1, inplace=True)
train = pd.get_dummies(train, drop_first=True)
train = train.values

config_dict = {
    'sklearn.linear_model.ElasticNet': {
        'l1_ratio': np.arange(0.05, 1.01, 0.05),
        'alpha': np.linspace(0.001, 10.0, 100),
        'normalize': [True, False]
    },

    # 'sklearn.ensemble.ExtraTreesRegressor': {
    #     'n_estimators': range(50,501,50),
    #     'max_features': np.arange(0.05, 1.01, 0.05),
    #     'min_samples_split': range(2, 21),
    #     'min_samples_leaf': range(1, 21),
    #     'bootstrap': [True, False]
    # },

    # 'sklearn.ensemble.GradientBoostingRegressor': {
    #     'n_estimators': range(75,251,25),
    #     'loss': ["ls", "lad", "huber", "quantile"],