Example #1
    def classify_vote(self, prompt, legislator):
        # retrieve the ID of the legislator
        legislators = pd.read_csv('./house_table.csv')
        matches = legislators[legislators['name'] == legislator]

        # if the legislator doesn't exist, fail early
        if matches.empty:
            raise ValueError('Unknown legislator: ' + legislator)

        leg_id = matches.iloc[0]['id']

        # get a set of bills which the legislator voted on
        vote_history = pd.read_csv('./data/vote_table.csv')
        bill_df = vote_history.loc[vote_history['person'] == leg_id]

        # load the embeddings of all the bills the legislator voted on
        yeas = set()
        nays = set()
        for index, row in bill_df.iterrows():
            # add embeddings to the right set
            tensor = torch.load('data/tensors/' + row['bill'])
            if row['vote'] == 1:
                yeas.add(tensor)
            else:
                nays.add(tensor)

        # use embedder to turn the prompt into embeddings
        prompt_embed = self.embedder.get_embeddings(prompt, willSave=False)
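The snippet ends before the prompt embedding is actually compared against the two vote sets. Below is a purely illustrative sketch of one way the classification could finish, assuming each stored embedding is a 1-D torch tensor; the helper name and the mean-cosine-similarity rule are assumptions, not taken from the original project.

import torch

def classify_by_similarity(prompt_embed, yeas, nays):
    """Label the prompt 'yea' or 'nay' by mean cosine similarity to each vote set."""
    def mean_sim(tensors):
        # hypothetical helper: average cosine similarity between the prompt and a set
        if not tensors:
            return float('-inf')
        sims = [torch.nn.functional.cosine_similarity(prompt_embed, t, dim=0)
                for t in tensors]
        return torch.stack(sims).mean().item()

    return 'yea' if mean_sim(yeas) > mean_sim(nays) else 'nay'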
Example #2
def load_data(path):
    if path is None:
        path = opt.data_path
    if not os.path.isfile(path):
        data = get_info("all")
        data.to_csv(path)
    return pd.read_csv(path)
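A quick usage sketch of the caching behaviour above (the explicit path is made up for illustration, and opt.data_path is assumed to point somewhere writable): the first call builds the CSV via get_info, later calls simply read it back.

df = load_data(None)                           # falls back to opt.data_path
df_cached = load_data('data/info_cache.csv')   # created on the first call, re-read afterwards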
Example #3
    def __init__(self, file_path, **kwargs):

        self.file_path = file_path
        self.kwarguments = {'sep': '\t', 'encoding': 'cp1252'}
        self.kwarguments.update(kwargs)

        self.df = pd.read_csv(file_path, **self.kwarguments)
Example #4
 def get_stop_words(self):
     print('   -> Getting stop word list...')
     file = 'stopwords_list.csv'
     stop_words_list = []
     if os.path.isfile(self.data_path + file):
         print('     -> Stop Words File is found')
         # dm = DataManager()
         df = pd.read_csv(self.data_path + file, encoding='utf-8')
         stop_words_list = df['Stopwords'].tolist()
     else:
         print('     -> Stop Words File is not found')
     return stop_words_list
Example #5
    def pre_prosseccing(self):
        # dm = DataManager()
        data = pd.read_csv(self.data_path + self.data_file_name + '.csv',
                           encoding='utf-8')
        data = self.get_requirements_from_document(data)
        # description_reset = data.dropna(axis=0).reset_index(drop=True)
        description = data[self.factor]
        description_reset = description.dropna(axis=0).reset_index(drop=True)

        description = [sent.replace('\n', ' ') for sent in description_reset]

        with open(self.data_path + self.data_file_name + '_tm.documents',
                  'wb') as f:
            pickle.dump(description, f)
        # # Get the posting_id values from the edited job_title data
        # posting_ids = data['posting_id']
        # posting_list = posting_ids.to_list()
        #
        # # Build the description_data set according to posting_id
        # des_data = [data['job_description'][id] for id in posting_ids]
        # title_data = [data['job_title'][id] for id in posting_ids]
        # id_list = [i for i in range(len(posting_list))]
        # df = pd.DataFrame({'id': posting_list, 'job_title': title_data, 'job_description': des_data, 'posting_id':posting_list})
        # df.to_csv('data/doc2vec_test_data/0702/merge_0629_adj.csv', mode='w', encoding='utf-8')

        # Load the edited description set and run preprocessing
        # data = dm.load_csv(file='data/doc2vec_test_data/0702/merge_0629_adj.csv', encoding='utf-8')
        sentences = self.data_text_cleansing(data)

        data_words = list(self.sent_to_words(sentences))

        data_words_nostops = self.remove_stopwords(data_words)

        bigram = self.make_ngram(data_words_nostops, n=2)
        data_lemmatized = self.lematization(bigram)

        # bigram = self.make_bigram(data_words_nostops)
        # data_lemmatized = self.lematization(bigram)
        # for i in range(len(bigram)):
        #     print(f'[{i}] : {bigram[i]}')

        data_lemmatized_filter = self.word_filtering(data_lemmatized)
        # data_lemmatized_filter = data_lemmatized
        for i in range(len(data_lemmatized_filter)):
            print(f'[{i}] : {data_lemmatized_filter[i]}')
        # uniquewords = self.make_unique_words(data_lemmatized)
        with open(self.data_path + self.data_file_name + '.corpus', 'wb') as f:
            pickle.dump(data_lemmatized_filter, f)

        self.get_word_count(data_lemmatized_filter)

        print('=== end preprocessing ===')
        return data['id'], data_lemmatized_filter
Example #6
def load_and_plot_data(filename):
    """Load a data frame and plot each column.

    Args:
      filename (str): Path to a CSV file of data.

    Returns:
      pandas.DataFrame
    """
    df = pd.read_csv(filename, index_col=0)
    df.hist()
    return df
Example #7
 def get_including_words(self):
     print('    -> Getting including word list...')
     file = 'including_words_list.csv'
     including_words_list = []
     if os.path.isfile(self.data_path + file):
         print('     -> Including Words File is found')
         # dm = DataManager()
         df = pd.read_csv(self.data_path + file, encoding='utf-8')
         including_words_list = df['Includingwords'].tolist()
     else:
         print('     -> Including Words File is not found')
     print(including_words_list)
     return including_words_list
Example #8
def main():
    # load iris data
    iris_df = pd.read_csv('../datasets/iris.csv', header=None).drop(0, axis=1)
    iris_train_X = iris_df.iloc[0:100, [0, 2]].values
    iris_train_target = iris_df.iloc[0:100, 4].values
    iris_train_y = np.where(iris_train_target == 'setosa', -1, 1)

    iris_test_X = iris_df.iloc[50:, [0, 2]].values
    iris_test_target = iris_df.iloc[50:, 4].values
    iris_test_y = np.where(iris_test_target == 'setosa', -1, 1)
    
    pnn = Perceptron(0.01, 10)
    print("Training perceptron with examples... ")
    print("Examples shape " + iris_train_X.shape + " " + iris_train_y.shape + "\n")
    pnn.fit(iris_train_X, iris_train_y)
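The example stops right after fitting. A minimal follow-up sketch, assuming the custom Perceptron class also exposes a predict() method (not shown in the original), to check accuracy on the held-out rows inside main():

    # hypothetical continuation: evaluate on the test split
    predictions = pnn.predict(iris_test_X)
    accuracy = (predictions == iris_test_y).mean()
    print(f"Test accuracy: {accuracy:.2f}")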
Example #9
def extractData(csv_path, json_path=''):
    # extract data from csv file
    df = pd.read_csv(csv_path)
    row, column = df.shape
    dict4json = dict()

    # # Open this part to get information for info.py
    # # original data
    # sex = df.sex
    # age = df.age_approx
    # location = df.anatom_site_general_challenge
    # label = df.target

    # # convert to our form
    # sex = set(sex)
    # age = age.unique().astype(np.int32)
    # location = set(location)
    # label = set(label)

    sex = base.sex()
    age = base.age()
    location = base.location()

    for idx in range(row):
        line = df.iloc[idx]
        name = line.image_name
        if line.sex not in sex:
            line.sex = 'Not sure'
        this_sex = int(np.argwhere(line.sex))
        this_age = int(np.argwhere(line.age_approx.astype(np.int32)))
        if line.location not in location:
            line.location = 'empty'
        this_location = int(np.argwhere(line.location))
        label = int(line.target)  # 'target' holds the label, per the original-data listing above
        dict4json[name] = {
            'sex': this_sex,
            'age': this_age,
            'location': this_location,
            'label': label
        }

    # # Open this part if need to dump the label to json file
    if json_path:
        import json
        with open(json_path, 'w') as f:
            json.dump(dict4json, f)

    return dict4json
Example #10
def get_synthetic_data(data_path, batch_size=64):
    """ Synthetic distribution data
    Args:
        data_path (str): path to the CSV file of synthetic data
        batch_size (int): batch size for both loaders
    Returns:
        (torch.utils.data.DataLoader): train loader 
        (torch.utils.data.DataLoader): test loader
    """

    training = pd.read_csv(data_path).values

    # pick your indices for sample 1 and sample 2:
    s1 = np.random.choice(range(training.shape[0]),
                          int(0.9 * training.shape[0]),
                          replace=False)
    s2 = list(set(range(training.shape[0])) - set(s1))
    # extract your samples:
    train_data = torch.as_tensor(training[s1, :-1])
    y_train_data = torch.as_tensor(training[s1, -1])
    test_data = torch.as_tensor(training[s2, :-1])
    y_test_data = torch.as_tensor(training[s2, -1])

    # one hot
    n = 11
    y_train_data_one_hot = torch.nn.functional.one_hot(
        y_train_data.to(torch.int64), n)
    y_test_data_one_hot = torch.nn.functional.one_hot(
        y_test_data.to(torch.int64), n)

    # create dataset and dataloaders
    train_dataset = torch.utils.data.TensorDataset(train_data,
                                                   y_train_data_one_hot)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)

    test_dataset = torch.utils.data.TensorDataset(test_data,
                                                  y_test_data_one_hot)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, test_loader
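A short usage sketch, assuming a CSV whose last column holds integer class labels in the range 0-10 (the filename here is made up):

train_loader, test_loader = get_synthetic_data('synthetic_data.csv', batch_size=64)
for features, one_hot_targets in train_loader:
    print(features.shape, one_hot_targets.shape)  # (batch, n_features) and (batch, 11)
    break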
Example #11
    def get_including_words(self):
        print('    -> Getting including word list...')
        # dm = DataManager()
        # tfidf_file = self.data_file_name+'_tf_idf.csv'
        # tf_idf_df = dm.load_csv(file=self.data_path+tfidf_file, encoding='utf-8')
        # tf_idf_sum = tf_idf_df.iloc[-1:]
        # tf_idf_sum = tf_idf_sum.transpose()
        # print(tf_idf_sum)
        # print(tf_idf_sum.iloc[1])

        file = self.data_file_name + '_includings_list.csv'
        print(file)
        including_words_list = []
        if os.path.isfile(self.data_path + file):
            print('     -> Including Words File is found')

            df = pd.read_csv(self.data_path + file, encoding='utf-8')
            including_words_list = df['Includingwords'].tolist()
        else:
            print('     -> Including Words File is not found')
        print(including_words_list)
        return including_words_list
Example #12
def main():
	org_df = pd.load_original_movies_df()
	df = pd.read_csv(DATA_DIR + 'ranking.csv')
	
	movie_features = org_df.columns
	reviewer_features = [column for column in df.columns if column not in movie_features]
	
	# for model training I want to find the most frequent reviewer and then try and
	# predict his movie likes/dislikes. Then using the parameters I will have a model
	# for a new reviewer
	# df[df.isnull().sum() < 500].columns
	
	# get the most frequent reviewer
	reviewer = None
	most_reviews = 0
	for column in reviewer_features:
		num_reviews = df[column].count()  # number of movies this reviewer actually rated
		if num_reviews > most_reviews:
			most_reviews = num_reviews
			reviewer = column
	
	# drop all movies that they didn't review
	df.dropna(axis=0, subset=[reviewer,], inplace=True)
Example #13
def evaluate_file(filename):
    df = pd.read_csv(filename)
    evaluate(df)
Example #14
import numpy as np
import pandas as pd
import var
import collections
import math
import time
import os
import sys
from sklearn import preprocessing

df = pd.read_csv('Train.csv')

######################### To Do ####################
# Scale Data
# Split data into different categories of features, 2 pandas
# Identify all features into different cases


def discreteFeatures(dictionary, df, title):
    # add indicator columns: 1 where df[title] equals the key, else 0
    for key, val in dictionary.items():
        col = title + str(val)
        df.loc[:, col] = np.where(df.loc[:, title] == key, 1, 0)
    return df

def scaleFeatures(df):
    scaler = preprocessing.StandardScaler()
    scaler.fit(df)
    return scaler
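A usage sketch of scaleFeatures (the train/test split below is hypothetical and assumes the selected columns are numeric): the scaler is fit on the training portion only and then applied to both parts with the same statistics.

train_df = df.sample(frac=0.8, random_state=0)
test_df = df.drop(train_df.index)

scaler = scaleFeatures(train_df)      # fit on training data only
X_train = scaler.transform(train_df)  # scaled training features
X_test = scaler.transform(test_df)    # test features scaled with the training statistics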

Example #15
import pandas as pd
import csv
import numpy as np


data = pd.read_csv('feeds.csv')

# for cleaning NaN values / 0 values
university_towns = []
with open('Datasets/university_towns.txt') as file:
    for line in file:
        if 'NaN' in line:
            # Remember this `state` until the next is found
            state = line
        else:
            # Otherwise, we have a city; keep `state` as last-seen
            university_towns.append((state, line))

# replacing NaN
frame = frame.replace({np.nan: 0.00})
Example #16
# coding: utf-8
import pandas as pd

ces = pd.read_csv('results/meds-ces.csv')
extant_ciel = pd.read_csv('input/ciel-in-concepts-dict.csv')

# keep only non-voided CIEL concepts from the dictionary export
extant_ciel = extant_ciel[extant_ciel["voided"] == 0]

# keep only CES rows whose concept is a CIEL reference
ces = ces[~ces["concept"].isna()]
ces = ces[ces.concept.str.startswith("CIEL")]

# CES concepts whose CIEL ID is missing from the dictionary
ces_not_in_dict = ces[~ces.concept.str.replace('CIEL:', '').isin(extant_ciel["ID"])]
Example #17
import numpy as np
import pandas as pd

search_radius = 100

data = pd.read_csv('sh2.MP4.csv')

num_frames = np.max(data['Frame'].values) + 1

track_counter = 0
current_tracks = {}
completed_tracks = {}

for k, row in data[data['Frame'] == 0].iterrows():
    key = 'track{:05d}'.format(track_counter)
    track_counter += 1
    current_tracks[key] = [
        (0, (row['Column'], row['Row'])),
    ]

for time_step in range(1, num_frames):
    num_active = len(current_tracks.keys())
    now = np.zeros((num_active, 2))
    prior = np.zeros((num_active, 2))
    key_list = current_tracks.keys()
    for index, key in enumerate(key_list):
        now[index, :] = current_tracks[key][-1][1]
        if len(current_tracks[key]) > 1:
            prior[index, :] = current_tracks[key][-2][1]
        else:
            prior[index, :] = now[index, :]
Example #18
# Calculate the rolling of 21 days volatility of any Darwin
import pandas as pd
df = pd.read_csv('LSV___XXX___.csv', index_col=0)
df.index = pd.to_datetime(df.index, unit='ms')
volatility = df['quote'].resample('D').last().rolling(21).std().dropna()
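As a purely illustrative follow-up (not part of the original snippet), the resulting series can be plotted to eyeball the volatility regime; matplotlib is assumed to be available.

import matplotlib.pyplot as plt

volatility.plot(title='21-day rolling volatility of the daily closing quote')
plt.show()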
Example #19
def divvy_crime_data(grid, data_fname = DATA_SAMPLE_FILENAME):
    return pd.read_csv(data_fname)
Example #20
#coding: UTF-8
import pandas as pd
import numpy as np

files = input().split()

for file in files:
    df = pd.read_csv(file)
Example #21
    test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()


#REGRESSION WITH BOSTON HOUSE DATASET

#first import and load dataset
boston = pd.read_csv('boston.csv')

#display the first few rows of the data
print(boston.head())

#now create feature and Target
X = boston.drop('MEDV',axis=1).values
y = boston['MEDV'].values

#predict house price from a single feature: column no. 5 (number of rooms)
X_rooms = X[:,5]

#you can check the type of the data
type(X_rooms), type(y) #numpy arrays

#reshape
Example #22
1. whole_value (which is the array of the whole word embedding) so I can convert it to a matrix
2. Change the GRU's loading direction so it suits Persian
"""

import tensorflow as tf
import pandas as pd
import numpy as np

# ─── 0 INPUT PRE-PROCESSING ─────────────────────────────────────────────────────────
"""word embedding algorithm"""
#i will be using twitter hashtags

# __ VARIABLE __
E = [] #vector E, consists of e, which are computed one-hots times L, L is the sentence?
#load data
data = pd.load_csv("twitter_hashtag.csv")
#load each row of sentence
for row in data.row:
    #parse the tweet, have the number of words in it
    #shape the sentence, word embeddings?!
    
    # compute one-hot vector for each word in the sentence
    for word in data['words']: #words column consists of the sentence word in a list!
        data['one-hot'] = word.compute_one_hot()
        E.append(data['one-hot'] * L) #what's L? how to multiply it?

    #processing the aspect and taking into account Va (embedding of aspect's vector)
    if len(data['aspect']) == 1:
        #take its e, as Va
        V_a = E[data['aspect_loc']]
    else: #if aspect is more than one word
Example #23
 def data_from_csv(self, filepath):
     """ load the dataframe using pandas lib """
     self.dataframe = pd.read_csv(filepath)
Example #24
 def load_csv(self, fpath, *args, **kwargs):
     df = pd.read_csv(fpath, *args, **kwargs)
     return df
Example #25
 def read_elo_file(self):
     return pd.read_csv(self.file_path + self.file_name)
Example #26
 def _load_data_and_answers(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
     return pd.read_csv(self.TEST_FILE_PATH), pd.read_csv(
         self.TEST_ANSWERS_FILE_PATH)
Example #27
                   "headers, signatures, and quoting.")

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print(__doc__)
op.print_help()
print()


###############################################################################
# Load some categories from the training set

data = pd.read_csv('./data/Train_rev1.csv')
if opts.all_categories:
    categories = None
else:
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]

if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()
Example #28
#!/usr/bin/env python3
import pandas
from sklearn import datasets



if __name__ == '__main__':
  dataframe = pandas.read_csv('iris.csv')
Example #29
 def load_by_pandas(filepath, **kwargs):
     sha1 = digests.sha1(filepath)
     identifier = "sha1:" + sha1
     import pandas
     frame = pandas.read_csv(filepath, **kwargs)
     return CommaSeparatedValue(identifier, frame)