def index_queries_by_levels(data, ES_CONN, index_name, QUERIES_DOC_TYPE, fields_enc_level,
                            file_name, create_query_func, run_logs):
    queries = []
    count = 0
    run_logs.insert_log('going to start indexing {0} queries'.format(index_name))
    if file_name:
        for row in get_csv_data(data, file_name):
            queries.append(create_query_func(row, fields_enc_level))
            # Push to the percolator in batches of 1000, retrying until the bulk call succeeds.
            if len(queries) == 1000:
                response = push_queries_to_percolator(ES_CONN, index_name, QUERIES_DOC_TYPE, queries, run_logs)
                while not response:
                    run_logs.insert_log('indexing error.. going to index again in 5 sec from {0}'.format(file_name))
                    time.sleep(5)
                    response = push_queries_to_percolator(ES_CONN, index_name, QUERIES_DOC_TYPE, queries, run_logs)
                count += 1000
                run_logs.insert_log('successfully indexed {0} queries from {1} ...'.format(count, file_name))
                queries = []
    # else:
    #     file_name = "DATE"
    #     queries = create_date_querys()

    # Flush any remaining queries that did not fill a full batch.
    if len(queries) > 0:
        response = push_queries_to_percolator(ES_CONN, index_name, QUERIES_DOC_TYPE, queries, run_logs)
        while not response:
            run_logs.insert_log('indexing error.. going to index again in 5 sec from {0}'.format(file_name))
            time.sleep(5)
            response = push_queries_to_percolator(ES_CONN, index_name, QUERIES_DOC_TYPE, queries, run_logs)
        run_logs.insert_log('Successfully indexed {0} queries from {1}'.format(count + len(queries), file_name))
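# Hypothetical usage sketch, not part of the original module: everything below
# (the Elasticsearch host, the index/doc-type names, the build_percolator_query
# factory, the RunLogs helper, and the data/file_name values) is assumed purely
# to illustrate how index_queries_by_levels might be driven.
from elasticsearch import Elasticsearch  # assumed client library

es_conn = Elasticsearch(['http://localhost:9200'])  # assumed local cluster
run_logs = RunLogs()                                # assumed logging helper exposing insert_log()
index_queries_by_levels(
    data='input',                                   # assumed base path passed through to get_csv_data
    ES_CONN=es_conn,
    index_name='queries',
    QUERIES_DOC_TYPE='percolator_query',
    fields_enc_level=2,
    file_name='keywords.csv',
    create_query_func=build_percolator_query,       # assumed row -> percolator-query factory
    run_logs=run_logs,
)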
def main():
    input_filename = "./data.csv"
    output_filename = "validation_logs.txt"
    # hacky way to redirect print statements to the output file
    sys.stdout = open(output_filename, 'w')
    records = get_csv_data(input_filename)
    validate_records(records)
def send(params):
    data = get_csv_data()
    # Find the record whose id matches and overwrite its fields with the new values.
    for user in data:
        user_id = str(user['id'])
        params_id = str(params['id'])
        if user_id == params_id:
            for key in user.keys():
                user[key] = params[key]
            break
    # Write the updated records back to the CSV file.
    keys = data[0].keys()
    with open('./data.csv', 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, keys)
        writer.writeheader()
        writer.writerows(data)
    send_mail(params)
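# Hypothetical usage (for illustration only): the field names below are assumed,
# since the real CSV schema is not shown. Note that params must carry a value for
# every column in the CSV, because send() copies params[key] for each existing key
# of the matched row before rewriting ./data.csv and emailing the updated record.
if __name__ == '__main__':
    send({
        'id': '42',                      # assumed existing user id
        'name': 'Ada Lovelace',          # assumed column
        'email': 'ada@example.com',      # assumed column
    })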
import sys
sys.path.append('C:\\Users\\qian\\Desktop\\homework')

import pytest

from utils import get_csv_data, get_csv_loginname
from all_api.topics import Topics

test_data_csv = get_csv_data('cases/data.csv')
# print(test_data_csv)
test_create_csv = get_csv_data('cases/create.csv')
test_update_csv = get_csv_data('cases/update.csv')
test_loginname_csv = get_csv_loginname('cases/loginname.csv')
test_first_comment_csv = get_csv_loginname('cases/firstcomment.csv')


@pytest.fixture
def get_accesstoken():
    # r = request.get('http://39.107.96.138:3000/api/v1', auth=('user', 'pass'))
    # res = r.json()
    # accesstoken = res['accesstoken']
    accesstoken = 'd38a77f0-6f29-45cd-8d49-f72f15b98fd2'
    return accesstoken


# @pytest.mark.skip(reason='')
@pytest.fixture
def get_topic_id(get_accesstoken):
    create_url = '/topics'
    topics = Topics(create_url)
    r = topics.post_create_topics(
        title='11111111111111111111111111111',
        tab='ask',
"""isblank.py: does a CSV file contain empty values. This script is used to check whether any of the values in a given column of a CSV file are blank. This is useful when working with translations to see whether there are any that have been missed. """ import sys from utils import get_csv_data if __name__ == '__main__': if len(sys.argv) != 3: print "" print "usage: isblank.py <file> <column name>" sys.exit(1) file_path = sys.argv[1] column_name = sys.argv[2] table = get_csv_data(file_path) for idx, row in enumerate(table): if idx == 0: column = row.index(column_name) else: if not row[column].strip(): print "Row %d is blank"
def simulation():
    # Assumptions
    preferences_csv_file = 'preferences.csv'
    t_matrix_csv_file = 'transition_matrix.csv'
    num_users = 100000
    t_matrix_start_value = .9
    num_observations = 3
    mean_user_preference = 5
    sigma_user_preference = 3
    sites_to_destroy = ['KickassTorrents', 'Torrentz']

    # Parse CSV
    data, columns = get_csv_data(preferences_csv_file)
    hidden_states = columns['Name']
    observables = data[0][1:]
    categories = list(columns.keys())
    categories.remove('Name')

    site_scores = {}
    for row in data[1:]:
        site_scores[row[0]] = [int(x) for x in row[1:]]

    users = generate_users(categories, num_users, mean_user_preference,
                           sigma_user_preference)

    # Probability of each user starting in each site
    # Each array is a user, the values are the sites (in order from the csv)
    state_pi_map = {k: 0 for k in hidden_states}
    pi_list = []
    for user in users:
        user_sim_prob = similarity_probabilities_with_user(
            user.preferences, site_scores)
        for k, v in user_sim_prob.items():
            state_pi_map[k] += v
        pi_list.append(user_sim_prob)
    state_pi_map = {k: v / num_users for k, v in state_pi_map.items()}

    print('Probability of a user starting at a site:')
    print(state_pi_map)

    # Transition matrix
    transition_matrix = {}
    data_t_matrix, columns_t_matrix = get_csv_data(t_matrix_csv_file)
    data_t_matrix = data_t_matrix[1:]

    # Set initial values from csv
    for row in data_t_matrix:
        transition_matrix[row[0]] = dict(
            zip(hidden_states,
                normalize([int(x) for x in row[1:]], 1 - t_matrix_start_value)))

    # Destroy sites and set self values (i.e. Netflix -> Netflix, Primewire -> Primewire)
    for key, value in transition_matrix.items():
        value[key] = t_matrix_start_value
        for site_to_destroy in sites_to_destroy:
            value[site_to_destroy] = 0

    # Normalize again to account for changes
    for key, value in transition_matrix.items():
        transition_matrix[key] = dict(
            zip(hidden_states, normalize(value.values())))

    # Populate emission matrix for each user
    emission_matrix_list = []
    for user in users:
        emission_matrix = {}
        for h_state in hidden_states:
            emission_matrix[h_state] = {}
            for i, observable in enumerate(observables):
                emission_matrix[h_state][observable] = user.preferences[i] / 10
        emission_matrix_list.append(emission_matrix)

    observations_over_time_list = []
    for user in users:
        observations_over_time_list.append(
            tuple(select_preferences(categories, user, num_observations)))

    freq_hidden_states = {k: 0 for k in hidden_states}
    for i, user in enumerate(users):
        path, max_prob = viterbi(observations_over_time_list[i],
                                 tuple(hidden_states), pi_list[i],
                                 transition_matrix, emission_matrix_list[i])
        freq_hidden_states[path[-1]] += 1

    print('After destroying the sites:')
    print({k: v / num_users for k, v in freq_hidden_states.items()})
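# A minimal sketch (an assumption, not the simulation's actual helper) of the
# normalize function used above: it rescales a sequence of numbers so they sum
# to the given total. That matches how the transition rows are first squeezed
# into the probability mass left over after the 0.9 self-transition, and later
# re-normalized to sum to 1 after the destroyed sites are zeroed out.
def normalize(values, total=1.0):
    values = list(values)
    current = float(sum(values))
    if current == 0:
        # Nothing to distribute; keep the row at zero probability everywhere.
        return [0.0 for _ in values]
    return [v * total / current for v in values]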
import sys

from utils import get_csv_data
from csvkit import CSVKitWriter

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print ""
        print "usage: create_group.py <file> <file>"
        sys.exit(1)

    filein = sys.argv[1]
    fileout = sys.argv[2]

    table = get_csv_data(filein)

    # Collect each distinct genus (first word of the scientific name), preserving order.
    genera = set()
    ordered = [['genus']]
    for idx, row in enumerate(table):
        if idx == 0:
            column = row.index('scientific_name')
        else:
            genus = row[column].split()[0]
            if genus not in genera:
                genera.add(genus)
                ordered.append([genus])

    with open(fileout, 'wb') as fp:
        writer = CSVKitWriter(fp)
import sys

from utils import get_csv_data, get_column_values

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print ""
        print "usage: issubset.py <file> <file> <column name>"
        sys.exit(1)

    subset_path = sys.argv[1]
    superset_path = sys.argv[2]
    column_name = sys.argv[3]

    superset_table = get_csv_data(superset_path)
    subset_table = get_csv_data(subset_path)

    superset = get_column_values(column_name, superset_table)
    subset = get_column_values(column_name, subset_table)

    print "The first file contains %d distinct values" % len(subset)
    print "The second file contains %d distinct values" % len(superset)

    # Collect values from the first file that never appear in the second.
    missing = []
    for item in subset:
        if item not in superset:
            missing.append(item)

    if missing:
def get_users():
    return get_csv_data()