def test_remove_unused_categories_in_df(self):
    df = turn_columns_to_categorical(
        pd.DataFrame({
            'month': [4, 4, 4],
            'day': [5, 6, 7],
            'day_of_week': [1, 2, 3]
        }),
        ['month', 'day'])
    assert_array_equal(df.day.cat.categories.values, [5, 6, 7])
    # Filtering keeps the full category set; only the rows change.
    df = filter_df(df, 'day', 5)
    assert_array_equal(df.day.cat.categories.values, [5, 6, 7])
    df = remove_unused_categories_in_df(df)
    assert_array_equal(df.day.cat.categories.values, [5])
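# The helpers exercised above are defined elsewhere. A minimal sketch of what
# `turn_columns_to_categorical` and `remove_unused_categories_in_df` could look
# like, inferred from the assertions (an assumption, not the project's actual
# implementation):

import pandas as pd

def turn_columns_to_categorical(df, columns):
    # Convert the named columns to pandas' categorical dtype.
    for col in columns:
        df[col] = df[col].astype('category')
    return df

def remove_unused_categories_in_df(df):
    # Drop category levels that no longer occur in any row, so
    # df.day.cat.categories shrinks to [5] after filtering to day == 5.
    for col in df.select_dtypes(include='category').columns:
        df[col] = df[col].cat.remove_unused_categories()
    return df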
def test_filter_df(self):
    assert_frame_equal(
        filter_df(
            pd.DataFrame([4, 4, 4, 5, 5, 5, 6, 6, 6], columns=['month']),
            'month', 4),
        pd.DataFrame({'month': [4, 4, 4]}))
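# A sketch of `filter_df` consistent with both tests above: keep the rows where
# `column` equals `value`, resetting the index so the frames compare equal
# (an assumption; the real helper may differ).

def filter_df(df, column, value):
    return df[df[column] == value].reset_index(drop=True)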
if __name__ == "__main__": tweets_data_path = 'tweets_by_country/' # _translation out = {} all_files = get_all_files(Config.data_path + tweets_data_path, extension='csv') for country in Config.country_prefix: # ['de_', 'fr_', 'nl_']: df = pd.DataFrame() news_files = list(filter(lambda x: country in x, all_files)) for file in news_files: data = pd.read_csv(file, names=Config.colnames, usecols=Config.usecols_list) data.dropna(axis=0, how='any', inplace=True) df = df.append(data, ignore_index=True) text_df = filter_df(Config.keywords, df) # translated_df = translation(text_df) cleaned_df = preprocess(df=text_df) # df= translated_df extreme_pos_count, total_pos, total_neu, total_neg, extreme_neg_count = extreme_vader_sentiment( text_df) out[country.replace('_', '')] = extreme_result(extreme_pos_count, total_pos, total_neu, total_neg, extreme_neg_count) save_to_disk(data=out, path=Config.reports_path, filename='all_extreme_sentiment_summary_country.json')
    # Slide a fixed-width window along the MFCC's time axis; each window is
    # `window_size` frames wide, advanced by `stride` frames. (The enclosing
    # function's signature is elided in this excerpt.)
    for start in range(0, int((mfcc.shape[1] - window_size) / stride) + 1):
        chunk.append(mfcc[:, start * stride:(start * stride + window_size)])
        chunk_labels.append(label)
    return (chunk, chunk_labels)


if __name__ == '__main__':
    """ Classify Audio """
    args = get_args()
    if args.load_cache:
        df = pd.read_csv(args.source_csv)
        df = filter_df(df)
        # Encode languages as integer labels; keep the mapping for decoding.
        factor = pd.factorize(df['native_language'])
        df['native_language'] = factor[0]
        language_set = factor[1].values
        del factor
        train_X, test_X, train_Y, test_Y = split_data(df, test_split=0.2)
        train_Y, test_Y = train_Y.values, test_Y.values
        train_size, test_size = len(train_X), len(test_X)
        with open('train_labels.dump', 'wb') as f:
            pickle.dump([train_Y, test_Y], f)
        logging.basicConfig(
            level=(logging.DEBUG if args.debug else logging.INFO))
        logger = logging.getLogger(__name__)
        logger.debug(" Training set size = {}".format(train_size))
        logger.debug(" Testing set size = {}".format(test_size))
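# `split_data` is not shown here. A plausible sketch using scikit-learn's
# train_test_split; the target column name and the stratification choice are
# assumptions.

from sklearn.model_selection import train_test_split

def split_data(df, test_split=0.2):
    X = df.drop(columns=['native_language'])
    y = df['native_language']
    # Stratify so each language keeps its proportion in both splits.
    return train_test_split(X, y, test_size=test_split, stratify=y)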
def process_election(election_date):
    service = build('sheets', 'v4', credentials=creds)
    dfs = []
    requests_payload = []

    # Sheet 0: raw precinct-level results from the NCSBE results archive.
    sheet_id = 0
    file_name = 'results_pct'
    snake_date = election_date[0:4] + '_' + \
        election_date[4:6] + '_' + election_date[6:]
    url = f'https://dl.ncsbe.gov/ENRS/{snake_date}/{file_name}_{election_date}.zip'
    print(url)
    data = utils.get_zipfile(url, f'{file_name}_{election_date}.txt')
    raw_results_df = pd.read_csv(data, delimiter='\t')
    raw_results_df = utils.filter_df(raw_results_df)
    raw_results_df = raw_results_df.sort_values(['Contest Name', 'Choice'])
    grid_coordinate = utils.grid_for_sheet(sheet_id)
    request_payload = utils.payload_for_file(raw_results_df, grid_coordinate)
    # requests_payload.append(request_payload)

    # Per-contest share of precincts reporting, excluding non-geographic
    # "precincts" (transfers, one-stop, curbside, provisional, absentee).
    filtered_precinct_df = raw_results_df.copy()
    precinct_blacklist = ['TRANS', 'ONE', 'OS', 'CURB', 'PROVI', 'ABSEN']
    for item in precinct_blacklist:
        filtered_precinct_df = filtered_precinct_df[
            ~filtered_precinct_df['Precinct'].str.contains(item)]
    grouped = filtered_precinct_df.groupby(
        ['Contest Name', 'Precinct']).sum().reset_index()
    grouped = grouped.dropna()
    precinct_counts = grouped.groupby('Contest Name').count()
    precinct_reported_counts = grouped[
        grouped['Total Votes'] > 0].groupby('Contest Name').count()
    # The join suffixes overlapping columns, e.g. 'Precinct' -> 'Precinctreported'.
    precinct_df = precinct_counts.join(
        precinct_reported_counts, rsuffix='reported')
    precinct_df = pd.DataFrame(
        {'precincts_reported_perc':
             precinct_df['Precinctreported'] / precinct_df['Precinct']},
        index=precinct_df.index)

    # Sheet 1496596366: live candidate totals from the ENR feed.
    sheet_id = 1496596366
    url = f'https://er.ncsbe.gov/enr/{election_date}/data/results_0.txt'
    print(url)
    resp = requests.get(url)
    candidate_df = pd.read_json(resp.content)
    candidate_df = candidate_df.drop(
        ['cid', 'vfr', 'gid', 'lid', 'dtx', 'prt', 'ptl', 'col', 'ogl', 'ref'],
        axis=1)
    candidate_df = candidate_df.rename(columns={
        'cnm': 'Race', 'bnm': 'Candidate', 'pty': 'Party',
        'vct': 'Total Votes', 'pct': 'Percent of Vote',
        'evc': 'Election Day Vote Count', 'avc': 'Absentee Vote Count',
        'ovc': 'One-Stop Vote Count', 'pvc': 'Provisional Vote Count'})
    filtered_df = utils.filter_df(candidate_df)
    filtered_df = filtered_df.sort_values(['Race', 'Candidate'])
    grid_coordinate = utils.grid_for_sheet(sheet_id)
    request_payload = utils.payload_for_file(filtered_df, grid_coordinate)
    # requests_payload.append(request_payload)
    # Keep one row per (Race, Candidate): the one with the highest vote total.
    filtered_df = filtered_df.loc[filtered_df.groupby(
        ['Race', 'Candidate'])['Total Votes'].idxmax().dropna()]

    # Sheet 2103006474: candidate totals joined with precinct reporting.
    joined = utils.build_joined_df(filtered_df, precinct_df)
    sheet_id = 2103006474
    grid_coordinate = utils.grid_for_sheet(sheet_id)
    request_payload = utils.payload_for_file(joined, grid_coordinate)
    requests_payload.append(request_payload)

    update_payload = {'requests': requests_payload}
    sheet = service.spreadsheets()
    request = sheet.batchUpdate(
        spreadsheetId=SPREADSHEET_ID, body=update_payload)
    response = request.execute()
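# `utils.get_zipfile` is defined elsewhere. A minimal sketch, assuming it
# downloads a zip archive and returns a file-like handle on one member that
# pd.read_csv can consume:

import io
import zipfile
import requests

def get_zipfile(url, member_name):
    resp = requests.get(url)
    resp.raise_for_status()
    archive = zipfile.ZipFile(io.BytesIO(resp.content))
    return archive.open(member_name)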