def submission(df, df_test, test_passenger_ids):
    # Train, predict, and assemble a PassengerId + Survived submission file.
    output, forest = train_decision_tree(df[0::, 1::], df[0::, 0], df_test)
    csv = pd.concat([pd.DataFrame(test_passenger_ids),
                     pd.DataFrame({'Survived': output.astype(int)})], axis=1)
    print(forest.feature_importances_)
    csv.to_csv("submission.csv", index=False)
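# The train_decision_tree helper is not defined in this snippet. A minimal
# sketch of what it presumably looks like, assuming scikit-learn (the caller
# prints forest.feature_importances_, which points at a tree ensemble); the
# hyperparameters here are assumptions, not the original code:
from sklearn.ensemble import RandomForestClassifier

def train_decision_tree(train_features, train_labels, test_features):
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_features, train_labels)
    # Return test-set predictions alongside the fitted model.
    return forest.predict(test_features), forest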
def removestop(csvpath, stop):
    csv = pd.read_csv(csvpath)
    mes = csv['message']
    cl = csv['Coding:Level1']
    cidx = []
    iidx = []
    aidx = []
    for i in range(len(mes)):
        # Empty cells come back from pandas as NaN floats; blank them out.
        if not isinstance(mes[i], str):
            if math.isnan(mes[i]):
                mes[i] = ""
        mes[i] = mes[i].strip()  # strip() returns a new string, so assign it back
        m = nl.tokenize.word_tokenize(mes[i])
        sentence = []
        for w in m:
            w = w.lower()
            if w not in stop:
                sentence.append(w)
        mes[i] = sentence
        # Record row indices per Coding:Level1 label.
        if cl[i] == 'Information':
            iidx.append(i)
        elif cl[i] == 'Community':
            cidx.append(i)
        elif cl[i] == 'Action':
            aidx.append(i)
    # The original re-assigned csv['message'] once per row inside a second
    # loop; a single assignment is enough.
    csv['message'] = mes
    csv.to_csv("data/stopsremoved.csv", index=False)
    return mes, cl, cidx, iidx, aidx
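# Hypothetical caller setup for removestop, assuming the stopword set comes
# from NLTK (the input path below is illustrative, not from the original):
import nltk as nl
from nltk.corpus import stopwords

nl.download('punkt')
nl.download('stopwords')
stop = set(stopwords.words('english'))
mes, cl, cidx, iidx, aidx = removestop('data/coded.csv', stop)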
def add_value_averages(length, mapped_dict, mfd):
    if 'CSVs' in os.listdir(os.getcwd()):
        os.chdir('CSVs')
    for file in os.listdir(os.getcwd()):
        if file.startswith(length):
            csv = pd.read_csv(file)
            for key, value in mapped_dict.items():
                word_list = []
                for word in value:
                    if word in mfd:
                        word = word.strip()  # strip() is not in-place; keep the result
                        all_word = word + " (All)"
                        if all_word in csv.columns.values:
                            word_list.append(all_word)
                        elif word in csv.columns.values:
                            word_list.append(word)
                if word_list:
                    # Row-wise mean across the matched word columns.
                    csv[key] = csv[word_list].mean(axis=1)
                    print(key)
                    print(csv[key])
            file_name = length + "-MFD-with-averages.csv"
            csv.to_csv(file_name, index=False)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("path", type=str, help="Dataset path (*.csv)")
    parser.add_argument("destino_path", type=str, help="Path to save the file (*.csv)")
    args = parser.parse_args()
    path = args.path
    destino_path = args.destino_path

    # Read the CSV
    csv = pd.read_csv(path)
    # csv = clean_csv(csv).clean(600, 'not', 'not', confi=None)
    # csv = clean_csv(csv).clean(600, 'yes', 'not', confi=None)
    csv = clean_csv(csv).clean(700, 'yes', 'yes', 3, confi=0.65)
    csv = csv.reset_index(drop=True)

    # Preprocess each column as a list
    p1 = Preprocessing()
    lista_query = p1.preprocess_text(csv['query'].tolist())
    lista_response = p1.preprocess_text([str(i) for i in csv['response'].tolist()])

    # Drop the original query and response columns
    csv.drop(['query', 'response'], axis=1, inplace=True)

    # Add the preprocessed query and response columns back
    csv['query'] = lista_query
    csv['response'] = lista_response
    csv = csv.reset_index(drop=True)

    # Save the CSV file
    csv.to_csv(destino_path + 'dataset_confidence_065__3.csv', index=False)
def saveCSV(self, filename, directory):
    os.chdir(directory)
    csv = self.csv[0]
    csv.to_csv(filename, encoding='utf-8')
def calculate_rating(line):
    csv = pd.read_csv('ratings.csv', sep=',')
    row = csv.loc[line]
    print(row)
    # Weighted average: columns 2-6 hold the counts of 5- down to 1-star
    # votes, column 1 the total number of votes.
    curr_rating = (float(row[2]) * 5 + float(row[3]) * 4 + float(row[4]) * 3 +
                   float(row[5]) * 2 + float(row[6]) * 1) / float(row[1])
    curr_rating = round(curr_rating, 2)
    print(curr_rating)
    # Write the rating back before returning; in the original these two
    # lines sat after the return statement and were never reached.
    csv.loc[line, 'Rating'] = curr_rating
    csv.to_csv("ratings.csv", index=False)
    return curr_rating
def removelinks(csvpath):
    csv = pd.read_csv(csvpath)
    mes = csv['message']
    for i in range(len(mes)):
        if "http" in mes[i] or "www" in mes[i]:
            # Raw strings keep \S from being treated as a string escape.
            mes[i] = regex.sub(r"http\S+", "", mes[i])
            mes[i] = regex.sub(r"www\S+", "", mes[i])
    csv['message'] = mes
    csv.to_csv("data/linksremoved.csv", index=False)
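# Quick sanity check of the substitution pattern used above (the sample text
# is made up; stdlib re is used here, and the PyPI regex package exposes the
# same sub() signature):
import re

sample = "details at http://example.com/page and www.example.org/info here"
sample = re.sub(r"http\S+", "", sample)
sample = re.sub(r"www\S+", "", sample)
print(sample)  # -> "details at  and  here"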
def main():
    query = input('Introduce query: ')
    num_items = check_query(query)
    action = input(
        f'The query returned {num_items} results. Do you want to continue? [y/n] '
    )
    if 'y' in action or 'Y' in action:
        csv = get_csv(num_items, query)
        ts = int(time.time())
        csv.to_csv(f'sample_data/{ts}.csv')
        print(f'Data saved to {ts}.csv')
    else:
        print('Aborting...')
def process_csv(filename, outfile):
    csv = pd.read_csv(filename, names=['timeStamp', 'rawData'])
    # Drop the first row (presumably the file's own header, read in as data).
    csv.drop(csv.index[0], inplace=True)
    csv.reset_index(inplace=True)
    csv.drop(['index'], axis=1, inplace=True)
    # Re-zero timestamps and subtract the baseline (modal) value; mode()
    # returns a Series, so take its first element before converting.
    csv['timeStamp'] -= csv['timeStamp'][0]
    csv['rawData'] = csv['rawData'] - float(csv['rawData'].mode()[0])

    ## Add endTimeStamp ##
    # tmp = csv['timeStamp'].shift(-1).fillna(0)
    # csv['endTimeStamp'] = tmp
    # csv = csv[:-1]
    # csv['offset'] = csv['endTimeStamp'] - csv['timeStamp']

    csv.to_csv(outfile, sep=',', index=False)
def writeLatLngToCSV(files: dict):
    # Output one file per date, creating it if it does not exist.
    # My Maps plots by lat/lng, so each row holds those plus title, date,
    # and other metadata.
    outputFileName = './output/{0}.csv'.format(date.today())
    if not os.path.exists(outputFileName):
        touch(outputFileName)
    csv = pd.read_csv(outputFileName, sep='\t', names=header)
    for _, locations in files.items():
        for location in locations:
            csv = pd.concat([csv, pd.DataFrame([location], columns=header)],
                            ignore_index=True)
    csv.to_csv(outputFileName, encoding='utf-8')  # was: encoding='"utf-8'
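# The touch() helper above is not defined in this snippet; a minimal sketch
# under the assumption that it just creates an empty file:
def touch(path):
    open(path, 'a').close()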
def linkclean(csvpath):
    features = []
    csv = pd.read_csv(csvpath)
    mes = csv['message']
    for i in range(len(mes)):
        # Flag rows that contained a link (1) or not (0).
        if "http" in mes[i] or "www" in mes[i]:
            mes[i] = regex.sub(r"http\S+", "", mes[i])
            mes[i] = regex.sub(r"www\S+", "", mes[i])
            features.append(1)
        else:
            features.append(0)
    csv['message'] = mes
    # Raw string keeps the backslashes of the Windows path intact.
    csv.to_csv(r"..\data\linksremoved.csv", index=False)
    return features
def get_by_salary(min, max):
    uri = "https://ciabhackathon.conductor.com.br:8443/transacoes/data/intervalo/2018-05-01/2018-07-01"
    resp = requests.get(uri, headers={
        'Content-Type': 'application/json',
        'Authorization': 'Token a754f3d7bffaf8abc2570d5c354f8f4015e5487e'
    })
    if resp.status_code != 200:
        print("Request failed")
    else:
        result = resp.json()['data']
        csv = DataFrame(result)
        # to_csv() returns None when given a path, so there is nothing
        # useful to assign from it (the original kept the None).
        csv.to_csv(r'file.csv')
def WriteToCSV(csv, row, column, value):
    # witchFile is a module-level global holding the output file's base name.
    csv.at[row, column] = value
    csv.to_csv(witchFile + '.csv', index=False)
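# Hypothetical usage of WriteToCSV; the witchFile value and the DataFrame
# below are illustrative assumptions, not part of the original:
import pandas as pd

witchFile = 'scores'
df = pd.DataFrame({'name': ['a', 'b'], 'score': [0, 0]})
WriteToCSV(df, 0, 'score', 42)  # updates one cell, then writes scores.csv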
import json

import pandas as pd

with open('clustered_pools.json') as f:
    clust = json.loads(f.read())
with open('column.json') as f:
    ids = json.loads(f.read())
with open('latlon.json') as f:
    latlon = json.loads(f.read())

if __name__ == '__main__':
    # One row per employee: id, serial number, coordinates, and a 1-based
    # cluster ("pool") label.
    data = [[0] * 5 for i in range(len(ids))]
    k = 0
    for i in range(len(clust)):
        for j in clust[i]:
            data[k][0] = ids[j]
            data[k][1] = j
            data[k][2] = latlon[j][0]
            data[k][3] = latlon[j][1]
            data[k][4] = i + 1
            k = k + 1
    csv = pd.DataFrame(data, columns=['Employee', 'Sl.No', 'Lat', 'Lon', 'Pool'])
    # The dead, commented-out csv.writer block is dropped here: it relied on
    # the stdlib csv module, which the DataFrame variable `csv` shadowed.
    csv.to_csv('clust_lsis.csv')
"slug": current["slug"], "name": current["name"], "creator_pseudo": current["creator"]["pseudo"], "categories": "|".join([e["name"] for e in current["categories"]]), "youtube_url": extract_url(current["links"], "youtube"), "twitter_url": extract_url(current["links"], "twitter"), "tip_amount": int(current["parameters"]["tipperAmount"]), "tip_number": int(current["parameters"]["tipperNumber"]), }) with open(FILENAME, "a") as f: writer = csv.DictWriter(f, data[0].keys(), lineterminator="\n") if f.tell() == 0: writer.writeheader() writer.writerows(data) csv = pd.read_csv(FILENAME, parse_dates=["date"]) csv.drop_duplicates(subset=["date", "slug"], keep="last", inplace=True) csv.sort_values(by=["date", "slug"], inplace=True) csv.to_csv(FILENAME, index=False)
        except:
            continue
    # If something goes wrong or the last page is reached, the file is
    # written out below.
    print(endereco)
    print('PAGINA: ', pagina)
    pagina -= 1
    if pagina <= 90:
        break

# listass.to_csv('lista.csv')
'''
csv = pd.DataFrame()
csv['titulo'] = lista_titulo
csv['corpo'] = lista_corpo
csv['url'] = lista_url
csv.to_csv('el_pais_full.csv')
'''
s1 = pd.Series(lista_titulo, name='titulo')
s2 = pd.Series(lista_corpo, name='corpo')
s3 = pd.Series(lista_url, name='url')
def save_results(scores, true_labels, model, dataset, method, weight, label,
                 random_seed, anomaly_type, anomaly_proportion, step=-1):
    directory = 'results/{}/{}/{}_{}/{}/w{}/'.format(
        model, dataset, anomaly_type, anomaly_proportion, method, weight)
    if not os.path.exists(directory):
        os.makedirs(directory)
    print(directory, dataset)
    if dataset != 'kdd':
        print("Test on", dataset)
        file_name = str(label) + "_step" + str(step)
        if anomaly_type == 'novelty':
            print("NOVELTY")
            c = 90
            if dataset == 'rop':
                c = 22
        else:
            c = anomaly_proportion * 100
            file_name = "{}_step{}_rd{}".format(label, step, random_seed)
        # Note: this overrides the novelty-specific c set above (kept as in
        # the original). The highest c% of scores are treated as anomalous.
        c = anomaly_proportion * 100
        per = np.percentile(scores, 100 - c)
        fname = directory + "{}.csv".format(label)
        csv_file = directory + "scores.csv"
    else:
        file_name = "kdd_step{}_rd{}".format(step, random_seed)
        # Highest 20% are anomalous
        per = np.percentile(scores, 80)
        fname = directory + "results.csv"
        csv_file = directory + "scores.csv"

    # Dump raw scores and labels for later analysis.
    scores = np.array(scores)
    csv = pd.DataFrame()
    csv['scores'] = scores
    csv['labels'] = true_labels
    csv.to_csv(csv_file, index=False)

    # try:
    #     scores_norm = (scores - min(scores)) / (max(scores) - min(scores))
    # except:
    #     scores_norm = (scores - scores.min()) / (scores.max() - scores.min())
    print(max(scores), min(scores))

    roc_auc = do_roc(scores, true_labels, file_name=file_name, directory=directory)
    prc_auc = do_prc(scores, true_labels, file_name=file_name, directory=directory)
    do_cumdist(scores, file_name=file_name, directory=directory)
    prg_auc = 0  # do_prg(scores, true_labels, file_name=file_name, directory=directory)

    '''
    plt.close()
    plt.figure()
    idx_inliers = true_labels == 0
    idx_outliers = true_labels == 1
    hrange = (min(scores), max(scores))
    plt.hist(scores[idx_inliers], 50, facecolor=(0, 1, 0, 0.5),
             label="Normal samples", density=True, range=hrange)
    plt.hist(scores[idx_outliers], 50, facecolor=(1, 0, 0, 0.5),
             label="Anomalous samples", density=True, range=hrange)
    plt.title("Distribution of the anomaly score")
    plt.legend()
    plt.savefig(directory + 'histogram_{}_{}.png'.format(random_seed, dataset),
                transparent=True, bbox_inches='tight')
    '''

    y_pred = (scores >= per)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels.astype(int), y_pred.astype(int), average='binary')
    print("Testing at step %i, method %s: Prec = %.4f | Rec = %.4f | F1 = %.4f"
          % (step, method, precision, recall, f1))
    print("Testing method {} | ROC AUC = {:.4f} | PRC AUC = {:.4f} | PRG AUC = {:.4f}"
          .format(method, roc_auc, prc_auc, prg_auc))

    results = [model, dataset, anomaly_type, anomaly_proportion, method, weight,
               label, step, roc_auc, prc_auc, prg_auc, precision, recall, f1,
               random_seed, time.ctime()]
    save_results_csv("results/results.csv", results, header=0)
    results = [step, roc_auc, prc_auc, precision, recall, f1, random_seed]
    save_results_csv(fname, results, header=0)
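# save_results_csv is not defined in this snippet. A minimal sketch under the
# assumption that it appends one row of results to a CSV file; the handling
# of the `header` argument is guesswork, so it is accepted but ignored here:
import csv as csv_module
import os

def save_results_csv(fname, results, header=0):
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    with open(fname, 'a', newline='') as f:
        csv_module.writer(f).writerow(results)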
import json

import pandas as pd

with open('split_pools.json') as f:
    clust = json.loads(f.read())
with open('column.json') as f:
    ids = json.loads(f.read())
with open('latlon.json') as f:
    latlon = json.loads(f.read())

if __name__ == '__main__':
    # Same layout as the clustering script above, plus an 'Order' column
    # giving each employee's 1-based position within their pool.
    data = [[0] * 6 for i in range(len(ids))]
    k = 0
    for i in range(len(clust)):
        c = 1
        for j in clust[i]:
            data[k][0] = ids[j]
            data[k][1] = j
            data[k][2] = latlon[j][0]
            data[k][3] = latlon[j][1]
            data[k][4] = i + 1
            data[k][5] = c
            c = c + 1
            k = k + 1
    csv = pd.DataFrame(data,
                       columns=['Employee', 'Sl.No', 'Lat', 'Lon', 'Pool', 'Order'])
    csv.to_csv('split_lsis.csv')
iter = 1
for i in range(sample_n + 1):
    random_samples.append(random.sample(regions_list, choose))
for i in random_samples:
    print(str(iter) + " / " + str(sample_n))
    # OR together the coverage masks of the `choose` sampled regions.
    all_or = regions_dict[i[0]]
    for j in range(1, choose):
        all_or = all_or | regions_dict[i[j]]
    combos["_".join(str(x) for x in list(i))] = all_or.count("1")
    iter = iter + 1

sorted_combos = sorted(combos.items(), key=operator.itemgetter(1), reverse=True)
csv = pd.DataFrame(index=range(sample_n + 1), columns=["regions", "coverage"])
csv["regions"] = list(dict(sorted_combos).keys())
csv["coverage"] = list(dict(sorted_combos).values())

plt.figure(figsize=(10, 5))
sns.distplot(csv["coverage"], bins=100, rug=False)
plt.savefig("coverage_figures/t" + str(top_num) + "c" + str(choose)
            + "n" + str(sample_n) + ".png")
csv.to_csv("coverage_data/t" + str(top_num) + "c" + str(choose)
           + "n" + str(sample_n) + ".csv", sep=',')
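# Sanity note on the loop above: whatever regions_dict holds must support
# both | (bitwise OR) and .count("1"). Plain Python strings support the
# latter but not the former, so the values are presumably bitmask-like
# objects. A tiny self-contained model of that assumption, using '0'/'1'
# strings and an explicit OR helper (names here are illustrative):
def or_masks(a, b):
    # Positionwise OR of two equal-length '0'/'1' strings.
    return "".join("1" if x == "1" or y == "1" else "0" for x, y in zip(a, b))

regions = {"r1": "1100", "r2": "0110"}
combined = or_masks(regions["r1"], regions["r2"])
print(combined, combined.count("1"))  # -> 1110 3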
def addLink():
    csv = pd.read_csv('../result/info.csv')
    print(len(csv))
    csv['link'] = csv.apply(lambda x: getLinkNumber(x['idx']), axis=1)
    csv.to_csv('../result/info1.csv', index=False)
def writeCSV(csv, csvfilepath):
    csv.to_csv(csvfilepath)
def delIndex(id):
    # Round-trip the file to drop the stored index column: read it back in
    # as the index, then write without it.
    csv = pd.read_csv('../result/idcsv/%d.csv' % id, index_col=0)
    csv.to_csv('../result/idcsv/%d.csv' % id, index=False)
            ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', ' ', ' '])
        if count == g + 25:
            break
        count += 1

csv_file.close()
csv_file4.close()
csv = pd.read_csv(start_from_name, delimiter=';')
# csv.columns = ['BEDRIJF_INPUT', 'VESTIGING', 'bagId', 'city', 'country',
#                'gpslat', 'gpslon', 'housenumber', 'houseNumberAddition',
#                'postalCode', 'rijksdriehoekX', 'rijksdriehoekY',
#                'rijksdriehoekZ', 'street', 'type', 'branchNumber',
#                'isMainSbi', 'sbiCode', 'sbiCodeDescription', 'employees',
#                'foundationDate', 'hasCommercialActivities',
#                'hasEntryInBusinessRegister', 'hasNonMailingIndication',
#                'isBranch', 'isLegalPerson', 'isMainBranch', 'kvkNumber',
#                'legalForm', 'registrationDate', 'rsin', 'businessName',
#                'currentStatutoryNames', 'currentTradeNames',
#                'shortBusinessName', 'fuzzy_match_score',
#                'matched_company_name']
start_from_name = ('/Users/gielderks/Downloads/Code/Final_excel/profilersinput/ouput_'
                   + start_from + '_till_' + str(count) + '.csv')
csv.to_csv(start_from_name, index=False)
# pprint.pprint(companies)
def urlID(id):
    global db
    csv = pd.read_csv('../result/csv/%d.csv' % id)
    csv['urlID'] = csv.apply(lambda x: getID(x.url), axis=1)
    csv.to_csv('../result/idcsv/%d.csv' % id)
def sub_task2_file():
    validation = pd.read_csv(PM.task2_validation_csv, header=0,
                             index_col='author_name')
    txt = pd.read_csv(PM.task2_target, sep='\t', header=0,
                      index_col='authorname')
    # Align the target rows to the validation authors, backfilling gaps.
    csv = txt.loc[validation.index].fillna(method='bfill')
    csv.index.name = 'authorname'
    csv.to_csv(PM.sub_task2, sep='\t', header=1, index=True)
def menu(self):
    cel1 = Celular.celular()
    cel2 = Celular.celular()
    compara = Comparador.comparador()

    # Create the xlsx file that will hold the handset data
    workbook = xlsxwriter.Workbook('Dados_Smartphones.xlsx')
    worksheet = workbook.add_worksheet()

    # Write the table's column headers
    worksheet.write(0, 0, 'Marca')
    worksheet.write(0, 1, 'Modelo')
    worksheet.write(0, 2, 'Capacidade da Bateria (mAh)')
    worksheet.write(0, 3, 'Memória RAM (GB)')
    worksheet.write(0, 4, 'Memória de Armazenamento (GB)')
    worksheet.write(0, 5, 'Bluetooth')
    worksheet.write(0, 6, 'NFC')
    worksheet.write(0, 7, 'Dual Chip')
    worksheet.write(0, 8, 'LTE (4G)')
    worksheet.write(0, 9, 'Resolução da Câmera (Mpx)')
    worksheet.write(0, 10, 'Peso (g)')
    worksheet.write(0, 11, 'Dimensões')
    worksheet.write(0, 12, 'Tamanho da Tela (")')
    worksheet.write(0, 13, 'Sistema Operacional')
    worksheet.write(0, 14, 'Versão SO')
    worksheet.write(0, 15, 'Processamento (GHz)')
    worksheet.write(0, 16, 'Link fonte')
    worksheet.write(0, 17, 'Data de atualização')
    worksheet.write(0, 18, 'Ano do lançamento')
    worksheet.write(0, 19, 'Preço (R$)')
    worksheet.write(0, 20, 'Avaliação do Site')
    worksheet.write(0, 21, 'Avaliação dos Usuários')

    # Create the file holding the application logs
    log = open('logs.txt', 'w')
    log.write('Smartphones Description - Getting Database\n\n')
    log.write('Execution logs:\n\n')
    log.close()

    book = xlrd.open_workbook("ListaSmartphones.xls")
    sh = book.sheet_by_index(0)
    lista = []
    for rx in range(sh.nrows):
        lista.append(sh.row(rx))

    for linha, value in enumerate(lista):
        try:
            # Take each entry of the smartphone list and look it up on the sites
            aparelho = str(lista[linha + 1]).split("'")[1].upper()
            kim = Kimovil.kimovil(aparelho)
            pha = PhoneArena.phoneArena(aparelho)
            cel1 = kim.executa()
            cel2 = pha.executa()
            compara.armazena(cel1, cel2, linha, worksheet)
        except Exception:
            print("NOT FOUND")

    log = open('logs.txt', 'a')
    log.write('Most of the chosen data came from Kimovil, whose database is more extensive.\n\n')
    log.close()

    # Finalize the workbook so the file is actually written (the original
    # never closed it, and also opened handsets.csv in 'wb' mode without
    # closing it; to_csv handles the output file itself).
    workbook.close()
    csv = pd.read_excel('Dados_Smartphones.xlsx')
    csv.to_csv('handsets.csv', index=False)

    # Build the zip archive
    with ZipFile('gettingDatabase.zip', 'w') as myzip:
        myzip.write('handsets.csv')
        myzip.write('logs.txt')
def sortSeg(id):
    print("sorting %d" % id)
    csv = pd.read_csv('../result/segg/seg%d.csv' % id)
    csv.sort_values('times', inplace=True, ascending=False)
    csv.to_csv("../result/seg/%d.csv" % id, index=False)
        except:
            continue
    print(endereco)
    print('PAGINA: ', pagina)
    pagina -= 1
    if pagina <= 0:
        break

csv = pd.DataFrame(lista)
csv.to_csv('lista.csv')
'''
csv = pd.DataFrame()
csv['titulo'] = lista_titulo
csv['corpo'] = lista_corpo
csv['url'] = lista_url
csv.to_csv('el_pais_full.csv')

s1 = pd.Series(lista_titulo, name='titulo')
s2 = pd.Series(lista_corpo, name='corpo')
s3 = pd.Series(lista_url, name='url')
s1.to_csv('s1_internacional.csv')
s2.to_csv('s2_internacional.csv')
'''