def merge_csvs():
    keris_data_dir = os.path.join(DATA_DIR, "export_pdfs")
    ret = []
    for filename in os.listdir(keris_data_dir):
        with open(os.path.join(keris_data_dir, filename)) as f:
            reader = csv.reader(f)
            try:
                for i, row in enumerate(reader):
                    if i == 0:  # skip header row
                        continue
                    year, title, category, context = row
                    context = context.replace("\n", " ")
                    # Strip PDF-extraction artifacts such as "(cid:123)"
                    context = re.sub(r"\(cid:\d{1,10}\)", "", context)
                    ret.append(
                        {
                            "year": year,
                            "title": title,
                            "category": category,
                            "context": context,
                        }
                    )
            except csv.Error:
                # Skip files that are not valid CSV
                pass
    write_csv(ret, "output", "keris.csv")

def xgb():
    print("Training an XGB Classifier")
    params = {
        "max_depth": 8,
        "n_estimators": 400,
        "learning_rate": 0.05,
        "n_jobs": -1,
        "subsample": 0.8,
        "nthread": 4,
    }
    # Hold out 30% of the training data as an evaluation set
    trX_, tvX_, trY_, tvY_ = train_test_split(trX, trYi, test_size=0.3)
    gbm = XGBClassifier(**params)
    print(gbm.get_xgb_params())
    gbm.fit(trX_, trY_, eval_set=[(tvX_, tvY_)], verbose=True)

    # Find training accuracy
    trP = classes[gbm.predict(trX)]
    print("Training Accuracy: ", 100 * accuracy(trY, trP))

    # Dump test labels
    tsP = classes[gbm.predict(tsX)]
    write_csv("xgb_d5_n150.csv", tsP)

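# `accuracy` is a project-local helper used in several of these scripts;
# a one-line sketch, assuming element-wise label comparison (an assumed
# implementation, for illustration only):
def accuracy(y_true, y_pred):
    # Fraction of predictions matching the true labels, in [0, 1]
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))
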
def get_pdf_files(filename):
    output_string = StringIO()
    print(f"{filename} read start")
    # Split the file name into a date prefix and the remaining title
    p = re.compile(FORMAT_STRING)
    remain = p.split(filename)
    date = p.search(filename).group()
    title = remain[1].split(".pdf")[0]
    title = title.replace(" ", "", 1).replace("_", "", 1)
    # Extract the text of every page with pdfminer
    with open(filename, "rb") as file:
        parser = PDFParser(file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    text = output_string.getvalue()
    # Strip PDF-extraction artifacts such as "(cid:123)"
    text = re.sub(r"\(cid:\d{1,4}\)", "", text)
    write_csv(
        [
            {
                "year": date.split("-")[0],
                "title": title,
                "category": CATEGORY,
                "context": text,
            }
        ],
        "export_pdfs",
        f"{os.path.basename(filename)}.csv",
    )

def _export_query(
    sqlite_db: str,
    query: str,
    output_csv: str,
    header_table: Optional[str] = None,
):
    log = getLogger()
    db_conn: sqlite3.Connection = sqlite3.connect(sqlite_db)
    # The header row (if any) is followed by the query results
    rows_to_write = []
    try:
        if header_table:
            rows_to_write = [_get_col_names(db_conn, header_table)]
    except (OSError, sqlite3.DatabaseError) as e:
        log.error(
            f"Could not dump table column names from DB {sqlite_db}/{header_table}: {e}"
        )
    cursor = db_conn.cursor()
    try:
        cursor.execute(query)
        rows_to_write.extend(cursor.fetchall())
        write_csv(csv_file=output_csv, rows=rows_to_write)
    except (OSError, sqlite3.DatabaseError) as e:
        log.error(f"Could not dump query {query} from DB {sqlite_db}: {e}")
    finally:
        cursor.close()
        db_conn.close()

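# `_get_col_names` is a project-local helper; a minimal sketch, assuming it
# reads column names from SQLite's schema (an assumed implementation, not
# necessarily the project's actual code):
def _get_col_names(db_conn: sqlite3.Connection, table: str):
    # PRAGMA table_info yields one row per column; index 1 is the name
    cursor = db_conn.execute(f"PRAGMA table_info({table})")
    return [row[1] for row in cursor.fetchall()]
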
def main(isochrone: str, place_cache: str, output: str, polygon_step_time_min: int = 7):
    setup_log()
    log = getLogger()

    log.info(f"Reading isochrone map from {isochrone} ...")
    with codecs.open(isochrone, 'r', 'utf-8-sig') as map_:
        isochrone_map = json.load(map_)
    polygons = _build_polygons(isochrone_map)

    log.info(f"Reading places cache from {place_cache} ...")
    places = filter(None, map(Place.from_csv_row, read_csv(place_cache)))

    city_to_time_to_wroclaw: Dict[str, Optional[int]] = {}
    log.info("Finding time to reach destination for places...")
    for p in places:
        # The dict is keyed by city name, so compare against p.city
        if p.city not in city_to_time_to_wroclaw:
            # Each polygon ring represents one additional time step away
            index = _index_of_polygon_point_is_in(p.lat, p.lon, polygons)
            if index != -1:
                city_to_time_to_wroclaw[p.city] = index * polygon_step_time_min
            else:
                city_to_time_to_wroclaw[p.city] = None

    log.info(f"Writing {len(city_to_time_to_wroclaw)} results to {output} ...")
    write_csv(output, sorted([[k, v] for k, v in city_to_time_to_wroclaw.items()]))
    log.info("Done")

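# `_index_of_polygon_point_is_in` is a project-local helper; a minimal
# sketch of the idea, assuming `polygons` holds shapely geometries ordered
# from nearest to farthest isochrone ring (an assumption, for illustration):
from shapely.geometry import Point  # assumed dependency

def _index_of_polygon_point_is_in(lat: float, lon: float, polygons) -> int:
    point = Point(lon, lat)  # shapely expects (x, y) = (lon, lat)
    for i, polygon in enumerate(polygons):
        if polygon.contains(point):
            return i
    return -1  # the point lies outside every isochrone ring
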
def logreg():
    # Takes ~46 minutes to finish
    print("Training Logistic Regression")
    clf = LogisticRegression()
    clf.fit(trX, trY)
    print("Training Accuracy:", clf.score(trX, trY))
    tsP = clf.predict(tsX)
    write_csv("logreg.csv", tsP)

def pca_svm_linear():
    print(svm_pipeline)
    print("\nFitting PCA (50) + SVM (Linear)")
    svm_pipeline.fit(trX, trY)
    print("Training Accuracy: ", svm_pipeline.score(trX, trY))
    tsP = svm_pipeline.predict(tsX)
    write_csv("pca_50_svm_linear.csv", tsP)

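# `svm_pipeline` is defined elsewhere in this module; judging from the
# "PCA (50) + SVM (Linear)" default used here and the `pca__n_components` /
# `clf__kernel` parameters set in pca_svm_rbf() further down, it is
# presumably something like this sklearn pipeline (an assumed sketch, not
# the author's exact definition):
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

svm_pipeline = Pipeline([
    ("pca", PCA(n_components=50)),  # step name "pca"
    ("clf", SVC(kernel="linear")),  # step name "clf"
])
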
def main():
    cfg = load_config()

    # Select device to use
    if 'device' in cfg:
        dev_id = find_device_id(cfg['device'])
        if dev_id is not None:
            sd.default.device = dev_id
    if 'sr' in cfg:
        sr = cfg['sr']
        sd.default.samplerate = sr
    else:
        sr = sd.default.samplerate

    # Read some settings
    fft_cfg = cfg.get('fft', {})
    fft_len = fft_cfg.get('fft_len', 2**12)
    f = fft_cfg.get('freq', 1000)
    repeats = fft_cfg.get('repeats', 1)

    # Get FFT frequencies for the used parameters
    freqs = fft_freqs(fft_len, sr)
    # Select the nearest one
    f_ind = np.argmin(np.abs(freqs - f))

    ams = []
    for r in range(repeats):
        print('round {}/{}'.format(r + 1, repeats))
        # Run the test
        fr, am = test_fft(freqs[f_ind], sr, fft_len=fft_len)
        # Normalize
        # am /= am[f_ind]
        ams.append(am)
    am = np.mean(np.array(ams), axis=0)

    # Save results
    if 'plot_filename' in fft_cfg:
        plot_frequency_response(fr, am, fft_cfg['plot_filename'])
    if 'csv_filename' in fft_cfg:
        write_csv(fft_cfg['csv_filename'], fr, am)

    # Compute THD
    thd_pct = thd(am, f_ind)
    thd_db = 20 * np.log10(thd_pct)
    print('thd: {:.2f} dB'.format(thd_db))

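# `fft_freqs` is a project-local helper; a plausible one-line sketch,
# assuming it returns the bin frequencies of a real-input FFT of length
# `fft_len` at sample rate `sr` (an assumption, for illustration only):
def fft_freqs(fft_len, sr):
    # rfftfreq takes the window length and the sample spacing in seconds
    return np.fft.rfftfreq(fft_len, d=1.0 / sr)
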
def pca_svm_rbf():
    svm_pipeline.set_params(pca__n_components=250)
    svm_pipeline.set_params(clf__kernel="rbf")
    print(svm_pipeline)
    print("\nFitting PCA (250) + SVM (RBF)")
    svm_pipeline.fit(trX, trY)
    print("Training Accuracy: ", svm_pipeline.score(trX, trY))
    tsP = svm_pipeline.predict(tsX)
    write_csv("pca_250_svm_rbf.csv", tsP)

def main():
    cfg = load_config()

    # Select device to use
    if 'device' in cfg:
        dev_id = find_device_id(cfg['device'])
        if dev_id is not None:
            sd.default.device = dev_id
    if 'sr' in cfg:
        sr = cfg['sr']
        sd.default.samplerate = sr
    else:
        sr = sd.default.samplerate

    # Get reference amplitude for normalization
    nf = normalizing_factor(cfg, sr)
    print('using normalizing factor {}'.format(nf))

    sweep_cfg = cfg.get('sweep', {})
    f0 = sweep_cfg.get('f0', 10)
    f1 = sweep_cfg.get('f1', 10000)
    pid = sweep_cfg.get('points_in_decade', 5)
    repeats = sweep_cfg.get('repeats', 1)

    freqs = generate_frequency_range(f0, f1, pid)

    ams = []
    for r in range(repeats):
        ampls = []
        print('round {}/{}'.format(r + 1, repeats))
        for f in tqdm(freqs):
            amplitude, rms = test_frequency(f, sr, cfg)
            ampls.append(amplitude)
        ampls = np.array(ampls) / nf
        ams.append(ampls)
    ampls = np.mean(np.array(ams), axis=0)

    if 'plot_filename' in sweep_cfg:
        plot_frequency_response(freqs, ampls, sweep_cfg['plot_filename'])
    if 'csv_filename' in sweep_cfg:
        write_csv(sweep_cfg['csv_filename'], freqs, ampls)

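# `generate_frequency_range` is a project-local helper; a sketch of the
# usual log-spaced sweep, assuming `points_in_decade` sets the density
# (an assumed implementation, for illustration only):
def generate_frequency_range(f0, f1, points_in_decade):
    decades = np.log10(f1 / f0)
    num_points = int(round(decades * points_in_decade)) + 1
    # Log-spaced frequencies from f0 to f1 inclusive
    return np.logspace(np.log10(f0), np.log10(f1), num_points)
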
def simple_run(split=True):
    # Network parameters
    channels = 64
    kernel_size = 5
    hidden_size = 512

    # Training parameters
    max_epochs = 50
    batch_size = 512
    learning_rate = 0.001

    # Data
    if split:
        trX_, tvX_, trY_, tvY_ = train_test_split(trX, trYi, test_size=0.3)
        trD = DataLoader(Sketches(trX_, trY_), batch_size, shuffle=True)
        tvD = DataLoader(Sketches(tvX_, tvY_), batch_size, shuffle=False)
    else:
        trD = DataLoader(Sketches(trX, trYi), batch_size, shuffle=True)
        tvD = None

    # Build the network
    net = ConvNet(channels, kernel_size, hidden_size)
    print(
        "\n",
        "Hyperparameters:",
        "max_epochs: ", max_epochs,
        "learning_rate: ", learning_rate,
        "batch_size: ", batch_size,
        "\n",
    )
    print(net)

    # Train it
    train(net, trD, tvD, max_epochs, learning_rate)

    # Turn shuffle off when computing predictions
    tsD = DataLoader(Sketches(tsX), batch_size, shuffle=False)
    tsP = classes[predict(net, tsD)]
    write_csv("conv_net.csv", tsP)

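# `Sketches` is a project-local torch Dataset; a minimal sketch, assuming
# it wraps a feature array with optional labels, as the unlabeled
# Sketches(tsX) call above suggests (an assumed implementation, for
# illustration only):
import torch
from torch.utils.data import Dataset

class Sketches(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.as_tensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]  # unlabeled (test) data
        return self.X[idx], self.y[idx]
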
def simple_run(split=True):
    # Hyper Parameters
    hidden_size = 1000
    max_epochs = 30
    learning_rate = 0.0005
    batch_size = 100

    # Data
    if split:
        trX_, tvX_, trY_, tvY_ = train_test_split(trX, trYi, test_size=0.3)
        trD = DataLoader(Sketches(trX_, trY_), batch_size, shuffle=True)
        tvD = DataLoader(Sketches(tvX_, tvY_), batch_size, shuffle=False)
    else:
        trD = DataLoader(Sketches(trX, trYi), batch_size, shuffle=True)
        tvD = None

    # Build the network
    net = Net(hidden_size)
    print(
        "\n",
        "Hyperparameters:",
        "hidden_size: ", hidden_size,
        "max_epochs: ", max_epochs,
        "learning_rate: ", learning_rate,
        "batch_size: ", batch_size,
        "\n",
    )
    print(net)

    # Train it
    train(net, trD, tvD, max_epochs, learning_rate)

    # Turn shuffle off when computing predictions
    tsD = DataLoader(Sketches(tsX), batch_size, shuffle=False)
    tsP = classes[predict(net, tsD)]
    write_csv("neural_net_%d.csv" % hidden_size, tsP)

def part_a(max_iter=300):
    print()
    print("Training Kmeans (max_iter=%d)" % max_iter)
    kmeans = KMeans(n_init=10, n_clusters=20, max_iter=max_iter, random_state=0).fit(trX)
    labels = cluster_labels(kmeans, trY)

    # Find training accuracy
    trP = labels[kmeans.predict(trX)]
    print("Training Accuracy: ", 100 * accuracy(trY, trP))

    # Dump test labels
    # Test accuracy can only be calculated by uploading to Kaggle
    tsP = labels[kmeans.predict(tsX)]
    write_csv("kmeans_%d.csv" % max_iter, tsP)

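# `cluster_labels` is a project-local helper; a minimal sketch, assuming it
# assigns each cluster the majority true label among its members (an assumed
# implementation, for illustration; it also assumes no cluster is empty):
def cluster_labels(kmeans, y_true):
    y_true = np.asarray(y_true)
    labels = []
    for k in range(kmeans.n_clusters):
        members = y_true[kmeans.labels_ == k]
        # Most frequent true label among the points assigned to cluster k
        values, counts = np.unique(members, return_counts=True)
        labels.append(values[np.argmax(counts)])
    return np.asarray(labels)
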
def generate_synthetic_sent_pair(data_dir="data/"):
    vocab_size = 100
    max_source_len = 30
    max_target_len = 30
    num_classes = 4
    train_size = 2000
    test_size = 200

    train_dataset = generate_synthetic_sent_pair_dataset(
        train_size, num_classes, max_source_len, max_target_len, vocab_size)
    test_dataset = generate_synthetic_sent_pair_dataset(
        test_size, num_classes, max_source_len, max_target_len, vocab_size)

    common.write_csv(train_dataset, None,
                     os.path.join(data_dir, "synthetic.train"), delimiter="\t")
    common.write_csv(test_dataset, None,
                     os.path.join(data_dir, "synthetic.test"), delimiter="\t")

def delete_error_details():
    ret = []
    with open(os.path.join(DATA_DIR, "output", "edu_details.csv")) as f:
        reader = csv.reader(f)
        for idx, row in enumerate(reader):
            if idx == 0:  # skip header row
                continue
            category, year, title, context = row
            # Keep only rows whose title does not mark a failed fetch
            if "Error" not in title:
                ret.append(
                    {
                        "year": year,
                        "title": title,
                        "category": category,
                        "context": context,
                    }
                )
    write_csv(ret, "output", "edu_details.csv")

def train_keras_cnn(arch_name="keras_alexnet"):
    batch_size = 128
    epochs = 10

    x_train, y_train, x_val, y_val, x_test = keras_load_data_split(trX, trYi, tsX)
    # x_train, y_train, x_test = keras_load_data(trX, trYi, tsX)

    # Look up the architecture-builder function by name
    arch = globals()[arch_name]
    net = arch()
    net.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=keras.optimizers.Adam(),
        # optimizer=keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True),
        metrics=['accuracy'])

    callbacks_list = [
        keras.callbacks.EarlyStopping(monitor='val_acc',
                                      min_delta=0.001,
                                      patience=4,
                                      verbose=1,
                                      mode='auto')
    ]

    net.fit(x_train, y_train,
            verbose=2,
            batch_size=batch_size,
            initial_epoch=0,
            epochs=epochs,
            callbacks=callbacks_list,
            validation_data=(x_val, y_val))

    tsP = classes[net.predict_classes(x_test)]
    write_csv("keras_vgg_13_cnn.csv", tsP)

# "pca_50_svm_linear.csv": 0.69345, # "xgb.csv": 0.62302, } def soft_file_vote(files): # Go over lines in all files at once file_objects = (open("output/" + f) for f in files) weights = files.values() for idx, lines in enumerate(zip(*file_objects)): if not idx: # Skip header row continue labels = map(lambda l: l.strip().split(",")[1], lines) c = Counter() for lbl, wt in zip(labels, weights): c.update({lbl: wt}) label, _ = c.most_common(1)[0] yield label # def correlations(): if __name__ == '__main__': labels = list(soft_file_vote(files)) write_csv(sys.argv[1], labels)
    driver.close()
    return {
        "year": year,
        "title": title,
        "category": CATEGORY,
        "context": context,
    }


def get_links():
    ret = []
    with open(os.path.join(DATA_DIR, "output", "edu_in_news_list.csv")) as f:
        reader = csv.reader(f)
        for idx, row in enumerate(reader):
            if idx == 0:  # skip header row
                continue
            ret.append([idx, *row])  # year, link = row
    return ret


if __name__ == "__main__":
    links = get_links()
    with multiprocessing.Pool(processes=8) as pool:
        data = pool.map(detail, links)
    # ret = []
    # for item in data:
    #     for record in item:
    #         ret.append(record)
    write_csv(data, "output", "edu_in_news.csv")

    ret = []
    cache = {"current_year": 2020}
    response = requests.get("http://webzine-serii.re.kr?s=미래+교육")
    soup = BeautifulSoup(response.text, "html.parser")
    pages = soup.select(".mnmd-pagination__item")
    maximum = 0
    articles = soup.select("h3.post__title.typescale-2")
    for article in articles:
        ret.append(get_detail(article, cache))
    # print(detail_soup)
    for page in pages:
        try:
            p = int(page.text)
            maximum = max(p, maximum)
        except ValueError:
            pass
    for page in range(1, maximum):
        print(f"now {page+1}")
        url = f"http://webzine-serii.re.kr/page/{page+1}/?s=미래+교육"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.select("h3.post__title.typescale-2")
        for article in articles:
            ret.append(get_detail(article, cache))
    return ret


if __name__ == "__main__":
    data = get_all_list()
    write_csv(data, "output", "seoul_edu.csv")

def save(self, csv_cache):
    with self._cache_lock:
        # Serialize every non-empty cache entry, sorted by the first column
        rows = [r.to_csv_row() for r in self.cache.values() if r]
        rows.sort(key=lambda r: r[0])
        write_csv(csv_cache, rows)

"category": CATEGORY, } ) return ret def change_column(): ret = [] with open(os.path.join(DATA_DIR, "output", "edu_blog.csv")) as f: reader = csv.reader(f) for idx, row in enumerate(reader): if idx == 0: continue title, year, context, category = row ret.append( { "year": year, "title": title, "category": category, "context": context, } ) return ret if __name__ == "__main__": data = change_column() # data = get_all_list() write_csv(data, "output", "edu_blog.csv")
def write_twitter_data(data):
    file_name = generate_file_name(extension="csv")
    file_path = f"{get_current_folder_path()}/{WRITING_FILES}/{file_name}"
    write_csv(data, file_path, ("region", "tweet"))
    return file_name

csv.field_size_limit(sys.maxsize)


def merge():
    ret = []
    files = os.listdir(os.path.join(DATA_DIR, "output"))
    idx = 1
    for file in files:
        with open(os.path.join(DATA_DIR, "output", file)) as f:
            reader = csv.reader(f)
            for i, row in enumerate(reader):
                if i == 0:  # skip header row
                    continue
                year, title, category, context = row
                ret.append({
                    "index": idx,
                    "year": year,
                    "title": title,
                    "category": category,
                    "context": context,
                })
                idx += 1
    print(ret)
    return ret


if __name__ == "__main__":
    data = merge()
    write_csv(data, "output", "results.csv")

result_path = os.path.join(DATA_DIR, "output", "results.csv")
with open(result_path) as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        if i == 0:  # skip header row
            continue
        index, year, title, category, context = row
        # index = index.replace('"', "")
        # year = f"{year}"
        # title = title.replace('"', "")
        # category = category.replace('"', "")
        # Normalize invisible whitespace to plain spaces (the original
        # literals were unreadable; NBSP and zero-width space are assumed)
        context = context.replace("\xa0", " ").replace("\u200b", " ")
        temp = {
            "index": index,
            "year": year,
            "title": title,
            "category": category,
            "context": context,
        }
        # Make sure every column value is wrapped in double quotes
        for column in columns:
            if not temp[column].startswith('"'):
                temp[column] = f'"{temp[column]}'
            if not temp[column].endswith('"'):
                temp[column] = f'{temp[column]}"'
        ret.append(temp)
# print(ret[0])
write_csv(ret, "output", "results.csv")