Example #1
def merge_csvs():
    keris_data_dir = os.path.join(DATA_DIR, "export_pdfs")
    ret = []
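    # Walk every CSV exported from the PDFs, drop the header rows, and collect cleaned records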
    for filename in os.listdir(keris_data_dir):
        with open(os.path.join(keris_data_dir, filename)) as f:
            reader = csv.reader(f)
            try:
                for i, row in enumerate(reader):
                    if i == 0:
                        continue
                    year, title, category, context = row
                    context = context.replace("\n", " ")
                    context = re.sub(r"\(cid:\d{1,10}\)", "", context)
                    ret.append(
                        {
                            "year": year,
                            "title": title,
                            "category": category,
                            "context": context,
                        }
                    )
            except _csv.Error:
                # Skip files whose rows the csv module cannot parse (e.g. oversized fields)
                pass
    write_csv(ret, "output", "keris.csv")
Example #2
def xgb():
    print("Training an XGB Classifier")

    params = {
        "max_depth": 8,
        "n_estimators": 400,
        "learning_rate": 0.05,
        # "nthread" is the deprecated alias of n_jobs, so only n_jobs is kept
        "n_jobs": -1,
        "subsample": 0.8,
    }

    trX_, tvX_, trY_, tvY_ = train_test_split(trX, trYi, test_size=0.3)
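    # The 30% hold-out split is used only as an eval_set to monitor training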

    gbm = XGBClassifier(**params)
    print(gbm.get_xgb_params())

    gbm.fit(trX_, trY_, eval_set=[(tvX_, tvY_)], verbose=True)

    # Find training accuracy
    trP = classes[gbm.predict(trX)]
    print("Training Accuracy: ", 100 * accuracy(trY, trP))

    # Dump test labels
    tsP = classes[gbm.predict(tsX)]
    write_csv("xgb_d5_n150.csv", tsP)
Example #3
def get_pdf_files(filename):
    output_string = StringIO()
    print(f"{filename} read start")
    p = re.compile(FORMAT_STRING)
    remain = p.split(filename)
    date = p.search(filename).group()
    title = remain[1].split(".pdf")[0]
    title = title.replace(" ", "", 1).replace("_", "", 1)
    # Extract the page text with pdfminer
    with open(filename, "rb") as file:
        parser = PDFParser(file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    text = output_string.getvalue()

    # Strip "(cid:N)" artifacts left behind by fonts pdfminer cannot decode
    text = re.sub(r"\(cid:\d{1,4}\)", "", text)
    splited = filename.split("/")
    write_csv(
        [
            {
                "year": date.split("-")[0],
                "title": title,
                "category": CATEGORY,
                "context": text,
            }
        ],
        "export_pdfs",
        f"{splited[len(splited) - 1]}.csv",
    )
Example #4
def _export_query(sqlite_db: str,
                  query: str,
                  output_csv: str,
                  header_table: Optional[str] = None):
    log = getLogger()
    db_conn: sqlite3.Connection = sqlite3.connect(sqlite_db)

    col_names = []
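    # When a header table is given, its column names become the first row of the CSV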
    try:
        if header_table:
            col_names = [_get_col_names(db_conn, header_table)]
    except (OSError, sqlite3.DatabaseError) as e:
        log.error(
            f"Could not dump table column names from DB {sqlite_db}/{header_table}: {e}"
        )

    cursor = db_conn.cursor()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
        col_names.extend(rows)
        write_csv(csv_file=output_csv, rows=col_names)
    except (OSError, sqlite3.DatabaseError) as e:
        log.error(f"Could not dump query {query} from DB {sqlite_db}: {e}")
    finally:
        # Close the cursor and connection whether or not the export succeeded
        cursor.close()
        db_conn.close()
Example #5
def main(isochrone: str,
         place_cache: str,
         output: str,
         polygon_step_time_min: int = 7):
    setup_log()
    log = getLogger()

    log.info(f"Reading isochrone map from {isochrone} ...")
    with codecs.open(isochrone, 'r', 'utf-8-sig') as map_:
        isochrone_map = json.load(map_)
    polygons = _build_polygons(isochrone_map)

    log.info(f"Reading places cache from {place_cache} ...")
    places = filter(None, map(Place.from_csv_row, read_csv(place_cache)))
    city_to_time_to_wroclaw: Dict[str, Optional[int]] = {}

    log.info(f"Finding time to reach destination for places...")
    for p in places:
        if p not in city_to_time_to_wroclaw.keys():
            index = _index_of_polygon_point_is_in(p.lat, p.lon, polygons)
            if index != -1:
                time_to_wroclaw_min = index * polygon_step_time_min
                city_to_time_to_wroclaw[p.city] = time_to_wroclaw_min
            else:
                city_to_time_to_wroclaw[p.city] = None

    log.info(f"Writing {len(city_to_time_to_wroclaw)} results to {output} ...")
    write_csv(output,
              sorted([[k, v] for k, v in city_to_time_to_wroclaw.items()]))
    log.info("Done")
Example #6
def logreg():
    # Takes ~46 minutes to finish
    print("Training Logistic Regression")
    clf = LogisticRegression()
    clf.fit(trX, trY)

    print("Training Accuracy:", clf.score(trX, trY))

    tsP = clf.predict(tsX)
    write_csv("logreg.csv", tsP)
Example #7
def pca_svm_linear():

    print(svm_pipeline)

    print("\nFitting PCA (50) + SVM (Linear)")
    svm_pipeline.fit(trX, trY)

    print("Training Accuracy: ", svm_pipeline.score(trX, trY))

    tsP = svm_pipeline.predict(tsX)
    write_csv("pca_50_svm_linear.csv", tsP)
Example #8
def main():
    cfg = load_config()

    # select device to use
    if 'device' in cfg:
        dev_id = find_device_id(cfg['device'])
        if dev_id is not None:
            sd.default.device = dev_id

    if 'sr' in cfg:
        sr = cfg['sr']
        sd.default.samplerate = sr
    else:
        sr = sd.default.samplerate

    # read some settings
    fft_cfg = cfg.get('fft', {})

    fft_len = fft_cfg.get('fft_len', 2**12)
    f = fft_cfg.get('freq', 1000)
    repeats = fft_cfg.get('repeats', 1)

    # get fft frequencies for the used parameters
    freqs = fft_freqs(fft_len, sr)

    # select the nearest one
    f_ind = np.argmin(np.abs(freqs - f))

    ams = []

    for r in range(repeats):
        print('round {}/{}'.format(r + 1, repeats))

        # run the test
        fr, am = test_fft(freqs[f_ind], sr, fft_len=fft_len)

        # normalize
        #am /= am[f_ind]

        ams.append(am)

    am = np.mean(np.array(ams), axis=0)

    # save results
    if 'plot_filename' in fft_cfg:
        plot_frequency_response(fr, am, fft_cfg['plot_filename'])

    if 'csv_filename' in fft_cfg:
        write_csv(fft_cfg['csv_filename'], fr, am)

    # compute thd
    thd_pct = thd(am, f_ind)
    thd_db = 20 * np.log10(thd_pct)
    print('thd: {:.2f} dB'.format(thd_db))
Example #9
def pca_svm_rbf():

    svm_pipeline.set_params(pca__n_components=250)
    svm_pipeline.set_params(clf__kernel="rbf")

    print(svm_pipeline)

    print("\nFitting PCA (250) + SVM (RBF)")
    svm_pipeline.fit(trX, trY)

    print("Training Accuracy: ", svm_pipeline.score(trX, trY))

    tsP = svm_pipeline.predict(tsX)
    write_csv("pca_250_svm_rbf.csv", tsP)
Example #10
def main():
    cfg = load_config()

    # select device to use
    if 'device' in cfg:
        dev_id = find_device_id(cfg['device'])
        if dev_id is not None:
            sd.default.device = dev_id

    if 'sr' in cfg:
        sr = cfg['sr']
        sd.default.samplerate = sr
    else:
        sr = sd.default.samplerate

    # get reference amplitude for normalization
    nf = normalizing_factor(cfg, sr)

    # debug prints
    print('using normalizing factor {}'.format(nf))

    sweep_cfg = cfg.get('sweep', {})
    f0 = sweep_cfg.get('f0', 10)
    f1 = sweep_cfg.get('f1', 10000)
    pid = sweep_cfg.get('points_in_decade', 5)
    repeats = sweep_cfg.get('repeats', 1)

    freqs = generate_frequency_range(f0, f1, pid)
    ams = []

    for r in range(repeats):
        ampls = []
        print('round {}/{}'.format(r + 1, repeats))

        for f in tqdm(freqs):
            amplitude, rms = test_frequency(f, sr, cfg)
            ampls.append(amplitude)

        ampls = np.array(ampls) / nf

        ams.append(ampls)

    ampls = np.mean(np.array(ams), axis=0)

    if 'plot_filename' in sweep_cfg:
        plot_frequency_response(freqs, ampls, sweep_cfg['plot_filename'])

    if 'csv_filename' in sweep_cfg:
        write_csv(sweep_cfg['csv_filename'], freqs, ampls)
Example #11
def simple_run(split=True):

    # Network parameters
    channels = 64
    kernel_size = 5
    hidden_size = 512

    # Training parameters
    max_epochs = 50
    batch_size = 512
    learning_rate = 0.001

    # Data
    if split:
        trX_, tvX_, trY_, tvY_ = train_test_split(trX, trYi, test_size=0.3)
        trD = DataLoader(Sketches(trX_, trY_), batch_size, shuffle=True)
        tvD = DataLoader(Sketches(tvX_, tvY_), batch_size, shuffle=False)
    else:
        trD = DataLoader(Sketches(trX, trYi), batch_size, shuffle=True)
        tvD = None

    # Build the network
    net = ConvNet(channels, kernel_size, hidden_size)

    print(
        "\n",
        "Hyperparameters:",
        "max_epochs: ",
        max_epochs,
        "learning_rate: ",
        learning_rate,
        "batch_size: ",
        batch_size,
        "\n",
    )
    print(net)

    # Train it
    train(net, trD, tvD, max_epochs, learning_rate)

    # Turn shuffle off when computing predictions
    tsD = DataLoader(Sketches(tsX), batch_size, shuffle=False)
    tsP = classes[predict(net, tsD)]
    write_csv("conv_net.csv", tsP)
Example #12
def simple_run(split=True):

    # Hyper Parameters
    hidden_size = 1000
    max_epochs = 30
    learning_rate = 0.0005

    batch_size = 100

    # Data
    if split:
        trX_, tvX_, trY_, tvY_ = train_test_split(trX, trYi, test_size=0.3)
        trD = DataLoader(Sketches(trX_, trY_), batch_size, shuffle=True)
        tvD = DataLoader(Sketches(tvX_, tvY_), batch_size, shuffle=False)
    else:
        trD = DataLoader(Sketches(trX, trYi), batch_size, shuffle=True)
        tvD = None

    # Build the network
    net = Net(hidden_size)

    print(
        "\n",
        "Hyperparameters:",
        "hidden_size: ",
        hidden_size,
        "max_epochs: ",
        max_epochs,
        "learning_rate: ",
        learning_rate,
        "batch_size: ",
        batch_size,
        "\n",
    )

    print(net)

    # Train it
    train(net, trD, tvD, max_epochs, learning_rate)

    # Turn shuffle off when computing predictions
    tsD = DataLoader(Sketches(tsX), batch_size, shuffle=False)
    tsP = classes[predict(net, tsD)]
    write_csv("neural_net_%d.csv" % hidden_size, tsP)
Example #13
def part_a(max_iter=300):

    print()
    print("Training Kmeans (max_iter=%d)" % max_iter)

    kmeans = KMeans(n_init=10,
                    n_clusters=20,
                    max_iter=max_iter,
                    random_state=0).fit(trX)
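    # cluster_labels presumably maps each of the 20 clusters to its majority class in trY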
    labels = cluster_labels(kmeans, trY)

    # Find training accuracy
    trP = labels[kmeans.predict(trX)]
    print("Training Accuracy: ", 100 * accuracy(trY, trP))

    # Dump test labels
    # Test accuracy can only be calculated by uploading to Kaggle
    tsP = labels[kmeans.predict(tsX)]
    write_csv("kmeans_%d.csv" % max_iter, tsP)
Example #14
def generate_synthetic_sent_pair(data_dir="data/"):
    vocab_size = 100
    max_source_len = 30
    max_target_len = 30
    num_classes = 4
    train_size = 2000
    test_size = 200
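    # Generate small synthetic train/test sentence-pair datasets and dump them as tab-separated files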
    train_dataset = generate_synthetic_sent_pair_dataset(
        train_size, num_classes, max_source_len, max_target_len, vocab_size)
    test_dataset = generate_synthetic_sent_pair_dataset(
        test_size, num_classes, max_source_len, max_target_len, vocab_size)
    common.write_csv(train_dataset,
                     None,
                     os.path.join(data_dir, "synthetic.train"),
                     delimiter="\t")
    common.write_csv(test_dataset,
                     None,
                     os.path.join(data_dir, "synthetic.test"),
                     delimiter="\t")
Example #15
def delete_error_details():
    ret = []
    with open(os.path.join(DATA_DIR, "output", "edu_details.csv")) as f:
        reader = csv.reader(f)
        for idx, row in enumerate(reader):
            if idx == 0:
                continue
            category, year, title, context = row

            if "Error" not in title:
                ret.append(
                    {
                        "year": year,
                        "title": title,
                        "category": category,
                        "context": context,
                    }
                )
    write_csv(ret, "output", "edu_details.csv")
Example #16
def train_keras_cnn(arch_name="keras_alexnet"):

    batch_size = 128
    epochs = 10

    x_train, y_train, x_val, y_val, x_test = keras_load_data_split(
        trX, trYi, tsX)
    # x_train, y_train, x_test = keras_load_data(trX, trYi, tsX)

    arch = globals()[arch_name]

    net = arch()

    net.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=keras.optimizers.Adam(),
        # optimizer=keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True),
        metrics=['accuracy'])

    callbacks_list = [
        keras.callbacks.EarlyStopping(monitor='val_acc',
                                      min_delta=0.001,
                                      patience=4,
                                      verbose=1,
                                      mode='auto')
    ]

    net.fit(x_train,
            y_train,
            verbose=2,
            batch_size=batch_size,
            initial_epoch=0,
            epochs=epochs,
            callbacks=callbacks_list,
            validation_data=(x_val, y_val))

    tsP = classes[net.predict_classes(x_test)]
    write_csv("keras_vgg_13_cnn.csv", tsP)
Example #17
    # "pca_50_svm_linear.csv": 0.69345,
    # "xgb.csv": 0.62302,
}


def soft_file_vote(files):
    # Go over lines in all files at once
    file_objects = [open("output/" + f) for f in files]
    weights = files.values()
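    # files maps prediction csv name -> voting weight (per the commented leaderboard scores above)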

    for idx, lines in enumerate(zip(*file_objects)):
        if not idx:  # Skip header row
            continue

        labels = map(lambda l: l.strip().split(",")[1], lines)

        c = Counter()
        for lbl, wt in zip(labels, weights):
            c.update({lbl: wt})

        label, _ = c.most_common(1)[0]

        yield label

    # Close the prediction files once every line has been consumed
    for fo in file_objects:
        fo.close()


# def correlations():

if __name__ == '__main__':
    labels = list(soft_file_vote(files))
    write_csv(sys.argv[1], labels)
Example #18
    driver.close()
    return {
        "year": year,
        "title": title,
        "category": CATEGORY,
        "context": context,
    }


def get_links():
    ret = []
    with open(os.path.join(DATA_DIR, "output", "edu_in_news_list.csv")) as f:
        reader = csv.reader(f)
        for idx, row in enumerate(reader):
            if idx == 0:
                continue
            ret.append([idx, *row])
            # year, link = row
    return ret


if __name__ == "__main__":
    links = get_links()
    with multiprocessing.Pool(processes=8) as pool:
        data = pool.map(detail, links)
    # ret = []
    # for item in data:
    #     for record in item:
    #         ret.append(record)
    write_csv(data, "output", "edu_in_news.csv")
Example #19
    ret = []
    cache = {"current_year": 2020}
    response = requests.get("http://webzine-serii.re.kr?s=미래+교육")
    soup = BeautifulSoup(response.text, "html.parser")
    pages = soup.select(".mnmd-pagination__item")
    maximum = 0
    articles = soup.select("h3.post__title.typescale-2")
    for article in articles:
        ret.append(get_detail(article, cache))
        # print(detail_soup)
    for page in pages:
        try:
            p = int(page.text)
            maximum = max(p, maximum)
        except ValueError:
            pass
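    # Pages 2..maximum still need crawling; the first results page was scraped above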
    for page in range(1, maximum):
        print(f"now {page+1}")
        url = f"http://webzine-serii.re.kr/page/{page+1}/?s=미래+교육"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.select("h3.post__title.typescale-2")
        for article in articles:
            ret.append(get_detail(article, cache))
    return ret


if __name__ == "__main__":
    data = get_all_list()
    write_csv(data, "output", "seoul_edu.csv")
Example #20
def save(self, csv_cache):
    with self._cache_lock:
        row_gen = [r.to_csv_row() for r in self.cache.values() if r]
        row_gen.sort(key=lambda r: r[0])
        write_csv(csv_cache, row_gen)
Example #21
                "category": CATEGORY,
            }
        )

    return ret


def change_column():
    ret = []
    with open(os.path.join(DATA_DIR, "output", "edu_blog.csv")) as f:
        reader = csv.reader(f)
        for idx, row in enumerate(reader):
            if idx == 0:
                continue
            title, year, context, category = row
            ret.append(
                {
                    "year": year,
                    "title": title,
                    "category": category,
                    "context": context,
                }
            )
    return ret


if __name__ == "__main__":
    data = change_column()
    # data = get_all_list()
    write_csv(data, "output", "edu_blog.csv")
Example #22
def write_twitter_data(data):
    file_name = generate_file_name(extension="csv")
    file_path = f"{get_current_folder_path()}/{WRITING_FILES}/{file_name}"
    write_csv(data, file_path, ("region", "tweet"))
    return file_name
Example #23
csv.field_size_limit(sys.maxsize)


def merge():
    ret = []
    files = os.listdir(os.path.join(DATA_DIR, "output"))
    idx = 1
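    # Re-number rows with a single running index while merging every per-source CSV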
    for file in files:
        with open(os.path.join(DATA_DIR, "output", file)) as f:
            reader = csv.reader(f)
            for i, row in enumerate(reader):
                if i == 0:
                    continue
                year, title, category, context = row
                ret.append({
                    "index": idx,
                    "year": year,
                    "title": title,
                    "category": category,
                    "context": context,
                })
                idx += 1
    print(ret)
    return ret


if __name__ == "__main__":
    data = merge()
    write_csv(data, "output", "results.csv")
Example #24
    result_path = os.path.join(DATA_DIR, "output", "results.csv")
    with open(result_path) as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            if i == 0:
                continue

            index, year, title, category, context = row
            # index = index.replace('"', "")
            # year = f"{year}"
            # title = title.replace('"', "")
            # category = category.replace('"', "")
            # Normalize non-breaking and zero-width spaces to plain spaces
            context = context.replace("\u00a0", " ").replace("\u200b", " ")
            temp = {
                "index": index,
                "year": year,
                "title": title,
                "category": category,
                "context": context,
            }
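            # Make sure each exported field is wrapped in double quotes exactly once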
            for column in columns:
                if not temp[column].startswith('"'):
                    temp[column] = f'"{temp[column]}'
                if not temp[column].endswith('"'):
                    temp[column] = f'{temp[column]}"'

            ret.append(temp)

    # print(ret[0])
    write_csv(ret, "output", "results.csv")