Example #1
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction import DictVectorizer

# new helpers:
from shared import dataset_local_path, bootstrap_accuracy, simple_boxplot, TODO

# stdlib:
from dataclasses import dataclass
import json
from typing import Dict, Any, List

#%% load up the data
examples = []
ys = []

with open(dataset_local_path("poetry_id.jsonl")) as fp:
    for line in fp:
        info = json.loads(line)
        # Note: the data contains a whole bunch of extra stuff; we just want numeric features for now.
        keep = info["features"]
        # whether or not it's poetry is our label.
        ys.append(info["poetry"])
        # hold onto this single dictionary.
        examples.append(keep)

## CONVERT TO MATRIX:

feature_numbering = DictVectorizer(sort=True)
X = feature_numbering.fit_transform(examples)

print("Features as {} matrix.".format(X.shape))
Example #2
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import typing as T
import re
from dataclasses import dataclass

from shared import bootstrap_accuracy, bootstrap_auc, dataset_local_path, simple_boxplot

RAND = 123456
random.seed(RAND)

# Using 'pandas' to load data now:
df: pd.DataFrame = pd.read_json(dataset_local_path("lit-wiki-2020.jsonl.gz"),
                                lines=True)

# Regular expressions to grab parts of the text:
WORDS = re.compile(r"(\w+)")
NUMBERS = re.compile(r"(\d+)")


def extract_features(row):
    """
    Given the title and body of a Wikipedia article,
    extract features that might be of use to the 'is literary' task.

    Return named features in a dictionary.
    """
    title = row["title"].lower()
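    # The snippet is truncated here; a hypothetical continuation (assumed,
    # not from the source) might count matches of the regexes defined above.
    # The exact feature names are illustrative guesses.
    body = row["body"].lower()
    return {
        "title_len": len(title),
        "word_count": len(WORDS.findall(body)),
        "number_count": len(NUMBERS.findall(body)),
    }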
Example #3
from shared import (
    dataset_local_path,
    simple_boxplot,
)

# stdlib:
from dataclasses import dataclass, field
import json, gzip
from typing import Dict, List

#%% load up the data
# Try 'POETRY'
dataset = "WIKI"
examples: List[str] = []
ys: List[bool] = []

if dataset == "WIKI":
    with gzip.open(dataset_local_path("lit-wiki-2020.jsonl.gz"), "rt") as fp:
        for line in fp:
            info = json.loads(line)
            # Note: the data contains a whole bunch of extra stuff; we just want the text body for now.
            keep = info["body"]
            # whether or not it's literary is our label.
            ys.append(info["truth_value"])
            # hold onto this single document body.
            examples.append(keep)
else:
    # take only one per book!
    by_book = {}
    with open(dataset_local_path("poetry_id.jsonl")) as fp:
        for line in fp:
            info = json.loads(line)
            # dictionary keeps this key unique:
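            # A hypothetical continuation (assumed, not from the source);
            # the "book" key name is a guess at the per-book identifier.
            by_book[info["book"]] = info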
Example #4
from shared import dataset_local_path
import pandas as pd
import numpy as np
from typing import Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
import os, gzip
from tqdm import tqdm

clickbait = pd.read_csv(dataset_local_path("clickbait.csv.gz"))

glove = {}
with gzip.open(os.environ["HOME"] + "/data/glove.6B.50d.txt.gz", "rt") as vecf:
    for line in tqdm(vecf, total=400000):
        split = line.index(" ")
        word = line[:split]
        vector = np.fromstring(line[split + 1:], dtype=np.float32, sep=" ")
        glove[word] = vector
        if word == "the":
            print(word, vector)

print(clickbait.head())
# skip citation

df = clickbait.iloc[1:]

RANDOM_SEED = 12345

tv_f, test_f = train_test_split(df, test_size=0.25, random_state=RANDOM_SEED)
train_f, vali_f = train_test_split(tv_f, test_size=0.25, random_state=RANDOM_SEED)
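
# A minimal sketch (assumed, not from the source) of one common next step:
# represent each headline as the mean of its GloVe word vectors.
def glove_mean(text: str) -> np.ndarray:
    vecs = [glove[w] for w in text.lower().split() if w in glove]
    if not vecs:
        # no known words: fall back to a zero vector (50d per the filename)
        return np.zeros(50, dtype=np.float32)
    return np.mean(vecs, axis=0)

# e.g. (the "headline" column name is a guess):
# train_X = np.stack(train_f["headline"].map(glove_mean).to_list())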
Example #5
#%%
import pandas as pd
import numpy as np
import typing as T
import re
from tqdm import tqdm
from dataclasses import dataclass

from shared import bootstrap_auc, dataset_local_path, simple_boxplot

df: pd.DataFrame = pd.read_json(
    dataset_local_path("lit-wiki-2020.jsonl.gz"), lines=True
)

# Debug loading:
# df.head()


# Regular expressions to grab parts of the text:
WORDS = re.compile(r"(\w+)")
NUMBERS = re.compile(r"(\d+)")


def extract_features(row):
    """
    Given the title and body of a Wikipedia article,
    extract features that might be of use to the 'is literary' task.

    Return named features in a dictionary.
    """
Example #6
"""
Problem 1: We have a copy of Wikipedia (I spared you the other 6 million pages).
It is separate from our labels we collected.
"""


@dataclass
class JustWikiPage:
    title: str
    wiki_id: str
    body: str


# Load our pages into this pages list.
pages: List[JustWikiPage] = []
with gzip.open(dataset_local_path("tiny-wiki.jsonl.gz"), "rt") as fp:
    for line in fp:
        entry = json.loads(line)
        pages.append(JustWikiPage(**entry))


@dataclass
class JustWikiLabel:
    wiki_id: str
    is_literary: bool


# Load our judgments/labels/truths/ys into this labels list:
labels: List[JustWikiLabel] = []
with open(dataset_local_path("tiny-wiki-labels.jsonl")) as fp:
    for line in fp:
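        # A hypothetical continuation (assumed, not from the source),
        # mirroring the pages loop above:
        labels.append(JustWikiLabel(**json.loads(line)))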
Example #7
from shared import (
    dataset_local_path,
    bootstrap_r2,
    simple_boxplot,
)
import random
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# start off by seeding random number generators:
RANDOM_SEED = 12345
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Load the AirQualityUCI Dataset:
df = pd.read_csv(dataset_local_path("AirQualityUCI.csv"), sep=";", decimal=",")
print(df.shape)
# drop empty columns:
df = df.dropna(how="all", axis="columns")
print(df.shape)

PREDICT_COL = "CO(GT)"

# select only the rows where our 'y' is present:
df = df[df[PREDICT_COL] > -200.0]
print(df.shape)

# delete Date/Time columns
df.pop("Date")
df.pop("Time")
Example #8
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from scipy.spatial.distance import euclidean
from typing import List, Tuple
from tqdm import tqdm

import csv

from shared import dataset_local_path

ys = []
examples = []

with open(dataset_local_path("AirQualityUCI.csv")) as fp:
    # This is a CSV file where the separators are not commas!
    rows = csv.reader(fp, delimiter=";")
    header = next(rows)
    for row in rows:
        datapoint = {}
        # {'Date': '10/03/2004', 'Time': '18.00.00',
        #  'CO(GT)': '2,6', 'PT08.S1(CO)': '1360', 'NMHC(GT)': '150', 'C6H6(GT)': '11,9',
        #  'PT08.S2(NMHC)': '1046', 'NOx(GT)': '166', 'PT08.S3(NOx)': '1056',
        #  'NO2(GT)': '113', 'PT08.S4(NO2)': '1692', 'PT08.S5(O3)': '1268',
        #  'T': '13,6', 'RH': '48,9', 'AH': '0,7578', '': ''}
        date = None
        time = None
        for (column_name, column_value) in zip(header, row):
            if column_value == "" or column_name == "":
                continue
            elif column_name == "Date":
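                # A hypothetical continuation (assumed, not from the source):
                # stash Date/Time aside and parse the comma-decimal numbers
                # (e.g. '2,6') shown in the sample row above as floats.
                date = column_value
            elif column_name == "Time":
                time = column_value
            else:
                datapoint[column_name] = float(column_value.replace(",", "."))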
Example #9
import numpy as np
from shared import dataset_local_path
from typing import Tuple, Dict
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# fixed seed for reproducible train/test splits:
RANDOM_SEED = 12345

df: pd.DataFrame = pd.read_json(dataset_local_path("poetry_id.jsonl"),
                                lines=True)

features = pd.json_normalize(df.features)
features = features.join([df.poetry, df.words])

tv_f, test_f = train_test_split(features,
                                test_size=0.25,
                                random_state=RANDOM_SEED)
train_f, vali_f = train_test_split(tv_f,
                                   test_size=0.25,
                                   random_state=RANDOM_SEED)

textual = TfidfVectorizer(max_df=0.75, min_df=2, dtype=np.float32)
numeric = make_pipeline(DictVectorizer(sparse=False), StandardScaler())
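
# A minimal sketch (assumed, not from the source) of fitting the two feature
# spaces defined above on the training split; the "words" column is assumed
# to hold one string of text per row.
y_train = train_f.pop("poetry").values
text_X = textual.fit_transform(train_f.pop("words"))
num_X = numeric.fit_transform(train_f.to_dict("records"))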