Example #1
def autotune(input_train, input_validation, output_model, output_parameters,
             metric, k, duration, model_size):
    input_train_path = get_input_path(input_train)
    input_validation_path = get_input_path(input_validation)
    output_model_path = get_output_path(output_model)
    output_parameters_path = get_output_path(output_parameters)

    # Autotune model
    model = fasttext.train_supervised(
        input=input_train_path,
        autotuneValidationFile=input_validation_path,
        autotuneMetric=metric,
        autotuneDuration=duration,
        autotuneModelSize=model_size,
        verbose=VERBOSE)

    # Log best model metrics
    n, p, r = model.test(input_validation_path, k=k)
    print(json.dumps({'n': n, 'precision': p, 'recall': r, 'k': k}))

    # Save best parameters
    with open(output_parameters_path, 'w') as f:
        json.dump(get_model_parameters(model), f)

    # Save best model
    model.save_model(output_model_path)
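
These snippets assume module-level imports (fasttext, json, pandas as pd) and constants (VERBOSE, TEXT_COLUMN, LABEL_COLUMN, LABEL_SEPARATOR, RANDOM_SEED) defined elsewhere in the source files. The get_model_parameters helper used above is not shown; a minimal sketch of one way to write it, assuming the fastText Python bindings expose the tuned training arguments through the internal model.f.getArgs() attribute (an assumption, not a documented API):

def get_model_parameters(model):
    # Assumption: model.f.getArgs() returns the underlying C++ args object.
    args = model.f.getArgs()
    # Keep only numeric arguments so json.dump() in autotune() works unchanged;
    # all of these names are valid fasttext.train_supervised keyword arguments.
    names = ('lr', 'dim', 'ws', 'epoch', 'minCount', 'wordNgrams',
             'minn', 'maxn', 'bucket')
    return {name: getattr(args, name) for name in names}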
Example #2
def test(input_test, input_model, output_predictions, k):
    input_test_path = get_input_path(input_test)
    input_model_path = get_input_path(input_model)
    output_predictions_path = get_output_path(output_predictions)

    model = fasttext.load_model(input_model_path)

    # Log model metrics
    n, p, r = model.test(input_test_path, k=k)
    print(json.dumps({'n': n, 'precision': p, 'recall': r, 'k': k}))

    # Split feature and category in a DataFrame
    with open(input_test_path) as f:
        df = pd.DataFrame((split_text(line) for line in f),
                          columns=[TEXT_COLUMN, LABEL_COLUMN])

    # Get predictions
    all_labels, all_probs = model.predict(list(df[TEXT_COLUMN]), k=k)

    # Add formatted predictions
    predictions_df = get_predictions_df(all_labels, all_probs, k)
    df = df.join(predictions_df)

    # Add error column
    df['error'] = (df[LABEL_COLUMN] != df[f'{LABEL_COLUMN}@1'])

    # Save predictions
    df.to_csv(output_predictions_path, index=False)
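
Examples #2 and #5 also rely on a get_predictions_df helper that is not shown. Judging from the label@1 column used for the error flag above, it flattens the top-k predictions into one column per rank; a possible sketch (the probability column names and the split_text line format are assumptions):

def get_predictions_df(all_labels, all_probs, k):
    # One column per rank: <label>@1..<label>@k plus a hypothetical prob@1..prob@k.
    columns = {}
    for i in range(k):
        columns[f'{LABEL_COLUMN}@{i + 1}'] = [labels[i] for labels in all_labels]
        columns[f'prob@{i + 1}'] = [probs[i] for probs in all_probs]
    return pd.DataFrame(columns)

def split_text(line):
    # Assumed fastText line format: '<text> __label__<label>'. The full label
    # token is kept so it is comparable with what model.predict() returns.
    text, separator, label = line.rstrip('\n').rpartition(LABEL_SEPARATOR)
    return text.rstrip(), separator + label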
Example #3
def train(input_data, input_parameters, output_model):
    input_data_path = get_input_path(input_data)
    input_parameters_path = get_input_path(input_parameters)
    output_model_path = get_output_path(output_model)

    # Parse parameters
    with open(input_parameters_path) as f:
        parameters = json.load(f)

    # Train model
    model = fasttext.train_supervised(input=input_data_path, **parameters)

    # Save model
    model.save_model(output_model_path)
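
The parameters file read here is the one written by autotune() in Example #1: a flat JSON object whose keys are fasttext.train_supervised keyword arguments. Purely illustrative contents:

{"lr": 0.05, "epoch": 25, "wordNgrams": 2, "dim": 100, "minCount": 1}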
Example #4
def split(input_data, output_train, output_validation, output_test,
          train_ratio, validation_ratio, test_ratio, shuffle):
    input_data_path = get_input_path(input_data)
    output_train_path = get_output_path(output_train)
    output_validation_path = get_output_path(output_validation)
    output_test_path = get_output_path(output_test)

    with open(input_data_path, 'r') as f:
        data = f.read().strip().split('\n')

    # Shuffle data
    if shuffle:
        print('Shuffling data')
        random.seed(RANDOM_SEED)
        random.shuffle(data)

    # Split train, validation and test data
    validation_index = round(len(data) * train_ratio)
    test_index = round(len(data) * (train_ratio + validation_ratio))
    end_index = round(
        len(data) * (train_ratio + validation_ratio + test_ratio))

    with open(output_train_path, 'w') as f:
        f.write('\n'.join(data[:validation_index]))

    with open(output_validation_path, 'w') as f:
        f.write('\n'.join(data[validation_index:test_index]))

    with open(output_test_path, 'w') as f:
        f.write('\n'.join(data[test_index:end_index]))
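
The three ratios are fractions of the whole dataset (not of the remainder), so they should sum to at most 1; any rows past end_index are silently dropped. A hypothetical call for an 80/10/10 split (file names are made up):

split('reviews.txt', 'train.txt', 'validation.txt', 'test.txt',
      train_ratio=0.8, validation_ratio=0.1, test_ratio=0.1, shuffle=True)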
Example #5
def predict(input_data, input_model, output_predictions, k):
    input_data_path = get_input_path(input_data)
    input_model_path = get_input_path(input_model)
    output_predictions_path = get_output_path(output_predictions)

    model = fasttext.load_model(input_model_path)

    # Create text DataFrame (strip newlines: fastText predict rejects '\n')
    with open(input_data_path) as f:
        df = pd.DataFrame((line.rstrip('\n') for line in f),
                          columns=[TEXT_COLUMN])

    # Get predictions
    all_labels, all_probs = model.predict(list(df[TEXT_COLUMN]), k=k)

    # Add formatted predictions
    predictions_df = get_predictions_df(all_labels, all_probs, k)
    df = df.join(predictions_df)

    # Save predictions
    df.to_csv(output_predictions_path, index=False)
Example #6
def preprocess(input_data, output_data):
    # TODO: make it work also with prediction data without label
    input_data_path = get_input_path(input_data)
    output_data_path = get_output_path(output_data)

    df = pd.read_csv(
        input_data_path,
        engine='python')

    with open(output_data_path, 'w') as output:
        for text, label in zip(df[TEXT_COLUMN], df[LABEL_COLUMN]):
            output.write(f'{process_text(text)} {LABEL_SEPARATOR}{label}\n')
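
process_text and LABEL_SEPARATOR come from elsewhere in the module. fastText conventionally marks labels with a '__label__' prefix, and each record must stay on a single line; an assumed minimal version of both:

LABEL_SEPARATOR = '__label__'  # fastText's conventional label prefix

def process_text(text):
    # Hypothetical normalization: lowercase and collapse all whitespace
    # (including newlines) so each record ends up on one line.
    return ' '.join(str(text).lower().split())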
Example #7
def preprocess(input_data, output_data, text_column, label_column, engine):
    # TODO: make it work also with prediction data without label
    input_data_path = get_input_path(input_data)
    output_data_path = get_output_path(output_data)

    df = pd.read_csv(input_data_path, engine=engine).fillna('')

    # Concatenate strings if multiple text columns
    if ',' in text_column:
        df[text_column] = df[text_column.split(',')].agg(' '.join, axis=1)

    with open(output_data_path, 'w') as output:
        for text, label in zip(df[text_column], df[label_column]):
            if not_empty_str(text) and not_empty_str(label):
                output.write(
                    f'{process_text(text)} {LABEL_SEPARATOR}{label}\n')
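
A hypothetical invocation showing the comma-separated multi-column feature, e.g. concatenating a title and a body column before labelling (file and column names are made up):

preprocess('tickets.csv', 'tickets.txt',
           text_column='title,body', label_column='category', engine='python')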
Example #8
    def __init__(self, dataset_type, article, flavors=None, lazy=True):
        if get_input_path() is None:
            raise NotADirectoryError('Could not find ECHR datasets.')

        self.dataset_type = dataset_type
        self.article = article
        self.flavors = (flavors if flavors is not None
                        else get_flavors_list())  # TODO: per article / dataset
        self.lazy = lazy
        self._data = {}
        self._outcomes = None
        self._preprocess = {}

        if not lazy:
            self.load()
Example #9
from collections import Counter, defaultdict
from itertools import pairwise

from utils import get_input_path

with open(get_input_path(14)) as f:
    lines = f.readlines()

polymer = lines[0].rstrip()
insert_rules = [rule.rstrip().split(" -> ") for rule in lines[2:]]
replace_rules = {
    source: (source[0] + value, value + source[1])
    for source, value in insert_rules
}


def grow_polymer(steps: int) -> int:
    counter = Counter(map(lambda p: "".join(p), pairwise(polymer)))
    for _ in range(steps):
        new_counter = defaultdict[str, int](int)
        for pair, count in counter.items():
            new_counter[replace_rules[pair][0]] += count
            new_counter[replace_rules[pair][1]] += count
        counter = new_counter
    letters = defaultdict[str, int](int)
    for pair, count in counter.items():
        letters[pair[0]] += count
    letters[polymer[-1]] += 1  # last char never gets replaced
    return max(letters.values()) - min(letters.values())
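
This is Advent of Code 2021 day 14; the two puzzle parts run the same pair-counting growth for 10 and 40 steps, so the snippet is presumably driven by something like:

print(f"Part 1: {grow_polymer(10)}")
print(f"Part 2: {grow_polymer(40)}")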

Example #10
from utils import get_input_path

with open(get_input_path(3)) as f:
    lines = f.readlines()

# Part 1
count_ones = [int(bit) for bit in lines[0].rstrip()]
for num in lines[1:]:
    for i in range(len(num.rstrip())):
        count_ones[i] += int(num[i])
gamma = "".join("1" if bit >= len(lines) // 2 else "0" for bit in count_ones)
epsilon = "".join("1" if bit == "0" else "0" for bit in gamma)
gamma = "0b" + gamma
epsilon = "0b" + epsilon
assert int(gamma, 2) * int(epsilon, 2) == 2583164


# Part 2
def get_rating(starting_bit: str, oxygen: bool = True) -> int:
    bit = starting_bit
    bit_ind = 0
    candidates = [num for num in lines if num[bit_ind] == bit]
    while len(candidates) > 1:
        count = 0
        bit_ind += 1
        for candidate in candidates:
            count += int(candidate[bit_ind])
        bit = "1" if count >= len(candidates) / 2 else "0"
        if not oxygen:
            bit = "0" if bit == "1" else "1"
        candidates = [num for num in candidates if num[bit_ind] == bit]
    # The single remaining candidate is the rating (int() ignores the trailing newline).
    return int(candidates[0], 2)
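
The snippet ends before get_rating is used; part 2 of this puzzle multiplies the oxygen generator and CO2 scrubber ratings, which with this signature would look roughly like this (the seed bits come from the part 1 strings):

oxygen_rating = get_rating(gamma[2])                # most common first bit
co2_rating = get_rating(epsilon[2], oxygen=False)   # least common first bit
print(f"Part 2: {oxygen_rating * co2_rating}")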
Example #11
        if key in top_down:
            top_down[key] = top_down[key].union([Bag(bag) for bag in values])
        else:
            top_down[key] = {Bag(bag) for bag in values}
    return bottom_up, top_down


def dfs(rules: dict[str, set[Bag]], bag: str) -> int:
    if bag not in rules:
        return 0
    else:
        return sum((1 + dfs(rules, inner_bag.name)) * inner_bag.required_number
                   for inner_bag in rules[bag])


with open(get_input_path(day)) as f:
    rule_lines = f.readlines()
bottom_up, top_down = build_rules(rule_lines)
part1 = set(bottom_up["shiny gold bag"])
queue = deque(part1)
while len(queue) > 0:
    bag = queue.pop()
    if bag not in bottom_up:
        continue
    new_result = part1.union(bottom_up[bag])
    difference = new_result.difference(part1)
    for container in difference:
        queue.append(container)
    part1 = new_result

print(f"Part 1: {len(part1)}")
Example #12
from operations import *
from utils import get_task_dict, save_output_json, get_input_path, get_reference_path

task_dict = get_task_dict(sys.argv[1])
cwd = os.getcwd()

workflow = task_dict.get('input').get('workflow')
metadata = task_dict.get('input').get('metadata')
metadata_service = task_dict.get('input').get('metadata_service')
job_file_template = task_dict.get('input').get('job_file_template')

## TODO: start from job_file_template, download necessary input (and reference) data and populate
## job JSON fields with concrete values
input_path = get_input_path(metadata.get('pipeline'),
                            metadata.get('job_partiption_key'),
                            workflow.get('name'), workflow.get('version'))
reference_path = get_reference_path(metadata.get('pipeline'),
                                    metadata.get('job_partiption_key'),
                                    workflow.get('name'),
                                    workflow.get('version'))

# write out job JSON
job_file = 'job.json'

git_download_url = "%s/archive/%s.zip" % (workflow.get('repo_url'),
                                          workflow.get('version'))
request = requests.get(git_download_url)
zfile = zipfile.ZipFile(BytesIO(request.content))
zfile.extractall(os.getcwd())