def autotune(input_train, input_validation, output_model, output_parameters,
             metric, k, duration, model_size):
    input_train_path = get_input_path(input_train)
    input_validation_path = get_input_path(input_validation)
    output_model_path = get_output_path(output_model)
    output_parameters_path = get_output_path(output_parameters)

    # Autotune model
    model = fasttext.train_supervised(
        input=input_train_path,
        autotuneValidationFile=input_validation_path,
        autotuneMetric=metric,
        autotuneDuration=duration,
        autotuneModelSize=model_size,
        verbose=VERBOSE)

    # Log best model metrics
    n, p, r = model.test(input_validation_path, k=k)
    print(json.dumps({'n': n, 'precision': p, 'recall': r, 'k': k}))

    # Save best parameters
    with open(output_parameters_path, 'w') as f:
        json.dump(get_model_parameters(model), f)

    # Save best model
    model.save_model(output_model_path)
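# A hypothetical invocation of autotune; the file names and the behavior of
# the path helpers are assumptions for illustration. fastText expects
# autotuneMetric as 'f1' (or 'f1:__label__X' to target one label),
# autotuneDuration in seconds, and autotuneModelSize as a string like '100M'.
autotune(input_train='train.txt', input_validation='valid.txt',
         output_model='model.ftz', output_parameters='parameters.json',
         metric='f1', k=1, duration=600, model_size='100M')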
def test(input_test, input_model, output_predictions, k):
    input_test_path = get_input_path(input_test)
    input_model_path = get_input_path(input_model)
    output_predictions_path = get_output_path(output_predictions)

    model = fasttext.load_model(input_model_path)

    # Log model metrics
    n, p, r = model.test(input_test_path, k=k)
    print(json.dumps({'n': n, 'precision': p, 'recall': r, 'k': k}))

    # Split feature and category in a DataFrame
    with open(input_test_path) as f:
        df = pd.DataFrame((split_text(line) for line in f),
                          columns=[TEXT_COLUMN, LABEL_COLUMN])

    # Get predictions
    all_labels, all_probs = model.predict(list(df[TEXT_COLUMN]), k=k)

    # Add formatted predictions
    predictions_df = get_predictions_df(all_labels, all_probs, k)
    df = df.join(predictions_df)

    # Add error column
    df['error'] = (df[LABEL_COLUMN] != df[f'{LABEL_COLUMN}@1'])

    # Save predictions
    df.to_csv(output_predictions_path, index=False)
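# get_predictions_df is a repo helper not shown in this excerpt. A minimal
# sketch of what it plausibly does, assuming fastText returns k ranked labels
# per row and that the LABEL_SEPARATOR prefix is stripped so labels compare
# equal to the ones split_text produces (both are assumptions):
def get_predictions_df(all_labels, all_probs, k):
    columns = {}
    for i in range(k):
        columns[f'{LABEL_COLUMN}@{i + 1}'] = [
            labels[i].replace(LABEL_SEPARATOR, '') for labels in all_labels]
        columns[f'prob@{i + 1}'] = [probs[i] for probs in all_probs]
    return pd.DataFrame(columns)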
def train(input_data, input_parameters, output_model):
    input_data_path = get_input_path(input_data)
    input_parameters_path = get_input_path(input_parameters)
    output_model_path = get_output_path(output_model)

    # Parse parameters
    with open(input_parameters_path) as f:
        parameters = json.load(f)

    # Train model
    model = fasttext.train_supervised(input=input_data_path, **parameters)

    # Save model
    model.save_model(output_model_path)
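# Hypothetical usage of train: the file names and hyperparameters are
# illustrative, and any keyword accepted by fasttext.train_supervised
# (e.g. lr, epoch, wordNgrams, dim, loss) may appear in the JSON that
# autotune writes; assumes the path helpers pass plain file names through.
with open('parameters.json', 'w') as f:
    json.dump({'lr': 0.1, 'epoch': 25, 'wordNgrams': 2}, f)
train(input_data='train.txt', input_parameters='parameters.json',
      output_model='model.bin')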
def split(input_data, output_train, output_validation, output_test,
          train_ratio, validation_ratio, test_ratio, shuffle):
    input_data_path = get_input_path(input_data)
    output_train_path = get_output_path(output_train)
    output_validation_path = get_output_path(output_validation)
    output_test_path = get_output_path(output_test)

    with open(input_data_path, 'r') as f:
        data = f.read().strip().split('\n')

    # Shuffle data
    if shuffle:
        print('Shuffling data')
        random.seed(RANDOM_SEED)
        random.shuffle(data)

    # Split train, validation and test data
    validation_index = round(len(data) * train_ratio)
    test_index = round(len(data) * (train_ratio + validation_ratio))
    end_index = round(
        len(data) * (train_ratio + validation_ratio + test_ratio))

    with open(output_train_path, 'w') as f:
        f.write('\n'.join(data[:validation_index]))
    with open(output_validation_path, 'w') as f:
        f.write('\n'.join(data[validation_index:test_index]))
    with open(output_test_path, 'w') as f:
        f.write('\n'.join(data[test_index:end_index]))
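# Worked example of the split arithmetic (illustrative sizes): with 1000
# lines and ratios 0.8/0.1/0.1 the slices are [:800], [800:900] and
# [900:1000]; ratios summing to less than 1 subsample the data, since
# end_index drops everything past the requested fractions.
assert round(1000 * 0.8) == 800
assert round(1000 * (0.8 + 0.1)) == 900
assert round(1000 * (0.8 + 0.1 + 0.1)) == 1000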
def predict(input_data, input_model, output_predictions, k):
    input_data_path = get_input_path(input_data)
    input_model_path = get_input_path(input_model)
    output_predictions_path = get_output_path(output_predictions)

    model = fasttext.load_model(input_model_path)

    # Create text DataFrame; strip newlines, since fastText's predict()
    # rejects strings that contain '\n'
    with open(input_data_path) as f:
        df = pd.DataFrame((line.rstrip('\n') for line in f),
                          columns=[TEXT_COLUMN])

    # Get predictions
    all_labels, all_probs = model.predict(list(df[TEXT_COLUMN]), k=k)

    # Add formatted predictions
    predictions_df = get_predictions_df(all_labels, all_probs, k)
    df = df.join(predictions_df)

    # Save predictions
    df.to_csv(output_predictions_path, index=False)
def preprocess(input_data, output_data):
    # TODO: make it work also with prediction data without label
    input_data_path = get_input_path(input_data)
    output_data_path = get_output_path(output_data)

    df = pd.read_csv(input_data_path, engine='python')

    with open(output_data_path, 'w') as output:
        for text, label in zip(df[TEXT_COLUMN], df[LABEL_COLUMN]):
            output.write(f'{process_text(text)} {LABEL_SEPARATOR}{label}\n')
def preprocess(input_data, output_data, text_column, label_column, engine):
    # TODO: make it work also with prediction data without label
    input_data_path = get_input_path(input_data)
    output_data_path = get_output_path(output_data)

    df = pd.read_csv(input_data_path, engine=engine).fillna('')

    # Concatenate strings if multiple text columns
    if ',' in text_column:
        df[text_column] = df[text_column.split(',')].agg(' '.join, axis=1)

    with open(output_data_path, 'w') as output:
        for text, label in zip(df[text_column], df[label_column]):
            if not_empty_str(text) and not_empty_str(label):
                output.write(
                    f'{process_text(text)} {LABEL_SEPARATOR}{label}\n')
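# A hypothetical before/after pair for preprocess, assuming LABEL_SEPARATOR
# is fastText's conventional '__label__' prefix and that process_text
# lowercases and strips punctuation (both helpers are defined elsewhere):
#   CSV row:     text='Great product, fast shipping!', label='positive'
#   output line: 'great product fast shipping __label__positive'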
def __init__(self, dataset_type, article, flavors=None, lazy=True):
    if get_input_path() is None:
        raise NotADirectoryError('Could not find ECHR datasets.')
    self.dataset_type = dataset_type
    self.article = article
    # TODO: flavors per article / dataset
    self.flavors = flavors if flavors is not None else get_flavors_list()
    self.lazy = lazy
    self._data = {}
    self._outcomes = None
    self._preprocess = {}
    if not lazy:
        self.load()  # a bare load() would be a NameError; assuming an instance method
from collections import Counter, defaultdict
from itertools import pairwise

from utils import get_input_path

with open(get_input_path(14)) as f:
    lines = f.readlines()

polymer = lines[0].rstrip()
insert_rules = [rule.rstrip().split(" -> ") for rule in lines[2:]]
# Each pair AB with rule AB -> C is replaced by the two pairs AC and CB
replace_rules = {
    source: (source[0] + value, value + source[1])
    for source, value in insert_rules
}


def grow_polymer(steps: int) -> int:
    counter = Counter(map("".join, pairwise(polymer)))
    for _ in range(steps):
        new_counter = defaultdict[str, int](int)
        for pair, count in counter.items():
            new_counter[replace_rules[pair][0]] += count
            new_counter[replace_rules[pair][1]] += count
        counter = new_counter
    # Count each pair's first letter to avoid double counting
    letters = defaultdict[str, int](int)
    for pair, count in counter.items():
        letters[pair[0]] += count
    letters[polymer[-1]] += 1  # last char never gets replaced
    return max(letters.values()) - min(letters.values())
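# The excerpt defines grow_polymer but never calls it; day 14 asks for
# 10 steps in part 1 and 40 in part 2, so a hypothetical driver:
print(f"Part 1: {grow_polymer(10)}")
print(f"Part 2: {grow_polymer(40)}")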
from utils import get_input_path

with open(get_input_path(3)) as f:
    lines = f.readlines()

# Part 1
count_ones = [int(bit) for bit in lines[0].rstrip()]
for num in lines[1:]:
    for i in range(len(num.rstrip())):
        count_ones[i] += int(num[i])

gamma = "".join("1" if bit >= len(lines) // 2 else "0" for bit in count_ones)
epsilon = "".join("1" if bit == "0" else "0" for bit in gamma)
gamma = "0b" + gamma
epsilon = "0b" + epsilon

assert int(gamma, 2) * int(epsilon, 2) == 2583164


# Part 2
def get_rating(starting_bit: str, oxygen: bool = True) -> int:
    bit = starting_bit
    bit_ind = 0
    candidates = [num for num in lines if num[bit_ind] == bit]
    while len(candidates) > 1:
        count = 0
        bit_ind += 1
        for candidate in candidates:
            count += int(candidate[bit_ind])
        bit = "1" if count >= len(candidates) / 2 else "0"
        if not oxygen:
            bit = "0" if bit == "1" else "1"
        candidates = [num for num in candidates if num[bit_ind] == bit]
    # the lone surviving candidate is the rating
    return int(candidates[0], 2)
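# Hypothetical part 2 driver: seed each search with the most (oxygen) or
# least (CO2) common first bit; the excerpt does not show how the original
# seeded get_rating, and any expected product is specific to one puzzle input.
ones_first = sum(num[0] == "1" for num in lines)
oxygen = get_rating("1" if 2 * ones_first >= len(lines) else "0")
co2 = get_rating("0" if 2 * ones_first >= len(lines) else "1", oxygen=False)
print(f"Part 2: {oxygen * co2}")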
        if key in top_down:
            top_down[key] = top_down[key].union([Bag(bag) for bag in values])
        else:
            top_down[key] = {Bag(bag) for bag in values}
    return bottom_up, top_down


def dfs(rules: dict[str, set[Bag]], bag: str) -> int:
    if bag not in rules:
        return 0
    return sum((1 + dfs(rules, inner_bag.name)) * inner_bag.required_number
               for inner_bag in rules[bag])


with open(get_input_path(day)) as f:
    rule_lines = f.readlines()

bottom_up, top_down = build_rules(rule_lines)

# Part 1: walk outward from "shiny gold bag" to find every possible container
part1 = set(bottom_up["shiny gold bag"])
queue = deque(part1)
while len(queue) > 0:
    bag = queue.pop()
    if bag not in bottom_up:
        continue
    new_result = part1.union(bottom_up[bag])
    difference = new_result.difference(part1)
    for container in difference:
        queue.append(container)
    part1 = new_result

print(f"Part 1: {len(part1)}")
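# Part 2 counts the bags required inside one shiny gold bag; the excerpt
# defines dfs but never calls it, so a hypothetical driver:
print(f"Part 2: {dfs(top_down, 'shiny gold bag')}")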
import os
import sys
import zipfile
from io import BytesIO

import requests

from operations import *
from utils import get_task_dict, save_output_json, get_input_path, get_reference_path

task_dict = get_task_dict(sys.argv[1])
cwd = os.getcwd()

workflow = task_dict.get('input').get('workflow')
metadata = task_dict.get('input').get('metadata')
metadata_service = task_dict.get('input').get('metadata_service')
job_file_template = task_dict.get('input').get('job_file_template')

## TODO: start from job_file_template, download necessary input (and reference)
## data and populate job JSON fields with concrete values
input_path = get_input_path(metadata.get('pipeline'),
                            metadata.get('job_partiption_key'),
                            workflow.get('name'), workflow.get('version'))
reference_path = get_reference_path(metadata.get('pipeline'),
                                    metadata.get('job_partiption_key'),
                                    workflow.get('name'),
                                    workflow.get('version'))

# write out job JSON
job_file = 'job.json'

# download and unpack the workflow repository at the pinned version
git_download_url = "%s/archive/%s.zip" % (workflow.get('repo_url'),
                                          workflow.get('version'))
request = requests.get(git_download_url)
zfile = zipfile.ZipFile(BytesIO(request.content))
zfile.extractall(os.getcwd())