def main():
    """Check the Problem6 clump finder on the training cases, then run the test cases.

    Cases are read from ``Problems/Problem6`` through ``DataReader``. The first
    ``len(output)`` cases are compared with the expected ``output``; the last
    ``len(output)`` cases are then solved, measured with ``Usage``, handed to
    ``DataWriter`` (targeting ``Problems/Problem6Solution``) and echoed to
    stdout.

    Raises:
        Exception: if a training case produces clump patterns that differ from
            the expected ones (order is ignored).
    """
    problem_dataset_dir = os.path.join('Problems', 'Problem6')
    solution_dir = os.path.join("Problems", "Problem6Solution")

    data_reader = DataReader(problem_dataset_dir)
    test_cases, output = data_reader.get_data()

    for train_i in range(0, len(output)):
        case = test_cases[train_i]
        case_output = output[train_i]
        genome = case[0]
        k = case[1][0]
        l = case[1][1][0]
        t = case[1][1][1]
        dna = DNA(genome)
        clumps_patterns = dna.get_clumps_patterns(int(k), int(t), int(l))
        # list.sort() returns None, so comparing two .sort() results is always
        # "equal"; compare sorted copies to get an order-insensitive check.
        if sorted(clumps_patterns) != sorted(case_output):
            # str() is required: concatenating a list to a str raises TypeError.
            raise Exception("Output not matched!\nExpecting: " + str(case_output)
                            + "\nFound: " + str(clumps_patterns))
    print("Passed training data..")

    writer = DataWriter(solution_dir)
    usage = Usage()
    for test_i in range(len(test_cases) - len(output), len(test_cases)):
        # Only parsing and the clump search are inside the measured region.
        usage.start()
        case = test_cases[test_i]
        genome = case[0]
        k = case[1][0]
        l = case[1][1][0]
        t = case[1][1][1]
        dna = DNA(genome)
        clumps_patterns = dna.get_clumps_patterns(int(k), int(t), int(l))
        usage.end()

        writer.write_data(test_i, clumps_patterns, usage.get_execution_time(),
                          usage.get_memory_usage())

        print("\n\nInput:\n" + genome + "\n" + str(k) + " " + str(l) + " " + str(t))
        print("\n\nOutput")
        print("=====")
        for clump in clumps_patterns:
            print(clump)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Validate the Problem2 DNA-to-amino-acid search, then process the test samples.

    ``DataReader`` supplies training samples, testing samples and an RNA codon
    table from ``Problems/Problem2``. Every training sample is checked as an
    unordered set against its expected output. Every testing sample is then
    solved, measured with ``Usage``, passed to ``DataWriter`` and printed.

    Raises:
        Exception: if the candidates for a training sample differ (as a set)
            from the expected output.
    """
    dataset_dir = os.path.join('Problems', 'Problem2')
    output_dir = os.path.join("Problems", "Problem2Solution")

    reader = DataReader(dataset_dir)
    training_data, testing_data = reader.get_data()
    codons_table = reader.get_rna_codon_table()

    # --- training pass: compare against known answers ---
    for sample in training_data:
        inputs = sample[0]
        dna_string = inputs[0]
        amino_acid = inputs[1]
        expected = sample[1]

        strand = DNA(dna_string)
        strand.set_codon_table(codons_table)
        found = strand.get_dna_to_amino_acid_candidates(amino_acid)

        if set(found) == set(expected):
            continue
        message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(found)
        raise Exception(message)

    print("Passed training data..\n\n")

    # --- testing pass: solve, record and display ---
    writer = DataWriter(output_dir)
    usage = Usage()
    for sample in testing_data:
        usage.start()
        inputs = sample[0]
        dna_string = inputs[0]
        amino_acid = inputs[1]
        strand = DNA(dna_string)
        strand.set_codon_table(codons_table)
        found = strand.get_dna_to_amino_acid_candidates(amino_acid)
        usage.end()

        writer.write_data(
            (dna_string, amino_acid),
            found,
            usage.get_execution_time(),
            usage.get_memory_usage(),
        )

        print("DNA:\n" + dna_string)
        print("Protein\n" + amino_acid)
        print("\n\nOutput")
        print("=====")
        print(str(len(found)))
        for substring in found:
            print(substring)
        print("\n\nExecution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Usage: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Check ``DNA.most_frequent_k_mer`` against the Problem3 training answers.

    Cases are read from ``Problems/Problem3`` through ``DataReader``; the first
    ``len(output)`` cases are solved and compared with ``output`` ignoring
    order.

    Raises:
        Exception: if the k-mers found for a training case differ from the
            expected ones.
    """
    problem_dataset_dir = os.path.join('Problems', 'Problem3')

    data_reader = DataReader(problem_dataset_dir)
    test_cases, output = data_reader.get_data()

    for train_i in range(0, len(output)):
        case = test_cases[train_i]
        case_output = output[train_i]
        dna = DNA(case[0])
        k_mers = dna.most_frequent_k_mer(int(case[1]))
        # list.sort() returns None, so the previous `.sort() != .sort()` test
        # could never fail; compare sorted copies instead.
        if sorted(k_mers) != sorted(case_output):
            raise Exception("Output not matched!\nExpecting: " + str(case_output)
                            + "\nFound: " + str(k_mers))
def main():
    """Validate global alignment on the Problem11 training pairs, then solve the test pairs.

    ``DataReader`` provides the string pairs, the expected outputs and the
    BLOSUM62 scoring data from ``Problems/Problem11``. The first
    ``len(output)`` pairs are aligned and compared with ``output``; the last
    ``len(output)`` pairs are aligned under ``Usage`` measurement, written via
    ``DataWriter`` and printed.

    Raises:
        Exception: if an alignment for a training pair differs from the
            expected output.
    """
    dataset_dir = os.path.join('Problems', 'Problem11')
    output_dir = os.path.join("Problems", "Problem11Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()
    scoring_matrix = reader.get_BLOSUM62_data()

    # --- training pass ---
    for idx in range(len(output)):
        alpha_dna, beta_dna = test_cases[idx]
        expected = output[idx]
        aligner = StringsAlgorithms(alpha_dna, beta_dna)
        align = aligner.alignment(_type='global', scoring_matrix=scoring_matrix)
        if align != expected:
            message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(align)
            raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    for idx in range(total - len(output), total):
        usage.start()
        alpha_dna, beta_dna = test_cases[idx]
        aligner = StringsAlgorithms(alpha_dna, beta_dna)
        align = aligner.alignment(_type='global', scoring_matrix=scoring_matrix)
        usage.end()

        writer.write_data(
            idx + 1,
            align,
            usage.get_execution_time(),
            usage.get_memory_usage(),
        )

        print("\n\nInput:\n" + alpha_dna + "\n" + beta_dna + "\n")
        print("\n\nOutput")
        print("=====")
        # align is indexed as (first, (second, third)) for display.
        pair = align[1]
        print(align[0])
        print(pair[0])
        print(pair[1])
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Check ``DNA.reverse_complement`` against the Problem4 training answers.

    Cases and expected outputs come from ``DataReader`` on
    ``Problems/Problem4``; the first ``len(output)`` cases are verified.

    Raises:
        Exception: if the reverse complement of a training case differs from
            the expected output.
    """
    dataset_dir = os.path.join('Problems', 'Problem4')
    # Built here but not used by this function.
    solution_dir = os.path.join("Problems", "Problem4Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    for idx in range(len(output)):
        sequence = test_cases[idx]
        expected = output[idx]
        reverse_complement = DNA(sequence).reverse_complement()
        if reverse_complement != expected:
            message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + reverse_complement
            raise Exception(message)
def main():
    """Validate the Problem9 mismatch-tolerant k-mer search, then process test cases.

    Cases are read from ``Problems/Problem9`` via ``DataReader``. The first
    ``len(output)`` cases are compared, as sets, with the expected output.
    Remaining cases are solved under ``Usage`` measurement, written through
    ``DataWriter`` and printed.

    Raises:
        Exception: if the k-mers for a training case differ (as a set) from
            the expected ones.
    """
    dataset_dir = os.path.join('Problems', 'Problem9')
    output_dir = os.path.join("Problems", "Problem9Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        sample = test_cases[idx]
        expected = output[idx]
        genome = sample[0]
        params = sample[1]
        k = params[0]
        d = params[1]
        found = DNA(genome).most_frequent_missmatched_k_mer(int(k), int(d))
        if set(expected) == set(found):
            continue
        message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(found)
        raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    # NOTE(review): this start index carries an extra "+ 1" and the writer
    # receives the raw index; confirm both against the dataset layout.
    first_test = total - len(output) + 1
    for idx in range(first_test, total):
        usage.start()
        sample = test_cases[idx]
        genome = sample[0]
        params = sample[1]
        k = params[0]
        d = params[1]
        found = DNA(genome).most_frequent_missmatched_k_mer(int(k), int(d))
        usage.end()

        writer.write_data(idx, found, usage.get_execution_time(), usage.get_memory_usage())

        print("\n\nInput:\n" + genome + "\n" + str(k) + "\n" + str(d))
        print("\n\nOutput")
        print("=====")
        print('\n'.join(str(v) for v in found))
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Check ``DNA.get_min_skew`` against the Problem7 training answers.

    Cases and expected outputs come from ``DataReader`` on
    ``Problems/Problem7``. Each expected output is converted to a NumPy array
    and compared with the computed indices using ``np.array_equal``.

    Raises:
        Exception: if the indices for a training case do not equal the
            expected array.
    """
    dataset_dir = os.path.join('Problems', 'Problem7')
    # Built here but not used by this function.
    solution_dir = os.path.join("Problems", "Problem7Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    for idx in range(len(output)):
        raw_genome = test_cases[idx]
        expected = np.array(output[idx])
        # Surrounding whitespace is stripped before building the DNA object.
        min_skew_indices = DNA(raw_genome.strip()).get_min_skew()
        if np.array_equal(expected, min_skew_indices):
            continue
        message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(min_skew_indices)
        raise Exception(message)
def main():
    """Check ``DNA.get_pattern_indices`` against the Problem5 training answers.

    Cases are read from ``Problems/Problem5`` through ``DataReader``. Each
    case is indexed as ``(pattern, genome)``; the first ``len(output)`` cases
    are solved and compared with ``output``.

    Raises:
        Exception: if the indices found for a training case differ from the
            expected output.
    """
    problem_dataset_dir = os.path.join('Problems', 'Problem5')

    data_reader = DataReader(problem_dataset_dir)
    test_cases, output = data_reader.get_data()

    for train_i in range(0, len(output)):
        case = test_cases[train_i]
        case_output = output[train_i]
        pattern = case[0]
        genome = case[1]
        dna = DNA(genome)
        pattern_indices = dna.get_pattern_indices(pattern)
        if pattern_indices != case_output:
            # str() keeps the report working when the result is not a string
            # (e.g. a list); plain concatenation would raise TypeError instead.
            raise Exception("Output not matched!\nExpecting: " + str(case_output)
                            + "\nFound: " + str(pattern_indices))
def main():
    """Validate ``DNA.get_pattern_indices`` on Problem5 training data, then run the tests.

    Cases are read from ``Problems/Problem5`` through ``DataReader``. Each
    case is indexed as ``(pattern, genome)``. The first ``len(output)`` cases
    are compared with ``output``; the last ``len(output)`` cases are solved
    under ``Usage`` measurement, written via ``DataWriter`` (targeting
    ``Problems/Problem5Solution``) and printed.

    Raises:
        Exception: if the indices found for a training case differ from the
            expected output.
    """
    problem_dataset_dir = os.path.join('Problems', 'Problem5')
    solution_dir = os.path.join("Problems", "Problem5Solution")

    data_reader = DataReader(problem_dataset_dir)
    test_cases, output = data_reader.get_data()

    for train_i in range(0, len(output)):
        case = test_cases[train_i]
        case_output = output[train_i]
        pattern = case[0]
        genome = case[1]
        dna = DNA(genome)
        pattern_indices = dna.get_pattern_indices(pattern)
        if pattern_indices != case_output:
            # str() keeps the report working when the result is not a string
            # (e.g. a list); plain concatenation would raise TypeError instead.
            raise Exception("Output not matched!\nExpecting: " + str(case_output)
                            + "\nFound: " + str(pattern_indices))
    print("Passed training data..")

    writer = DataWriter(solution_dir)
    usage = Usage()
    for test_i in range(len(test_cases) - len(output), len(test_cases)):
        # Only parsing and the pattern search are inside the measured region.
        usage.start()
        case = test_cases[test_i]
        pattern = case[0]
        genome = case[1]
        dna = DNA(genome)
        pattern_indices = dna.get_pattern_indices(pattern)
        usage.end()

        writer.write_data(test_i + 1, pattern_indices, usage.get_execution_time(),
                          usage.get_memory_usage())

        print("\n\nInput:\n" + pattern + "\n" + genome)
        print("\n\nOutput")
        print("=====")
        print(pattern_indices)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Validate the Problem10 LCS routine by length, then solve the test pairs.

    String pairs and expected outputs come from ``DataReader`` on
    ``Problems/Problem10``. For the first ``len(output)`` pairs only the
    *length* of the computed LCS is compared with the length of the expected
    output. The last ``len(output)`` pairs are solved under ``Usage``
    measurement, written via ``DataWriter`` and printed.

    Raises:
        Exception: if the LCS length for a training pair differs from the
            expected length.
    """
    dataset_dir = os.path.join('Problems', 'Problem10')
    output_dir = os.path.join("Problems", "Problem10Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        alpha_dna, beta_dna = test_cases[idx]
        expected = output[idx]
        solver = StringsAlgorithms(alpha_dna, beta_dna)
        lcs = solver.lcs('dp')
        print(len(lcs))
        if len(expected) == len(lcs):
            continue
        message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(lcs)
        raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    for idx in range(total - len(output), total):
        usage.start()
        alpha_dna, beta_dna = test_cases[idx]
        solver = StringsAlgorithms(alpha_dna, beta_dna)
        lcs = solver.lcs('dp')
        # This print sits between usage.start() and usage.end().
        print(len(lcs))
        usage.end()

        writer.write_data(idx + 1, lcs, usage.get_execution_time(), usage.get_memory_usage())

        print("\n\nInput:\n" + alpha_dna + "\n" + beta_dna + "\n")
        print("\n\nOutput")
        print("=====")
        print(lcs)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Check ``DNA.most_frequent_missmatched_k_mer`` against the Problem9 training answers.

    Cases and expected outputs come from ``DataReader`` on
    ``Problems/Problem9``. Each case is indexed as ``(genome, (k, d))`` and
    the result is compared with the expected output as a set.

    Raises:
        Exception: if the k-mers for a training case differ (as a set) from
            the expected ones.
    """
    dataset_dir = os.path.join('Problems', 'Problem9')
    # Built here but not used by this function.
    solution_dir = os.path.join("Problems", "Problem9Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    for idx in range(len(output)):
        sample = test_cases[idx]
        expected = output[idx]
        genome = sample[0]
        params = sample[1]
        k = params[0]
        d = params[1]
        found = DNA(genome).most_frequent_missmatched_k_mer(int(k), int(d))
        if set(expected) == set(found):
            continue
        message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(found)
        raise Exception(message)
def main():
    """Validate the Problem16 De Bruijn graph built from k-mers, then solve the tests.

    K-mer collections and expected adjacency mappings come from
    ``DataReader`` on ``Problems/Problem16``. For the first ``len(output)``
    cases the sorted ``items()`` of the computed and expected mappings are
    compared. The last ``len(output)`` cases are solved under ``Usage``
    measurement, written via ``DataWriter`` and printed.

    Raises:
        Exception: if the sorted adjacency items for a training case differ
            from the expected ones.
    """
    dataset_dir = os.path.join('Problems', 'Problem16')
    output_dir = os.path.join("Problems", "Problem16Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        k_mers = test_cases[idx]
        adj_list = Graph().get_debruijn_graph(_type='k_mers', k_mers=k_mers)
        expected = output[idx]

        found_edges = sorted(adj_list.items())
        expected_edges = sorted(expected.items())
        if found_edges == expected_edges:
            continue
        message = "Output not matched!\nExpecting: " + str(expected_edges) + "\nFound: " + str(found_edges)
        raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    for idx in range(total - len(output), total):
        usage.start()
        k_mers = test_cases[idx]
        adj_list = Graph().get_debruijn_graph(_type='k_mers', k_mers=k_mers)
        usage.end()

        writer.write_data(idx + 1, adj_list, usage.get_execution_time(), usage.get_memory_usage())

        print("\n\nInput:\n" + str(k_mers) + "\n")
        print("\n\nOutput")
        print("=====")
        print(adj_list)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Validate ``DNA.get_k_mers`` on the Problem13 training data, then solve the tests.

    Each case from ``DataReader`` on ``Problems/Problem13`` unpacks as
    ``(k, genome)``. For the first ``len(output)`` cases the sorted result is
    compared with the sorted expected output. The last ``len(output)`` cases
    are solved under ``Usage`` measurement, written via ``DataWriter`` and
    printed.

    Raises:
        Exception: if the sorted k-mers for a training case differ from the
            sorted expected output.
    """
    dataset_dir = os.path.join('Problems', 'Problem13')
    output_dir = os.path.join("Problems", "Problem13Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        k, genome = test_cases[idx]
        expected = output[idx]
        k_mers = DNA(genome).get_k_mers(int(k))
        if sorted(expected) == sorted(k_mers):
            continue
        message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(k_mers)
        raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    for idx in range(total - len(output), total):
        usage.start()
        k, genome = test_cases[idx]
        k_mers = DNA(genome).get_k_mers(int(k))
        usage.end()

        writer.write_data(idx + 1, k_mers, usage.get_execution_time(), usage.get_memory_usage())

        # k is concatenated directly, so it must already be a string here.
        print("\n\nInput:\n" + k + "\n" + genome + "\n")
        print("\n\nOutput")
        print("=====")
        print(k_mers)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Validate ``DNA.get_most_probable_k_mer`` on Problem17 training data, then run tests.

    Each case from ``DataReader`` on ``Problems/Problem17`` is indexed as
    ``(dna, (k, score_matrix))``. The first ``len(output)`` cases are compared
    with ``output``. A trailing range of cases is then solved under ``Usage``
    measurement, written via ``DataWriter`` and printed.

    Raises:
        Exception: if the k-mer found for a training case differs from the
            expected output.
    """
    dataset_dir = os.path.join('Problems', 'Problem17')
    output_dir = os.path.join("Problems", "Problem17Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        sample = test_cases[idx]
        dna = sample[0]
        k = sample[1][0]
        score_matrix = sample[1][1]
        expected = output[idx]
        most_probable_k_mer = DNA(dna).get_most_probable_k_mer(int(k), score_matrix)
        if most_probable_k_mer != expected:
            message = ("Output not matched!\nExpecting: " + str(expected)
                       + "\nFound: " + str(most_probable_k_mer))
            raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    # NOTE(review): this start index carries an extra "- 1", so one more case
    # than len(output) is processed; confirm against the dataset layout.
    first_test = total - len(output) - 1
    for idx in range(first_test, total):
        usage.start()
        sample = test_cases[idx]
        dna = sample[0]
        k = sample[1][0]
        score_matrix = sample[1][1]
        most_probable_k_mer = DNA(dna).get_most_probable_k_mer(int(k), score_matrix)
        usage.end()

        writer.write_data(
            idx + 1,
            most_probable_k_mer,
            usage.get_execution_time(),
            usage.get_memory_usage(),
        )

        print("\n\nInput:\n" + str(dna) + "\n" + str(k) + "\n" + str(score_matrix))
        print("\n\nOutput")
        print("=====")
        print(most_probable_k_mer)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Check ``DNA.get_clumps_patterns`` against the Problem6 training answers.

    Cases are read from ``Problems/Problem6`` through ``DataReader``. Each
    case is indexed as ``(genome, (k, (l, t)))``; the first ``len(output)``
    cases are solved and compared with ``output`` ignoring order.

    Raises:
        Exception: if the clump patterns for a training case differ from the
            expected ones.
    """
    problem_dataset_dir = os.path.join('Problems', 'Problem6')

    data_reader = DataReader(problem_dataset_dir)
    test_cases, output = data_reader.get_data()

    for train_i in range(0, len(output)):
        case = test_cases[train_i]
        case_output = output[train_i]
        genome = case[0]
        k = case[1][0]
        l = case[1][1][0]
        t = case[1][1][1]
        dna = DNA(genome)
        clumps_patterns = dna.get_clumps_patterns(int(k), int(t), int(l))
        # list.sort() returns None, so comparing two .sort() results is always
        # "equal"; compare sorted copies to get an order-insensitive check.
        if sorted(clumps_patterns) != sorted(case_output):
            # str() is required: concatenating a list to a str raises TypeError.
            raise Exception("Output not matched!\nExpecting: " + str(case_output)
                            + "\nFound: " + str(clumps_patterns))
def main():
    """Validate the Problem15 De Bruijn graph built from a string, then solve the tests.

    Each case from ``DataReader`` on ``Problems/Problem15`` is indexed as
    ``(k, dna)``. For the first ``len(output)`` cases the sorted ``items()``
    of the computed and expected adjacency mappings are compared. The last
    ``len(output)`` cases are solved under ``Usage`` measurement, written via
    ``DataWriter`` and printed.

    Raises:
        Exception: if the sorted adjacency items for a training case differ
            from the expected ones.
    """
    dataset_dir = os.path.join('Problems', 'Problem15')
    output_dir = os.path.join("Problems", "Problem15Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        sample = test_cases[idx]
        k = sample[0]
        dna = sample[1]
        adj_list = Graph().get_debruijn_graph(_type='string', k=int(k), dna_string=dna)
        expected = output[idx]

        found_edges = sorted(adj_list.items())
        expected_edges = sorted(expected.items())
        if found_edges == expected_edges:
            continue
        message = "Output not matched!\nExpecting: " + str(expected_edges) + "\nFound: " + str(found_edges)
        raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    for idx in range(total - len(output), total):
        usage.start()
        sample = test_cases[idx]
        k = sample[0]
        dna = sample[1]
        adj_list = Graph().get_debruijn_graph(_type='string', k=int(k), dna_string=dna)
        usage.end()

        writer.write_data(idx + 1, adj_list, usage.get_execution_time(), usage.get_memory_usage())

        print("\n\nInput:\n" + str(k) + "\n" + dna + "\n")
        print("\n\nOutput")
        print("=====")
        print(adj_list)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Validate ``DNA.get_min_skew`` on the Problem7 training data, then solve the tests.

    Cases and expected outputs come from ``DataReader`` on
    ``Problems/Problem7``. For the first ``len(output)`` cases the expected
    output is converted to a NumPy array and compared with the result using
    ``np.array_equal``. The last ``len(output)`` cases are solved under
    ``Usage`` measurement, written via ``DataWriter`` and printed.

    Raises:
        Exception: if the indices for a training case do not equal the
            expected array.
    """
    dataset_dir = os.path.join('Problems', 'Problem7')
    output_dir = os.path.join("Problems", "Problem7Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        raw_genome = test_cases[idx]
        expected = np.array(output[idx])
        # Surrounding whitespace is stripped before building the DNA object.
        min_skew_indices = DNA(raw_genome.strip()).get_min_skew()
        if np.array_equal(expected, min_skew_indices):
            continue
        message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(min_skew_indices)
        raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    for idx in range(total - len(output), total):
        usage.start()
        raw_genome = test_cases[idx]
        min_skew_indices = DNA(raw_genome.strip()).get_min_skew()
        usage.end()

        writer.write_data(
            idx + 1,
            min_skew_indices,
            usage.get_execution_time(),
            usage.get_memory_usage(),
        )

        # The unstripped case text is what gets echoed.
        print("\n\nInput:\n" + raw_genome + "\n")
        print("\n\nOutput")
        print("=====")
        print(list(min_skew_indices))
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Validate ``DNA.reverse_complement`` on Problem4 training data, then solve the tests.

    Cases and expected outputs come from ``DataReader`` on
    ``Problems/Problem4``. The first ``len(output)`` cases are compared with
    ``output``; the last ``len(output)`` cases are solved under ``Usage``
    measurement, written via ``DataWriter`` and printed.

    Raises:
        Exception: if the reverse complement of a training case differs from
            the expected output.
    """
    dataset_dir = os.path.join('Problems', 'Problem4')
    output_dir = os.path.join("Problems", "Problem4Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        sequence = test_cases[idx]
        expected = output[idx]
        reverse_complement = DNA(sequence).reverse_complement()
        if reverse_complement != expected:
            message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + reverse_complement
            raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    for idx in range(total - len(output), total):
        usage.start()
        sequence = test_cases[idx]
        reverse_complement = DNA(sequence).reverse_complement()
        usage.end()

        writer.write_data(
            idx + 1,
            reverse_complement,
            usage.get_execution_time(),
            usage.get_memory_usage(),
        )

        print("\n\nInput:\n" + sequence)
        print("\n\nOutput")
        print("=====")
        print(reverse_complement)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Validate ``Graph.get_overlap_graph`` on Problem14 training data, then solve the tests.

    K-mer collections and expected adjacency mappings come from
    ``DataReader`` on ``Problems/Problem14``. For the first ``len(output)``
    cases the sorted ``items()`` of the computed and expected mappings are
    compared. The last ``len(output)`` cases are solved under ``Usage``
    measurement, written via ``DataWriter`` and printed.

    Raises:
        Exception: if the sorted adjacency items for a training case differ
            from the expected ones.
    """
    dataset_dir = os.path.join('Problems', 'Problem14')
    output_dir = os.path.join("Problems", "Problem14Solution")

    reader = DataReader(dataset_dir)
    test_cases, output = reader.get_data()

    # --- training pass ---
    for idx in range(len(output)):
        k_mers = test_cases[idx]
        expected = output[idx]
        adj_list = Graph(k_mers).get_overlap_graph()
        if sorted(adj_list.items()) == sorted(expected.items()):
            continue
        # The report shows the raw mappings, not the sorted items.
        message = "Output not matched!\nExpecting: " + str(expected) + "\nFound: " + str(adj_list)
        raise Exception(message)

    print("Passed training data..")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    total = len(test_cases)
    for idx in range(total - len(output), total):
        usage.start()
        k_mers = test_cases[idx]
        adj_list = Graph(k_mers).get_overlap_graph()
        usage.end()

        writer.write_data(idx + 1, adj_list, usage.get_execution_time(), usage.get_memory_usage())

        print("\n\nInput:\n" + str(k_mers) + "\n")
        print("\n\nOutput")
        print("=====")
        print(adj_list)
        print("\n")
        print("======")
        print("Execution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Used: " + str(usage.get_memory_usage()) + " MB")
def main():
    """Validate ``RNA.to_amino_acid`` on the Problem1 training data, then process tests.

    ``DataReader`` supplies training samples, testing samples and an RNA codon
    table from ``Problems/Problem1``. Each training sample is indexed as
    ``(rna_string, expected)`` and the translation is compared with the
    expected value. Each testing sample is translated under ``Usage``
    measurement, written via ``DataWriter`` and printed.

    Raises:
        Exception: if the translation of a training sample differs from the
            expected output.
    """
    dataset_dir = os.path.join('Problems', 'Problem1')
    output_dir = os.path.join("Problems", "Problem1Solution")

    reader = DataReader(dataset_dir)
    training_data, testing_data = reader.get_data()
    codons_table = reader.get_rna_codon_table()

    # --- training pass ---
    for sample in training_data:
        rna_string = sample[0]
        expected = sample[1]
        strand = RNA(rna_string)
        strand.set_codons_table(codons_table)
        amino_acid = strand.to_amino_acid()
        if amino_acid != expected:
            message = "Output not matched!\nExpecting: " + expected + "\nFound: " + amino_acid
            raise Exception(message)

    print("Passed training data..\n\n")

    # --- testing pass ---
    writer = DataWriter(output_dir)
    usage = Usage()
    for sample in testing_data:
        usage.start()
        rna_string = sample[0]
        strand = RNA(rna_string)
        strand.set_codons_table(codons_table)
        amino_acid = strand.to_amino_acid()
        usage.end()

        writer.write_data(
            rna_string,
            amino_acid,
            usage.get_execution_time(),
            usage.get_memory_usage(),
        )

        print("RNA:\n" + rna_string)
        print("Protein:\n" + amino_acid)
        print("\n\nExecution Time: " + str(usage.get_execution_time()) + " s")
        print("Memory Usage: " + str(usage.get_memory_usage()) + " MB")
from data_reader.reader import DataReader
from libs_utils.sklearn_util.sklearn_util import SklearnUtil
from libs_utils.nltk_util.nltk_util import NltkUtil
from preprocessing_util.preprocessing import Preprocessing
import pickle

# Preprocessing script: load the question pairs from the training CSV and run
# them through successive cleaning steps, each on a fresh Preprocessing object
# built from the previous step's result.
file_path = "./data/train.csv"

reader = DataReader(file_path)
reader.get_all_train_data()
questions_pairs, labels = reader.get_train_data()

# Cleaning steps: each one replaces questions_pairs with its own output.
questions_pairs = Preprocessing(questions_pairs).remove_extra_whitespaces()
print("End removing extra spaces")

questions_pairs = Preprocessing(questions_pairs).remove_punctuation()
print("End removing punctuations")

questions_pairs = Preprocessing(questions_pairs).normalize_text()
print("End normalization")

# Both tokenizations start from the same normalized questions_pairs.
tokenized_questions_pairs = Preprocessing(questions_pairs).tokenize()
print("End tokenization")

# Kept as a named object so `preprocessor` stays bound for any later code.
preprocessor = Preprocessing(questions_pairs)
stemmed_questions_pairs = preprocessor.tokenize_with_stemming()
print("End stemmed tokenization")