# Imports assume the dfa-lib-python package layout used throughout these tests.
from dfa_lib_python.attribute import Attribute
from dfa_lib_python.attribute_type import AttributeType
from dfa_lib_python.set import Set
from dfa_lib_python.set_type import SetType
from dfa_lib_python.transformation import Transformation


def test_set_output_pass():
    attributes = [Attribute("att1", AttributeType.TEXT)]
    sets = [Set("set1", SetType.OUTPUT, attributes)]
    output = [Set("set4", SetType.OUTPUT, attributes)]
    expected_result = [x.get_specification() for x in output]
    transformation = Transformation("tf1", sets=sets)
    transformation.output = output
    assert transformation.output == expected_result


def test_get_specification_pass():
    tag = "tf1"
    sets = [Set("set1", SetType.INPUT, [Attribute("att1", AttributeType.TEXT)])]
    expected_result = {
        "sets": [x.get_specification() for x in sets],
        "tag": tag
    }
    transformation = Transformation(tag, sets)
    assert transformation.get_specification() == expected_result


def test_set_sets_pass():
    attributes = [Attribute("att1", AttributeType.TEXT)]
    sets = [Set("set1", SetType.INPUT, attributes)]
    new_sets = [
        Set("set1", SetType.INPUT, attributes),
        Set("set2", SetType.OUTPUT, attributes)
    ]
    expected_result = [x.get_specification() for x in new_sets]
    transformation = Transformation("tf1", sets=sets)
    transformation.sets = new_sets
    assert transformation.sets == expected_result


def test_get_input_pass():
    attributes = [Attribute("att1", AttributeType.TEXT)]
    sets = [Set("set1", SetType.INPUT, attributes)]
    expected_result = [sets[0].get_specification()]
    transformation = Transformation("tf1", sets=sets)
    assert transformation.input == expected_result
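Taken together, these tests pin down the Transformation accessors (input, output, sets, get_specification). A minimal usage sketch, assuming the same dfa-lib-python API the tests exercise:

# Minimal usage sketch (assumption: same API as in the tests above).
attrs = [Attribute("att1", AttributeType.TEXT)]
tf = Transformation("tf1", sets=[Set("s_in", SetType.INPUT, attrs)])
print(tf.get_specification())  # -> {"sets": [<spec of s_in>], "tag": "tf1"}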
    }
}

# PROVENANCE ############################
dataflow_tag = "prov-df-{}".format(aggreg_unit)
df = Dataflow(dataflow_tag)

logger.info('Initializing the Spark processor')
processador = ProcessadorSparkClass(logger, spark, df, dataflow_tag)

## PROSPECTIVE PROVENANCE
# Transformation to extract the first stats: ExtrairStats1
tf1 = Transformation('load_data')  ## Using the Spark task name
tf1_input = Set("i{}1".format('load_data'), SetType.INPUT, [
    Attribute("datafiles", AttributeType.TEXT),
    Attribute("tables", AttributeType.TEXT),
    Attribute("currenttime", AttributeType.TEXT),
    Attribute("aggregationunit", AttributeType.TEXT),
    Attribute("csvseparator", AttributeType.TEXT)
])
tf1_output = Set("o{}1".format('load_data'), SetType.OUTPUT, [
    Attribute("currenttime", AttributeType.TEXT),
    Attribute("elapsedtime", AttributeType.NUMERIC)
])
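The block above defines tf1's input and output sets but stops before registering them on the dataflow. A plausible continuation, mirroring the registration pattern used in the mafft script below; the df.save() call is an assumption based on the usual dfa-lib-python flow:

# Sketch of the likely continuation (assumption; mirrors the mafft script
# below). df.save() is assumed from the usual dfa-lib-python flow, where it
# sends the prospective specification to the DfAnalyzer service.
tf1.set_sets([tf1_input, tf1_output])
df.add_transformation(tf1)
df.save()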
#-------------------------------------
#dirin_do_ficheiro = sys.argv[0]
#dirin_arg_pas = sys.argv[0:]
###print "The name of the file's input directory is: " + dirin_do_ficheiro
###print "And the arguments passed are: " + str(dirin_arg_pas)

############################
# PROVENANCE ############################
dataflow_tag = "mafft-df"
df = Dataflow(dataflow_tag)

## PROSPECTIVE PROVENANCE
# Transformation to extract the file names: ExtrairNome
tf1 = Transformation("ExtrairNome")
tf1_input = Set("iExtrairNome", SetType.INPUT,
                [Attribute("DIRIN_FILE", AttributeType.FILE)])
tf1_output = Set("oExtrairNome", SetType.OUTPUT,
                 [Attribute("FASTA_FILE", AttributeType.FILE),
                  Attribute("MAFFT_FILE", AttributeType.FILE)])
tf1.set_sets([tf1_input, tf1_output])
df.add_transformation(tf1)

# Transformation to read the file and count the number of sequences: ContarSequencias
tf2 = Transformation("ContarSequencias")
tf2_input = Set("iContarSequencias", SetType.INPUT,
                [Attribute("FASTA_FILE", AttributeType.FILE)])  # reader: file-fasta / att text-file
tf2_output = Set("oContarSequencias", SetType.OUTPUT,
                 [Attribute("NUMERO_SEQUENCIAS", AttributeType.NUMERIC)])
tf2.set_sets([tf2_input, tf2_output])
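Once the prospective specification is registered, dfa-lib-python captures retrospective provenance through Task objects. A minimal sketch, assuming the Task/DataSet/Element classes from dfa_lib_python; dirin_file, fasta_path, and mafft_path are hypothetical placeholders for values the script actually computes:

from dfa_lib_python.task import Task
from dfa_lib_python.dataset import DataSet
from dfa_lib_python.element import Element

# Minimal retrospective sketch (assumption): one execution of ExtrairNome.
# dirin_file, fasta_path, and mafft_path are hypothetical placeholders.
t1 = Task(1, dataflow_tag, "ExtrairNome")
t1.add_dataset(DataSet("iExtrairNome", [Element([dirin_file])]))
t1.begin()
# ... the actual file-name extraction runs here ...
t1.add_dataset(DataSet("oExtrairNome", [Element([fasta_path, mafft_path])]))
t1.end()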