def download(oaid, rec):
    """Download the file referenced in an OAI record and validate its contents.

    Parameters:
        oaid: record identifier, used as the name of the download sub-directory.
        rec: raw OAI-PMH record XML as a string.

    Returns:
        True after a successful download + validation run, False when the
        record reports ``idDoesNotExist`` without ``hasFile:true``, and
        None when no download URL was found (falls off the end).
    """
    if ('<error code="idDoesNotExist">' not in rec) or ("hasFile:true" in rec):
        # Split the record on the URL-identifier tags; every other element of
        # `splits` is then a candidate URL.
        splits = re.split(
            '<dc:identifier xsi:type="tel:URL">|</dc:identifier>', rec)
        for url in splits:
            if "/download/" in url:
                r = requests.get(url)
                # Target filename comes from the Content-Disposition header.
                d = str(r.headers['Content-disposition'])
                filename = str(re.findall("filename=(.+)",
                                          d)[0]).replace('"', '')
                # NOTE(review): everything below only runs when the directory
                # did not exist yet — presumably to skip already-downloaded
                # records; confirm this is intended.
                if not os.path.exists(downloadpath + oaid):
                    os.makedirs(downloadpath + oaid)
                    target = downloadpath + oaid + "/" + filename
                    wget.download(url, target)
                    # If no PDF is available this is almost always a zip file;
                    # unpack it so the contents can be validated.
                    if fnmatch.filter(os.listdir(downloadpath + oaid),
                                      "*.zip"):
                        with zipfile.ZipFile(target, 'r') as zip_ref:
                            zip_ref.extractall(downloadpath + oaid)
                    filelist = os.listdir(downloadpath + oaid)
                    for file in filelist:
                        # BUG FIX: ("zip" or "json") evaluates to just "zip",
                        # so json files slipped through to the validator.
                        # Explicitly skip both zip and json files.
                        if "zip" not in file and "json" not in file:
                            validator.main(downloadpath + oaid + "/" + file,
                                           downloadpath + oaid + "/")
                    return True
    else:
        return False
示例#2
0
 def test_relations_happy(self) -> None:
     """Happy path: validating relations.yaml exits without an error code."""
     args = ["", "tests/data/relations.yaml"]
     with unittest.mock.patch('sys.argv', args):
         exit_codes: List[int] = []
         with unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
             validator.main()
             self.assertEqual(exit_codes, [])
示例#3
0
 def test_relations_happy(self) -> None:
     """Happy path: each known-good relations file validates cleanly."""
     good_files = (
         "tests/data/relations.yaml",
         "tests/data/relation-gazdagret-filter-invalid-good.yaml",
         "tests/data/relation-gazdagret-filter-invalid-good2.yaml",
     )
     for good_file in good_files:
         with unittest.mock.patch('sys.argv', ["", good_file]):
             exit_codes: List[int] = []
             with unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
                 validator.main()
                 self.assertEqual(exit_codes, [])
def main():
    """Validate test vectors against truth vectors and write a CSV report.

    Reads the mounted work-directory inputs, locates exactly one shp/geojson
    file in each of the 'truth' and 'test' input ports, then runs the
    validator with the requested options.

    Raises:
        ValueError: when an input port contains zero or multiple vector files
            (raised by convert_type on bad option values as well, presumably).
    """

    # get the inputs
    input_truth = '/mnt/work/input/truth'
    input_test = '/mnt/work/input/test'
    string_ports = '/mnt/work/input/ports.json'

    # create output directory
    # FIX: exist_ok avoids the unidiomatic, race-prone
    # `if os.path.exists(...) is False` check.
    out_path = '/mnt/work/output/data'
    os.makedirs(out_path, exist_ok=True)

    # read the inputs
    with open(string_ports) as ports:
        inputs = json.load(ports)
    iou = inputs.get('iou', 'False')
    out_csv = inputs.get('out_csv', 'results.csv')

    # convert the inputs to the correct dtypes
    iou = convert_type(iou, bool, 'Boolean')
    out_csv = convert_type(out_csv, str, 'String')

    def _single_vector(port_dir, port_name):
        # Return the one shp/geojson file in `port_dir`; the truth and test
        # ports previously duplicated this scan verbatim.
        vectors = glob.glob1(port_dir, '*.shp')
        vectors += glob.glob1(port_dir, '*.geojson')
        if len(vectors) == 0:
            raise ValueError(
                "No shp or geojson files found in input data port '%s'"
                % port_name)
        if len(vectors) > 1:
            raise ValueError(
                "Multiple shp or geojson found in input data port '%s'"
                % port_name)
        return os.path.join(port_dir, vectors[0])

    # get the shp or geojson in the input truth and test folders
    in_truth = _single_vector(input_truth, 'truth')
    in_test = _single_vector(input_test, 'test')

    # set the output file path
    out = os.path.join(out_path, out_csv)

    print("Validating test data against truth data...")
    # run the processing
    validator.main([in_test,
                    in_truth,
                    out,
                    '-u', iou])
    print("Validation process completed successfully.")
示例#5
0
 def assert_failure_msg(self, path: str, expected: str) -> None:
     """Asserts that a given input fails with a given error message."""
     # Patch the argument vector.
     with unittest.mock.patch('sys.argv', ["", path]):
         # Redirect standard output into a buffer.
         output = io.StringIO()
         with unittest.mock.patch('sys.stdout', output):
             # Record exit codes instead of exiting.
             exit_codes: List[int] = []
             with unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
                 validator.main()
                 self.assertEqual(exit_codes, [1])
                 output.seek(0)
                 self.assertEqual(output.read(), expected)
示例#6
0
 def test_relation_happy(self) -> None:
     """Tests the happy relation path."""
     # Patch the argument vector.
     args = ["", "tests/data/relation-gazdagret.yaml"]
     with unittest.mock.patch('sys.argv', args):
         # Redirect standard output into a buffer.
         output = io.StringIO()
         with unittest.mock.patch('sys.stdout', output):
             # Record exit codes instead of exiting.
             exit_codes: List[int] = []
             with unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
                 validator.main()
                 # Success: no exit code and no output at all.
                 self.assertEqual(exit_codes, [])
                 output.seek(0)
                 self.assertEqual(output.read(), "")
示例#7
0
 def test_relations_missing_osmrelation(self) -> None:
     """Tests the missing-osmrelation relations path."""
     # Patch the argument vector.
     args = ["", "tests/data/relations-missing-osmrelation/relations.yaml"]
     with unittest.mock.patch('sys.argv', args):
         # Redirect standard output into a buffer.
         output = io.StringIO()
         with unittest.mock.patch('sys.stdout', output):
             # Record exit codes instead of exiting.
             exit_codes: List[int] = []
             with unittest.mock.patch('sys.exit', mock_sys_exit(exit_codes)):
                 validator.main()
                 self.assertEqual(exit_codes, [1])
                 output.seek(0)
                 expected = ("failed to validate "
                             "tests/data/relations-missing-osmrelation/relations.yaml"
                             ": missing key 'gazdagret.osmrelation'\n")
                 self.assertEqual(output.read(), expected)
示例#8
0
 def assert_failure_msg(self, path: str, expected: str) -> None:
     """Asserts that a given input fails with a given error message."""
     output = io.StringIO()
     exit_code = validator.main(["", path], output)
     self.assertEqual(exit_code, 1)
     output.seek(0)
     self.assertEqual(output.read(), expected)
示例#9
0
 def test_relation_happy(self) -> None:
     """Tests the happy relation path."""
     # Run the validator directly with an injected argv and output stream.
     output = io.StringIO()
     exit_code = validator.main(
         ["", "tests/data/relation-gazdagret.yaml"], output)
     # Success: zero exit code and no output at all.
     self.assertEqual(exit_code, 0)
     output.seek(0)
     self.assertEqual(output.read(), "")
示例#10
0
def try_password(password):
    """Show the candidate password (optionally) and check whether it is correct.

    Parameters:
        password: candidate password string to try.

    Returns:
        True when the validator's result for this password equals the
        expected code stored in app.args.code, False otherwise.
    """

    if app.args.show == "true":
        # \r keeps the progress output on a single line.
        print("\r Trying %s..." % password, end="")

    # FIX: direct comparison instead of the redundant
    # `True if <cond> else False`.
    return validator.main(password, app.args.command) == app.args.code
示例#11
0
 def test_relations_missing_osmrelation(self) -> None:
     """Tests the missing-osmrelation relations path."""
     # Run the validator directly with an injected argv and output stream.
     output = io.StringIO()
     exit_code = validator.main(
         ["", "tests/data/relations-missing-osmrelation/relations.yaml"],
         output)
     self.assertEqual(exit_code, 1)
     output.seek(0)
     expected = ("failed to validate "
                 "tests/data/relations-missing-osmrelation/relations.yaml"
                 ": missing key 'gazdagret.osmrelation'\n")
     self.assertEqual(output.read(), expected)
示例#12
0
 def test_relations_happy(self) -> None:
     """Tests the happy relations path."""
     # Each known-good file must validate with a zero exit code.
     for good_path in (
             "tests/data/relations.yaml",
             "tests/data/relation-gazdagret-filter-invalid-good.yaml",
             "tests/data/relation-gazdagret-filter-invalid-good2.yaml",
             "tests/data/relation-gazdagret-filter-valid-good.yaml",
             "tests/data/relation-gazdagret-filter-valid-good2.yaml",
     ):
         exit_code = validator.main(["", good_path], io.StringIO())
         self.assertEqual(exit_code, 0)
示例#13
0
def main():
	"""Interactive spelling-suggestion loop (Python 2).

	Reads words from stdin and prints the best suggestion found by trying,
	in order: the word itself, its de-duplicated form, duplicate-letter
	variants, and vowel substitutions of those variants. 'validate()'
	delegates to the external validator; 'exit()' quits the loop.
	"""
	print "Options: 1) Enter a word to check its  best spelling suggestion"
	print "2) Enter validate() to validate an input"
	print "3) Enter exit() to exit the program"
	while 1:
		# got becomes 1 once a suggestion has been printed for this input.
		got =0
		
		inputstr =raw_input("> ").lower()
		if 'exit()' in inputstr:
			break
		elif 'validate()' in inputstr:
			# Delegate input collection to the validator, then validate it.
			ipst=validator.main()
			validate(ipst)
			continue
		else:

			# 1) Exact match: the word is already known.
			if check_presence(inputstr):
				got =1
				print inputstr
				continue
			
			# 2) The word with repeated letters collapsed.
			extr_unique = extract_unique(inputstr)
			if check_presence(extr_unique):
				got =1
				print extr_unique
				continue

			# 3) Candidate words built by varying duplicated letters.
			poss_dups = possibile_dups(inputstr)
			for p in poss_dups:
				if check_presence(p):
					got =1
					print p
					break
				
			# 4) Vowel substitutions of each candidate — only counted when
			# nothing was found in step 3 (got==0 guard).
			for p in poss_dups:
				for w in vowels(p):
					if check_presence(w) and got==0:
						got =1
						print w
						break
		if got ==0:
			print "NO SUGGESTION"
			continue
示例#14
0
def main(file_name, **kwargs):
    """Route a landed file into HDFS and trigger validation.

    Looks the file up in the registry to classify it as raw, duplicate or
    spam; copies it to the matching HDFS path; confirms the copy by size
    before deleting the landing-zone original; then runs the validator for
    raw files unless 'copy_only' is set.

    Parameters:
        file_name: name of the file in the landing zone; an 'sbx_' prefix
            marks a sandbox file (the registry key omits the prefix).
        **kwargs: forwarded to reg.file_path(); recognises 'copy_only' to
            skip validation after the copy.

    Raises:
        ValueError: when the source and target file sizes do not match.
    """
    metadata = {}
    write_path = instance_guid = stage = header = ""
    file_type = "raw"

    # Log the PID to help in debugging
    logger.info('Pid : ' + str(os.getpid()))
    try:
        # attempt to get the registry entry.  If Alfred isn't working properly
        # we'll get a connection error
        if file_name.startswith('sbx_'):
            # sandbox lookup: strip "sbx_" off the file name, keys don't have
            # the prefix
            metadata = reg.get_metadata(file_name[4:], stage='sandbox')
        else:
            metadata = reg.get_metadata(file_name)

    except requests.ConnectionError as e:
        # log response error
        logger.error('Failed to connect to Alfred : ' + str(e))
        exit(e)

    if 'stage' in metadata:
        stage = metadata['stage']

    # get the count of the number of rows in the source file
    row_count = file_len(landing_zone + '/' + file_name)
    logger.info('row count = ' + str(row_count))

    if 'file' in metadata and metadata['file'] != {}:
        # a registry entry exists for the file, process it.
        # FIX: compute the write path once and log the move once — the
        # original called reg.file_path() twice and logged "Moving ..." twice.
        write_path = reg.file_path(metadata, **kwargs)
        logger.info("Moving " + file_name + " to " + write_path)

        if stage == 'sandbox' and hdfs.path.exists(write_path + '/' +
                                                   file_name):
            # in the case of sandbox files previous data is always overwritten
            # FIX: use the module logger, not the root `logging` module.
            logger.info("Sandbox file already exists, overwriting")
            # Delete from HDFS is not strictly needed if the table was created
            # as external
            hdfs.rmr(write_path + '/' + file_name)
            # use a temporary hive connection to delete the sandbox table
            hive = validator.Hive()
            hive.drop_table(metadata, stage=stage)
            # drop the reference to close the hive connection
            hive = None

        # check to make sure the file doesn't already exist
        if not hdfs.path.exists(write_path + '/' + file_name):
            # if it doesn't, write it to the appropriate location
            hdfs.put(landing_zone + '/' + file_name,
                     write_path + '/' + file_name)
            # create second copy for work table unless its a sandbox file
            if stage != 'sandbox':
                # create work copy write path
                work_write_path = reg.file_path(metadata,
                                                type='work',
                                                **kwargs)
                # delete the work file if there is already one present
                if hdfs.path.exists(work_write_path):
                    logger.info("Deleting existing work files at  " +
                                work_write_path)
                    hdfs.rmr(work_write_path)
                # write the file to the work file location
                hdfs.put(landing_zone + '/' + file_name,
                         work_write_path + '/' + file_name)
            else:
                # sandbox file: we might need the header row, and it is far
                # easier to get this now than from hdfs
                header = get_header(file_name)
            # register that the raw file was written
            instance_guid = reg.register_raw(metadata, file_name, file_type,
                                             row_count)
        else:
            # if the file does exist, its treated as a duplicate
            logger.info("Duplicate file")
            file_type = "duplicate"

            # set up duplicate write path
            write_path = reg.dup_file_path(metadata)

            # FIX: the overwrite and first-write branches duplicated the same
            # put/log/register code; only the pre-delete differs.
            if hdfs.path.exists(write_path + '/' + file_name):
                # delete the existing duplicate before writing the new one
                # (FIX: module logger, not the root `logging` module).
                logger.info("duplicate file already exists, overwriting")
                hdfs.rmr(write_path + '/' + file_name)

            hdfs.put(landing_zone + '/' + file_name,
                     write_path + '/' + file_name)
            logger.info("writing duplicate file " + write_path + '/' +
                        file_name)
            reg.register_raw(metadata, file_name, file_type, row_count)

    else:
        # no registry entry for this file, move it to spam
        file_type = "spam"

        # set up write path for spam
        write_path = reg.spam_file_path(metadata)
        logger.info("Moving " + file_name + " to " + write_path + '/' +
                    file_name)

        # FIX: same de-duplication as above — delete any existing spam copy,
        # then write and register once.
        if hdfs.path.exists(write_path + '/' + file_name):
            # (FIX: module logger, not the root `logging` module).
            logger.info("spam file already exists, overwriting")
            hdfs.rmr(write_path + '/' + file_name)

        hdfs.put(landing_zone + '/' + file_name,
                 write_path + '/' + file_name)
        logger.info("writing spam file " + write_path + '/' + file_name)
        reg.register_raw(metadata, file_name, file_type, row_count)

    # confirm that source file and target file have the same size, regardless
    # of spam, duplicate or normal
    if hdfs.path.exists(write_path + '/' + file_name) and \
            hdfs.path.getsize(write_path + '/' + file_name) == os.stat(landing_zone + '/' + file_name).st_size:
        # if the file sizes match, delete the source file
        os.remove(landing_zone + '/' + file_name)
        logger.info("Landing zone file removed " + landing_zone + '/' +
                    file_name)
    else:
        # if the file sizes do not match, delete the target file and rename
        # the source file so it doesn't get reprocessed repeatedly
        logger.error(
            "Source and target file sizes didn't match, not deleting source.")
        hdfs.rmr(write_path + '/' + file_name)
        os.rename(landing_zone + '/' + file_name,
                  landing_zone + '/' + file_name + '.err')
        raise ValueError("Source and target file sizes don't match")

    # copy only is an option set up in case there's ever a reason not to
    # process beyond moving the file to HDFS
    if 'copy_only' not in kwargs or not kwargs['copy_only']:
        if file_type == "raw":  # raw, meaning not spam or duplicate. No reason to validate those
            if stage != 'sandbox':
                # if its not a sandbox file proceed with full validation
                logger.info("Validate " + file_name)
                validator.main(file_name, instance_guid, metadata)
            else:
                # sandbox file: pass header/stage so validator only creates
                # the table (FIX: redundant `elif stage == 'sandbox'`).
                logger.info("Sandbox validate " + file_name)
                validator.main(file_name,
                               instance_guid,
                               metadata,
                               header=header,
                               stage=stage)

    # log that this PID is ending
    logger.info('Pid ending : ' + str(os.getpid()))