def _extract_zipjar(path_in, path_out):
    """Test a zip/jar archive with ``unzip`` and extract it if the test passes.

    The archive at ``path_in`` is first checked with ``unzip -qtP ' '``
    (quiet test, with a single-space password supplied on the command line).
    Whatever the command reports in the second element of its result is
    printed as error ``container-extraction-3``.  Extraction into
    ``path_out`` is attempted only when the first element of the test result
    starts with "No errors detected"; anything reported by the extraction is
    printed as error ``container-extraction-4``.

    NOTE(review): common.exec_cmd presumably returns (stdout, stderr) --
    confirm against its definition.
    """
    check = common.exec_cmd("""unzip -qtP ' ' {0}""".format(path_in))
    check_err = check[1]
    if check_err not in (None, ""):
        print("ERROR: container-extraction-3: " + check_err)

    # Guard clause: do not extract unless the integrity test reported success.
    if not check[0].startswith("No errors detected"):
        return

    unpack = common.exec_cmd(
        """unzip -q {0} -d {1}""".format(path_in, path_out))
    unpack_err = unpack[1]
    if unpack_err not in (None, ""):
        print("ERROR: container-extraction-4: " + unpack_err)
def process_errorlog(path_out, BATCH, path_log):
    """Re-run the Tika extractor (mode 2) over a chain of numbered log files.

    Starting at ``<path_log>-1``, each existing log ``<path_log>-N`` is fed
    to ``pluto.TikaExtractor`` together with the name of the next log,
    ``<path_log>-(N+1)``.  The loop stops as soon as the current log file
    does not exist or more than 20 iterations have been started.

    Non-empty first/second elements of the command result are printed under
    the "RESULTS" / "ERRORS" headings respectively.
    """
    cnf = config.ConfigReader()
    iteration_limit = 20
    iteration = 1

    # Keep going only while the log for this iteration exists (the original
    # comment indicates it is created by the previous iteration) and the
    # iteration cap has not been exceeded.
    while (os.path.exists(path_log + "-" + str(iteration))
           and iteration <= iteration_limit):
        current_log = path_log + "-" + str(iteration)
        print("PROCESSING iteration " + str(iteration))

        iteration += 1
        next_log = path_log + "-" + str(iteration)

        command = "java -cp {0} pluto.TikaExtractor {1} {2} 2 {3} {4} {5}".format(
            cnf.get('JAVA_CLASSPATH'), current_log, path_out, BATCH,
            next_log, common.CONFIG_FILE_NAME)
        outcome = common.exec_cmd(command)
        jvm_output = outcome[0]
        jvm_errors = outcome[1]

        if len(jvm_output) != 0:
            print("\nRESULTS OF JVM EXECUTION")
            print("{0}\n".format(jvm_output))
        # Only execution errors are printed here; the rest is logged on the
        # Java side.
        if len(jvm_errors) != 0:
            print("\nERRORS FROM JVM EXECUTION")
            print("{0}\n".format(jvm_errors))

    print("COMPLETED at iteration " + str(iteration - 1))
def process_sources(rootpath_in, path_out, BATCH, path_pathlog):
    """Run the Tika extractor (mode 1) over every entry of ``rootpath_in``.

    The entries are first handed to a ``Pool`` of ``PARALLEL_PROC`` workers
    through ``process_dir_star``; each work item is the tuple
    ``(d, rootpath_in, path_out, BATCH, path_pathlog)``.  Afterwards every
    entry is processed again sequentially, printing a progress line and any
    error output of the JVM run.

    Parameters:
        rootpath_in: directory whose entries are the sources to process.
        path_out: output location passed through to the extractor.
        BATCH: batch identifier passed through to the extractor.
        path_pathlog: log path passed through to the extractor.
    """
    cnf = config.ConfigReader()

    # Take a single snapshot of the directory so that the pool input, the
    # progress counter and the sequential loop all see the same entries.
    dirs = os.listdir(rootpath_in)
    total = len(dirs)

    pool_args = [(d, rootpath_in, path_out, BATCH, path_pathlog) for d in dirs]
    p = Pool(PARALLEL_PROC)
    try:
        p.map(process_dir_star, pool_args)
    finally:
        # Release the worker processes; previously the pool was never
        # closed and the workers stayed alive for the rest of the run.
        p.close()
        p.join()

    # NOTE(review): process_dir_star presumably wraps process_dir, which
    # issues the same command as the loop below, so each directory appears
    # to be processed twice.  Confirm whether this sequential pass is a
    # leftover before removing it.
    for count, d in enumerate(dirs, 1):
        print("Processing source {0} of {1} [dir {2}]".format(count, total, d))
        path_in = os.path.join(rootpath_in, d)
        print("From %s to %s" % (path_in, path_out))
        cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4} {5}".format(
            cnf.get('JAVA_CLASSPATH'), path_in, path_out, BATCH,
            path_pathlog, common.CONFIG_FILE_NAME)
        result = common.exec_cmd(cmd)
        # Only execution errors are printed; the rest is logged on the
        # Java side.
        if len(result[1]) > 0:
            print("\nERRORS FROM JVM EXECUTION from dir {0}:".format(d))
            print("{0}\n".format(result[1]))
def run():
    """Compute the md5 checksum of every file item and store it in the db.

    For each row returned by ``db.get_fileitems()`` (item id, source id,
    extension) the file ``<rootpath>/<source_id>/<item_id><extension>`` is
    hashed with the external ``md5sum`` command and the first token of its
    output is stored through ``db.update_file_md5sum``.  Progress is printed
    and the transaction committed every 1000 items, and once more at the
    end.

    Items for which ``md5sum`` produces no output (for example a missing or
    unreadable file) are reported and skipped instead of aborting the whole
    run with an IndexError.  The database handle is closed even when an
    error occurs.
    """
    rootpath = common.get_path_files_processed()
    db = database.DbTool()
    db.open()
    try:
        fileitems = db.get_fileitems()
        total = len(fileitems)
        for count, r in enumerate(fileitems):
            item_id = r[0]
            source_id = r[1]
            extension = r[2]
            if count % 1000 == 0:
                print('Processing item {0} of {1} [id# {2}]'.format(
                    count, total, item_id))
                db.commit()

            f_path = os.path.join(rootpath, str(source_id),
                                  str(item_id) + extension)
            cmd = "md5sum \"{0}\"".format(f_path)
            result = common.exec_cmd(cmd)

            # md5sum prints "<checksum>  <path>"; empty output means the
            # command failed, so there is nothing to store for this item.
            data = (result[0] or "").split()
            if not data:
                print("ERROR: md5sum returned no checksum for {0}: {1}".format(
                    f_path, result[1]))
                continue
            db.update_file_md5sum(item_id, data[0])
        db.commit()
    finally:
        db.close()
def process_non_pst(db, path_in, item_id, source_id, level, type_id, extension,
                    path_in_dir):
    """Extract a non-PST container and register whatever it contained.

    A fresh directory for the container is created, the container at
    ``path_in`` is extracted into it via ``_extract_files``, and permissions
    on the result are opened up with ``chmod 770 -R``.  If no file at all
    was extracted the container is marked as not extracted (0) and the
    directory is removed; otherwise it is marked as extracted (1) and the
    extracted files are handed to ``_process_non_pst_extracted`` one nesting
    level deeper.

    Parameters:
        db: database handle providing ``update_container``.
        path_in: path of the container file to extract.
        item_id: id of the container item.
        source_id: id of the source, passed through to the next stage.
        level: current nesting level; ``level + 1`` is passed on.
        type_id: container type, passed to ``_extract_files``.
        extension: not used by this function.
        path_in_dir: target directory for the processed files.
    """
    import shutil

    # Directory that receives the extracted files.
    path_out = common.get_path_files_original_container(item_id)
    os.mkdir(path_out)
    _extract_files(type_id, path_in, path_out)

    # Change permissions for further program execution.
    common.exec_cmd("chmod 770 -R {0}".format(path_out))

    # Check whether any files were extracted.
    extracted_files = sum(
        len(files) for _root, _dirs, files in os.walk(path_out))

    if extracted_files == 0:
        db.update_container(item_id, 0)  # mark as NOT extracted
        # The archive may have produced only empty sub-directories, in which
        # case os.rmdir would fail because path_out is not empty; remove the
        # whole (file-less) tree instead.
        shutil.rmtree(path_out)
    else:
        db.update_container(item_id, 1)  # mark as extracted
        # Add to db and copy to output location: path_out holds the
        # extracted files, path_in_dir receives the processed files.
        _process_non_pst_extracted(
            db, item_id, source_id, level + 1, path_out, path_in_dir)
def _extract_gzip(path_in, path_out):
    """Decompress a gzip file into ``path_out`` using ``gunzip -c``.

    The output file is named after the input file with its last extension
    removed and is written via shell redirection.  Anything reported in the
    second element of the command result is printed as error
    ``container-extraction-1``.
    """
    archive_name = os.path.basename(path_in)
    stem = os.path.splitext(archive_name)[0]
    target = os.path.join(path_out, stem)

    outcome = common.exec_cmd(
        """gunzip -c {0} > {1}""".format(path_in, target))
    error_text = outcome[1]
    if error_text not in (None, ""):
        print("ERROR: container-extraction-1: " + error_text)
def _create_output_dirs():
    """Empty the configured output root and (re)create its sub-directories.

    If ``OUTPUT_ROOT`` exists, its contents are first made fully accessible
    (``chmod -R 777``) and then deleted (``rm -rf``).  Afterwards the
    directories for files, PST items and the Tika logs are created when they
    are missing.  Paths are built by plain concatenation onto the root, so
    the root is presumably configured with a trailing slash -- confirm.
    """
    cnf = config.ConfigReader()
    output_root = cnf.get('OUTPUT_ROOT')

    # This wipes out the contents of the output directory: chmod first so
    # that everything is removable, then remove.
    if os.path.exists(output_root):
        for command_template in ("chmod -R 777 {0}/*", "rm {0}/* -rf"):
            common.exec_cmd(command_template.format(output_root))

    # Parents are listed before their children because os.mkdir creates one
    # level at a time.
    subdir_templates = (
        "{0}files/",
        "{0}files/original_by_container/",
        "{0}files/processed/",
        "{0}files/text/",
        "{0}pstitems/",
        "{0}pstitems/text/",
    )
    for template in subdir_templates:
        directory = template.format(output_root)
        if not os.path.exists(directory):
            os.mkdir(directory)

    tika_log_dir = "{0}{1}/".format(output_root, cnf.get('TIKA_LOG_RELPATH'))
    if not os.path.exists(tika_log_dir):
        os.mkdir(tika_log_dir)
def process_dir(d, rootpath_in, path_out, BATCH, path_pathlog):
    """Run the Tika extractor (mode 1) on one entry of ``rootpath_in``.

    Prints the source/target paths and the full command line, executes the
    command, and prints the second element of the command result when it is
    non-empty.
    """
    cnf = config.ConfigReader()
    source_path = os.path.join(rootpath_in, d)
    print("From %s to %s" % (source_path, path_out))

    command = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4} {5}".format(
        cnf.get('JAVA_CLASSPATH'), source_path, path_out, BATCH,
        path_pathlog, common.CONFIG_FILE_NAME)
    print(command)

    jvm_errors = common.exec_cmd(command)[1]
    # Only execution errors are printed; the rest is logged on the Java side.
    if len(jvm_errors) != 0:
        print("\nERRORS FROM JVM EXECUTION from dir {0}:".format(d))
        print("{0}\n".format(jvm_errors))
def run():
    """Record duplicate file items by comparing md5 checksums.

    Every directory under the processed-files root is hashed with a single
    ``md5sum <dir>/*`` call.  The first item seen with a given checksum is
    remembered; each later item with the same checksum is linked to it via
    ``db.create_item_relationship`` using ``common.IS_DUPLICATE_RELTYPE_ID``
    and the label "md5sum".  The item id is the file name without its
    extension.  The transaction is committed after each directory and the
    elapsed time is printed at the end.
    """
    timer_start = time.time()
    first_seen = {}  # checksum -> id of the first item seen with it
    rootpath = common.get_path_files_processed()
    total = len(os.listdir(rootpath))
    db = database.DbTool()
    db.open()

    for position, source_dir in enumerate(os.listdir(rootpath), 1):
        print("[fi] Processing source {0} of {1} [dir {2}]".format(
            position, total, source_dir))
        pattern = os.path.join(rootpath, source_dir, "*")
        output = common.exec_cmd("md5sum {0}".format(pattern))[0]

        for entry in output.split("\n"):
            if len(entry) == 0:
                continue
            # Each line is split on whitespace: checksum first, path second.
            fields = entry.split()
            checksum = fields[0]
            file_name = os.path.split(fields[1])[1]
            item_id = os.path.splitext(file_name)[0]

            if checksum not in first_seen:
                first_seen[checksum] = item_id
            else:
                db.create_item_relationship(
                    item_id, common.IS_DUPLICATE_RELTYPE_ID,
                    first_seen[checksum], "md5sum")
        db.commit()  # commit for each source

    db.close()
    print(common.display_elapsed(timer_start, "STEP COMPLETED: dedup fileitems"))
def _extract_tar(path_in, path_out):
    """Unpack a tar archive into ``path_out`` using ``tar -xf ... -C``.

    Anything reported in the second element of the command result is
    printed as error ``container-extraction-2``.
    """
    outcome = common.exec_cmd(
        """tar -xf {0} -C {1}""".format(path_in, path_out))
    error_text = outcome[1]
    if error_text not in (None, ""):
        print("ERROR: container-extraction-2: " + error_text)