예제 #1
0
    def _write_collection_xml_files(self):
        """Emit ``collection.xml`` and one custodian XML file per source.

        Reads the collection name from the ``COLLECTION_NAME`` config entry,
        stamps the document with the current local time, and for every entry
        in ``self._sources`` (indexed as id, name, size, item count) writes
        the custodian file and adds an ``xi:include`` pointing at it.

        The assembled list of text fragments is handed to ``self._write_file``
        together with ``<self.path>/collection.xml``.
        """
        print("Writing collection xml files")

        collection_name = config.ConfigReader().get('COLLECTION_NAME')

        fragments = [
            '<?xml version="1.0" ?>',
            '\n<collection xmlns:xi="http://www.w3.org/2001/XInclude">',
            '\n\t<name>{0}</name>'.format(collection_name),
            '\n\t<timestamp>{0}</timestamp>'.format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())),
            '\n\t<custodians>',
        ]

        for source in self._sources:
            src_id, raw_name, raw_size, itemcount = (
                source[0], source[1], source[2], source[3])

            src_dir = mod_comm.get_source_dir(src_id)
            # Filtering the name again is only a safeguard.
            src_name = mod_comm.xml_filter_encoding(raw_name)
            src_size = mod_comm.format_size(raw_size)

            self._write_custodian_xml_file(
                src_id, src_dir, src_name, src_size, itemcount)

            # A descriptive XML comment precedes each include.
            fragments.extend([
                '\n\t\t<!-- id="{0}" items="{1}" source_file_name="{2}" source_file_size="{3}" -->'.format(
                    src_id, itemcount, src_name, src_size),
                '\n\t\t<xi:include href="custodians/{0}.xml" />'.format(src_dir),
            ])

        fragments.extend(['\n\t</custodians>', '\n</collection>', '\n\n'])

        self._write_file(os.path.join(self.path, 'collection.xml'), fragments)
예제 #2
0
파일: extract.py 프로젝트: kamwoods/pluto
def process_errorlog(path_out, BATCH, path_log):
    """Re-run the Tika extractor over numbered error logs.

    Log files are named ``<path_log>-<n>`` starting at ``n = 1``.  For each
    existing log the extractor is launched with that log as input and
    ``<path_log>-<n+1>`` as the next log path.  The loop stops as soon as the
    expected log file does not exist, or after 20 iterations.

    Anything the command returns in ``result[0]`` or ``result[1]`` is printed
    under a "RESULTS" / "ERRORS" heading respectively.
    """
    cnf = config.ConfigReader()
    iteration_cap = 20

    def numbered_log(n):
        return path_log + "-" + str(n)

    iteration = 1
    # Continue only while the current log exists (per the original note, a
    # new log is created by the previous pass) and the cap is not exceeded.
    while os.path.exists(numbered_log(iteration)) and iteration <= iteration_cap:
        current_log = numbered_log(iteration)

        print("PROCESSING iteration " + str(iteration))
        iteration += 1
        next_log = numbered_log(iteration)

        cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 2 {3} {4} {5}".format(
            cnf.get('JAVA_CLASSPATH'), current_log, path_out, BATCH,
            next_log, common.CONFIG_FILE_NAME)
        result = common.exec_cmd(cmd)

        jvm_output = result[0]
        if len(jvm_output) > 0:
            print("\nRESULTS OF JVM EXECUTION")
            print("{0}\n".format(jvm_output))

        # Only execution errors are echoed here; the rest is logged by the
        # Java side.
        jvm_errors = result[1]
        if len(jvm_errors) > 0:
            print("\nERRORS FROM JVM EXECUTION")
            print("{0}\n".format(jvm_errors))

    print("COMPLETED at iteration " + str(iteration - 1))
예제 #3
0
파일: extract.py 프로젝트: kamwoods/pluto
def process_sources(rootpath_in, path_out, BATCH, path_pathlog):
    """Run the Tika extractor over every entry found in ``rootpath_in``.

    The directory listing is taken once so that the pool input, the
    progress total and the serial loop all see the same set of entries.

    Args:
        rootpath_in: directory whose entries are the sources to process.
        path_out: output location passed through to the extractor.
        BATCH: batch identifier passed through to the extractor.
        path_pathlog: log path passed through to the extractor.
    """
    cnf = config.ConfigReader()
    dirs = os.listdir(rootpath_in)
    total = len(dirs)

    pool_args = itertools.izip(dirs, itertools.repeat(rootpath_in),
                               itertools.repeat(path_out),
                               itertools.repeat(BATCH),
                               itertools.repeat(path_pathlog))

    pool = Pool(PARALLEL_PROC)
    try:
        pool.map(process_dir_star, pool_args)
    finally:
        # Release the worker processes; map() has already returned (or
        # raised), so no further tasks will be submitted.
        pool.close()
        pool.join()

    # NOTE(review): this serial pass builds the same TikaExtractor command
    # that process_dir() builds. Presumably process_dir_star wraps
    # process_dir, in which case every source is extracted twice -- confirm
    # whether this loop is still required.
    for count, d in enumerate(dirs, 1):
        print("Processing source {0} of {1} [dir {2}]".format(count, total, d))

        path_in = os.path.join(rootpath_in, d)
        print("From %s to %s" % (path_in, path_out))

        cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4} {5}".format(
            cnf.get('JAVA_CLASSPATH'), path_in, path_out, BATCH, path_pathlog,
            common.CONFIG_FILE_NAME)

        result = common.exec_cmd(cmd)

        # Only execution errors are echoed; the rest is logged by the Java
        # side.
        if len(result[1]) > 0:
            print("\nERRORS FROM JVM EXECUTION from dir {0}:".format(d))
            print("{0}\n".format(result[1]))
예제 #4
0
파일: extract.py 프로젝트: kamwoods/pluto
def store_text(path):
    """Load every non-empty file in ``path`` into the database as a property.

    Each file name is parsed as an integer item id (a non-numeric name
    raises ``ValueError``), and the file's contents are stored against the
    property named by the ``EXTRACTED_TEXT_PROPERTY_NAME`` config entry.
    Work is committed every 100 files because extracted texts can be large.

    Args:
        path: directory containing the extracted text files.
    """
    cnf = config.ConfigReader()
    p_type = common.PROPERTYTYPE_FILEITEM
    p_name = cnf.get('EXTRACTED_TEXT_PROPERTY_NAME')

    db = database.DbTool()
    db.open()
    try:
        property_id = db.get_property_id(p_type, p_name)

        # One listing keeps the reported total and the loop in agreement.
        filenames = os.listdir(path)
        total = len(filenames)

        for count, f in enumerate(filenames, 1):
            if count % 100 == 0:
                print("Processing file {0} of {1} [{2}]".format(count, total, f))
                db.commit()  # commit every 100: extracted texts can be large

            f_path = os.path.join(path, f)
            if os.path.getsize(f_path) > 0:
                item_id = int(f)
                # The context manager closes the handle even if read() fails.
                with open(f_path) as text_file:
                    value = text_file.read()
                # BATCH is not a parameter of this function; it is resolved
                # from module scope (definition not visible here).
                db.create_item_property(item_id, property_id, value, BATCH)

        db.commit()
    finally:
        # Always release the connection, including on error paths.
        db.close()
예제 #5
0
파일: extract.py 프로젝트: kamwoods/pluto
def process_dir(d, rootpath_in, path_out, BATCH, path_pathlog):
    """Run the Tika extractor on a single entry ``d`` of ``rootpath_in``.

    Prints the source and destination, prints the command line it is about
    to run, executes it, and echoes whatever comes back in ``result[1]``
    under an "ERRORS" heading.
    """
    cnf = config.ConfigReader()
    source_path = os.path.join(rootpath_in, d)
    print("From %s to %s" % (source_path, path_out))

    command_args = (cnf.get('JAVA_CLASSPATH'), source_path, path_out, BATCH,
                    path_pathlog, common.CONFIG_FILE_NAME)
    cmd = "java -cp {0} pluto.TikaExtractor {1} {2} 1 {3} {4} {5}".format(
        *command_args)
    print(cmd)
    result = common.exec_cmd(cmd)

    # Only execution errors are echoed; the rest is logged by the Java side.
    error_text = result[1]
    if len(error_text) > 0:
        print("\nERRORS FROM JVM EXECUTION from dir {0}:".format(d))
        print("{0}\n".format(error_text))
예제 #6
0
def _create_output_dirs():
    """Empty the configured output root and recreate its directory layout.

    If ``OUTPUT_ROOT`` exists, its contents are first made removable with
    ``chmod -R 777`` and then deleted with ``rm -rf``.  Afterwards each
    expected subdirectory is created if missing, parents before children.

    Paths are built by appending to ``OUTPUT_ROOT`` directly, so the setting
    presumably has to end with a path separator -- verify against the config.
    """
    cnf = config.ConfigReader()
    output_root = cnf.get('OUTPUT_ROOT')

    def ensure_dir(directory):
        if not os.path.exists(directory):
            os.mkdir(directory)

    # This wipes out the contents of the output directory: chmod first so
    # everything is removable, then remove.
    if os.path.exists(output_root):
        for shell_template in ("chmod -R 777 {0}/*", "rm {0}/* -rf"):
            common.exec_cmd(shell_template.format(output_root))

    # Order matters: os.mkdir does not create intermediate directories.
    layout = (
        "{0}files/",
        "{0}files/original_by_container/",
        "{0}files/processed/",
        "{0}files/text/",
        "{0}pstitems/",
        "{0}pstitems/text/",
    )
    for path_template in layout:
        ensure_dir(path_template.format(output_root))

    ensure_dir("{0}{1}/".format(output_root, cnf.get('TIKA_LOG_RELPATH')))