Example No. 1
def list_archives(archivelist):
    """Print the contents of every archive in ARCHIVELIST."""
    for archive in archivelist:
        if os.path.isdir(archive):
            # a plain directory: print its .xml files in numeric order
            xmlFiles = numsort.sorted_copy(glob(archive + '/*.xml'))
            for f in xmlFiles:
                print f
        else:
            # otherwise treat ARCHIVE as a compact corpus and list its entries
            corpus_exists_or_die(archive)
            list_archive(archive)
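Every example on this page relies on numsort.sorted_copy for a numeric ("natural") sort, so that file10.xml sorts after file2.xml rather than before it. Below is a minimal standard-library sketch of that idea; the project's own numsort module may differ in details.

import re

def natural_key(name):
    # split "file10.xml" into ['file', 10, '.xml'] so digit runs compare numerically
    return [int(part) if part.isdigit() else part
            for part in re.split(r'(\d+)', name)]

files = ["file10.xml", "file2.xml", "file1.xml"]
print(sorted(files))                   # ['file1.xml', 'file10.xml', 'file2.xml']
print(sorted(files, key=natural_key))  # ['file1.xml', 'file2.xml', 'file10.xml']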
Example No. 2
def process_directory(dir, function, recursive=0, passData=True, **keywords):
    """Apply FUNCTION on the data of every .xml file in directory DIR.

    FUNCTION will be called with arguments DATA and FILENAME and any
    optional keyword arguments

    When RECURSIVE is *not* set, only the .xml files in DIR are
    processed.

    When RECURSIVE is true, this function will also look for compact
    corpora and will dive into subdirectories.

    When DIR contains .xml files, no further processing is done except
    for the .xml files.  (implied by the structure of compact corpora)
    """

    # xmlfiles
    xmlfiles = glob(os.path.join(dir, "*.xml"))
    xmlfiles = numsort.sorted_copy(xmlfiles)
    for file in xmlfiles:
        if passData:
            fp = open(file)
            data = fp.read()
            fp.close()
            function(data, file, **keywords)
        else:
            function(file, **keywords)

    # if we found xml files, we don't need to look any further
    if not recursive or xmlfiles:
        return

    # dictzip files
    dictzip_files = glob(os.path.join(dir, "*.data.dz"))
    dictzip_files.sort()

    for file in dictzip_files:
        # only handle compact corpora that have no corresponding directory
        dirname = get_corpus_noext(file)
        if not os.path.isdir(dirname):
            process_compact_corpus(file, function, passData, **keywords)

    # directories
    # FIXME?: skip symlinks here too??  Probably not??
    subdirs = os.listdir(dir)
    subdirs = map(lambda x: os.path.join(dir, x), subdirs)
    subdirs = filter(os.path.isdir, subdirs)
    subdirs.sort()

    for subdir in subdirs:
        process_directory(subdir, function, recursive, passData, **keywords)
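A minimal usage sketch for process_directory, assuming the module above is importable and that a treebank/ directory with .xml files exists; the directory name, callback, and tag argument are made up for illustration. With passData left at True, the callback receives the file contents and the filename plus any extra keyword arguments, as the docstring describes.

def count_tag(data, filename, tag="<node"):
    # hypothetical callback: report how often TAG occurs in each file's contents
    print("%s: %d" % (filename, data.count(tag)))

process_directory("treebank", count_tag, recursive=1, tag="<node")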
Example No. 3
def _thumbnails(directory, check_display):
    # parameters:
    #   check_display is a callback function

    # 
    # directory listing
    #
    import numsort
    files = os.listdir(os.path.join(RESULT_PATH,directory))
    files = numsort.sorted_copy(files)

    #
    # generate html
    #
    output = ""
    for file in files:
        if check_display(file):
            heatmap_thumb = "review/%(directory)s/%(file)s_th.png" % locals()
            heatmap_full  = "review/%(directory)s/%(file)s.png" % locals()
            output += '''<div class="box">
    <a href="%(heatmap_full)s"><img src="%(heatmap_thumb)s" border="0"></a><br/>
    <font size="-1">%(file)s</font><br/>
</div>
''' % locals()
 
    return """
    <style>
        .box {
                float: left;
                padding: 5px;
                width: 160px;
                text-align: center;
                font-size: 0.99em;

              }
    </style>
    %s
""" % output
Example No. 4
def update_archive(sourcedir, targetdir, only_newer=0, remove=0):
    """Update the compact corpus in TARGETDIR with the contents in
    SOURCEDIR.

    The .xml files in SOURCEDIR will be added to the compact corpus.

    The compact corpus files will be TARGETDIR/SOURCEDIR.{index,data.dz}

    When ONLY_NEWER is set, the corpus is only updated when the source
    directory is newer than the existing compact corpus.
    
    When REMOVE is set, the source files in SOURCEDIR will be deleted afterwards.

    A temporary backup will be made of the existing .index and .data.dz files.
    (.bak suffix added)
    """

    verbose = 0

    sourcefiles = glob(os.path.join(sourcedir, "*.xml"))
    if not sourcefiles:
        msg("No files found in %s, skipping directory..." % (sourcedir))
        return

    corpusname = os.path.basename(os.path.normpath(sourcedir))
    targetcorpuspath = os.path.join(targetdir, corpusname)

    # FIXME: should we check for left-over .bak files here?

    # if the compact corpus does not exist, just create a new one
    if not corpus_exists_p(targetcorpuspath):
        msg("Compact corpus %s does not exist, creating new corpus" %
            (targetcorpuspath))
        create_archive(sourcedir,
                       targetdir,
                       force=1,
                       only_newer=only_newer,
                       remove=remove)
        return

    # handle only_newer:
    # if the existing corpus file is newer than the source directory, we're done
    targetIndexFile, targetDataFile = get_corpus_filenames(targetcorpuspath)
    if only_newer:
        if os.path.getmtime(targetDataFile) > os.path.getmtime(sourcedir):
            msg("corpus `%s' is up to date" % (targetDataFile))
            return

    # tell the user we're actually doing something...
    msg("Updating %s with %s..." % (targetcorpuspath, sourcedir))

    # we want to create a temporary compact corpus that we'll
    # rename later
    #
    # - the filenames should be on the same file system as the target
    #   compact corpus
    #
    # - there's no way to have mktemp make a  .index and a .data.dz
    #   with the same basename
    #
    # - so we'll create a subdirectory within the target directory;
    #   this should be on the same filesystem

    # create a temporary directory
    tmpdir = tempfile.mkdtemp(dir=targetdir)

    # do a merge

    # create a temporary compact corpus
    tmpCorpusPath = os.path.join(tmpdir, corpusname)
    (tmpIndexFile, tmpDataFile) = get_corpus_filenames(tmpCorpusPath)
    writer = IndexedCorpusWriter(tmpIndexFile, tmpDataFile)

    # Get the filenames from the corpus we're supposed to be updating.
    (old_indexfile, old_datafile) = get_corpus_filenames(targetcorpuspath)
    oldcorpus = IndexedCorpusReader(old_indexfile, old_datafile)

    old_keys = oldcorpus.entries()
    sourcefiles = numsort.sorted_copy(sourcefiles)

    old_len = len(old_keys)
    update_len = len(sourcefiles)

    old_pos = 0
    update_pos = 0

    while (1):

        if update_pos >= update_len:
            # take the rest of the old corpus
            while old_pos < old_len:
                key = old_keys[old_pos]
                data = oldcorpus.data(key)
                writer.write(key, data)
                old_pos += 1

            # and we're done
            break

        if old_pos >= old_len:
            # take the rest of the new stuff
            while update_pos < update_len:
                name = os.path.basename(sourcefiles[update_pos])
                data = open(sourcefiles[update_pos]).read()
                writer.write(name, data)
                update_pos += 1

            # and we're done
            break

        update_key = os.path.basename(sourcefiles[update_pos])
        old_key = old_keys[old_pos]
        cmp_result = numsort.compare(update_key, old_key)

        if cmp_result == 0:
            # take the file from the update
            data = open(sourcefiles[update_pos]).read()
            writer.write(update_key, data)
            old_pos += 1
            update_pos += 1

        elif cmp_result < 0:
            # take the file from the update
            data = open(sourcefiles[update_pos]).read()
            writer.write(update_key, data)
            update_pos += 1

        else:
            # take the file from the old set
            data = oldcorpus.data(old_key)
            writer.write(old_key, data)
            old_pos += 1

    del writer

    # make a backup of the existing corpus files
    for file in (targetIndexFile, targetDataFile):
        if verbose:
            msg('moving "%s" to "%s"' % (file, file + ".bak"))
        os.rename(file, file + ".bak")

    # move the new corpus files into place
    os.rename(tmpIndexFile, targetIndexFile)
    os.rename(tmpDataFile, targetDataFile)

    ## cleanup

    if verbose:
        msg("cleaning up...")

    # if we're here, the update was successful; remove the .bak files

    # FIXME: there's still a slight chance things mess up on interrupt.

    os.remove(targetIndexFile + ".bak")
    os.remove(targetDataFile + ".bak")

    # rmdir should be enough now, tmpdir should not contain any files
    os.rmdir(tmpdir)

    if remove:
        for file in sourcefiles:
            os.remove(file)
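The central while loop is an ordered two-way merge over the old corpus keys and the freshly sorted source files: whichever key compares smaller is written next, and when the same key occurs on both sides the update wins. A stripped-down sketch of that pattern on plain lists, with a hypothetical three-way compare standing in for numsort.compare:

def merge_keys(old_keys, new_keys, compare):
    # ordered merge of two sorted key lists; on a tie the new key replaces the old one
    merged = []
    i = j = 0
    while i < len(old_keys) or j < len(new_keys):
        if i >= len(old_keys):
            merged.append(("new", new_keys[j]))
            j += 1
        elif j >= len(new_keys):
            merged.append(("old", old_keys[i]))
            i += 1
        else:
            c = compare(new_keys[j], old_keys[i])
            if c <= 0:
                merged.append(("new", new_keys[j]))
                j += 1
                if c == 0:
                    i += 1   # same key on both sides: drop the old entry
            else:
                merged.append(("old", old_keys[i]))
                i += 1
    return merged

print(merge_keys(["1.xml", "3.xml"], ["2.xml", "3.xml"],
                 lambda a, b: (a > b) - (a < b)))
# [('old', '1.xml'), ('new', '2.xml'), ('new', '3.xml')]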
Example No. 5
def create_archive(directory, targetdir, force=0, only_newer=0, remove=0):
    """Maak een archief (compact corpus) van DIRECTORY.

    Alle .xml bestanden in DIRECTORY komen in het archief terecht.

    Creeert in TARGETDIR de files DIRECTORY.index.en DIRECTORY.data.dz.

    When FORCE is not set, existing compact corpora result in a fatal error.

    When ONLY_NEWER is set, only creates a new compact corpus when the
    source directory is newer. 

    When REMOVE is set, the source files in DIRECTORY will be deleted
    afterwards
    """
    startdir = os.getcwd()

    sys.stderr.write("processing `%s'\n" % (directory))

    filelist = glob(os.path.join(directory, '*.xml'))

    if not filelist:
        msg("Error: no .xml files found in directory `%s', skipping directory..." \
              % (directory))
        return

    # the last component of the directory path becomes the name of the corpus
    corpus_name = os.path.basename(os.path.normpath(directory))

    indexfile, datafile = get_corpus_filenames(
        os.path.join(targetdir, corpus_name))

    if os.path.exists(datafile):
        if not force:
            msg("FATAL ERROR: refusing to overwrite `%s', use --force to override"
                % (datafile))
            sys.exit(1)

        if only_newer:
            if os.path.getmtime(directory) > os.path.getmtime(datafile):
                msg("creating `%s'..." % (datafile))
            else:
                msg("corpus `%s' is up to date" % (datafile))
                return
        else:
            msg("compact corpus `%s' exists.  overwriting..." % (datafile))

    filelist = numsort.sorted_copy(filelist)

    basePath = os.path.join(targetdir, corpus_name)
    corpuswriter = IndexedCorpusWriter("%s.index" % basePath,
                                       "%s.data.dz" % basePath)

    offset = 0
    for file in filelist:
        indexName = os.path.basename(file)
        data = open(file).read()

        if len(data.strip()) == 0:
            print >> sys.stderr, "Skipping empty file: %s" % file
            continue

        corpuswriter.write(indexName, data)

    if remove:
        for file in filelist:
            os.remove(file)
        # try to remove the directory too
        try:
            os.rmdir(directory)
        except:
            pass
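A hypothetical invocation: the last path component becomes the corpus name, so the call below would pack the .xml files from treebank/part001 into target/part001.index and target/part001.data.dz (paths are made up for illustration).

# overwrite an existing compact corpus if present, keep the source .xml files
create_archive("treebank/part001", "target", force=1, remove=0)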
Example No. 6
def review(directory=None,REQUEST=None):
    """Review the results"""
  
    if not directory:
        outputdir=open(CURRENT_OUTPUTDIR_PATH).read()   
        outputdir=outputdir.split('results\\')[1]
        REQUEST.response.redirect(REQUEST.URL + '/%s/' % outputdir)
        return
    #
    # Work out a path into a directory  
    #
    traverse_subpath = REQUEST.traverse_subpath
    path = os.path.join( *[RESULT_PATH, directory] + traverse_subpath)
    REQUEST['path']=path
    relative_path=os.path.join( * ([directory] + traverse_subpath) )
   
    # If correlation files need to be computed then
    # do it now
    if os.path.isdir(path):
        emmixgene.calculateCorrelationForDirectory(path)
    else:
        emmixgene.calculateCorrelationForDirectory(os.path.dirname(path))

    # if file is an image, let the image module 
    # handle it
    #
    extension=os.path.splitext(path.lower())[1]
    if extension in (".png",".svg"):
        return _heatmap(path, REQUEST)

    # if file is sstats, then use EmmixResult class
    # help in the display
    if path.lower().endswith('.sstats') or \
       path.lower().endswith('.stats'):
        return review_sstats(path, REQUEST)    
        
    if path.lower().find(".dat.cut_list") >= 0 :
        return review_list(path, REQUEST) 

    #
    # if file exists, set mime type and return the file
    # otherwise, return a directory listing
    #
    if os.path.isfile(path):
        REQUEST.response.setHeader("Mime-Type","text/html")
        body='<pre>%s</pre>' % open(path,"rb").read() 
        URL0=os.path.basename(path)
        return webutil.apply_template('review_sub.htm', locals(), globals())
    elif os.path.isdir(path): 
        if not REQUEST.URL.endswith('/'):
            REQUEST.response.setBase(REQUEST.URL+"/")
    else:
        # path is not a file and not a directory
        # try and make it a directory
        print "xxx making directory " , path, extension
        recurse_mkdir(path)

    # 
    # directory listing
    #
    files = os.listdir(path)
    import numsort
    files = numsort.sorted_copy(files)
    output = ""
    review_correlations = ""
    for file in files:

        if file.endswith(".cor") and not review_correlations:
            # allow people to review correlation heatmaps
            # if .cor files exist
            review_correlations = """<p>View correlation coefficients <a href="/review_corr_thumbnails?directory=%s">as thumbnails</a> </p>""" % relative_path
        description, show_heatmap = _file_info(file)
        if (description != "" and  show_heatmap != 0): 
            # generate heatmaps for these
            output += '''
                <a href="%(file)s">%(file)s</a> <b>%(description)s</b>
                <a href="%(file)s.png" title="png format">[heatmap]</a>
                <a href="%(file)s.svg" title="svg format">[heatmap]</a><br/>
                ''' % locals()
        elif (description !="" and  show_heatmap == 0):
            # only display the file
            output += '<a href="%s">%s</a> <b>%s</b> <br/>' % \
                    (file, file, description)

    files = os.listdir("results")
    other_directories = ""

    for file in files:

        dir_path = os.path.join("results",file)
        if os.path.isdir(dir_path):
            other_directories += '<a href="../%s/">%s</a><br/>\n' % (file, file)
    

    body = """

<h1>Review results</h1>

<h2>You are reviewing: %(path)s</h2>

<p>View heatmaps 
   <a href="/review_data_thumbnails?directory=%(relative_path)s">as thumbnails</a>
</p>

%(review_correlations)s

%(output)s

<p><a href="file://%(path)s" target="_blank">Open</a> the current results folder</p>
<hr/>

<H2>
Review results in other directories
</H2>

%(other_directories)s

<hr/>

<b>Note the following is a mock up only</b>
<h2>Logs</h2>

<font size="-1">
<TABLE>
    <TR>
        <TD>Data File
        </TD>
        <TD>Anon.dat
        </TD>
    </TR>
    <TR>
        <TD>Date run
        </TD>
        <TD>15/10/2001
        </TD>
    </TR>
    <TR>
        <TD>Researcher Notes
        </TD>
        <TD>ddd
        </TD>
    </TR>
    <TR>
        <TD>%(path)s
        </TD>
        <TD>
        </TD>
    </TR>
</TABLE>

</font>

<hr/>
""" % webutil._multimap(globals(), locals()) 
    return webutil.apply_template('review.htm', locals(), globals())
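The directory listing above depends on a _file_info helper that returns a (description, show_heatmap) pair; its definition is not part of this example. A hypothetical stand-in that satisfies the contract the caller expects (an empty description hides the file, and show_heatmap controls whether the heatmap links are emitted):

def _file_info(filename):
    # hypothetical stand-in for the project's helper
    if filename.endswith(".cor"):
        return "correlation matrix", 1    # list the file and add heatmap links
    if filename.endswith(".sstats"):
        return "summary statistics", 0    # list the file only
    return "", 0                          # hide everything else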