Example #1
File: load.py  Project: apenwarr/ekb
def _calc_word_frequencies():
    print 'Deleting all wordweights'
    db.run('delete from WordWeights')
    db.run('delete from Words')
    
    totals = {}
    for doc in Doc.search():
        print ' %s' % doc.filename
        textbits = [doc.title, doc.title,  # title gets bonus points
                    doc.filename, doc.expanded_text(lambda x: x, headerdepth=1,
                                                    expandbooks=1)]
        textbits += doc.tags
        fulltext = join(' ', textbits)
        words = [w.lower() for w in re.findall(r"(\w+(?:[.'#%@]\w+)?)",
                                               fulltext)]
        total = len(words)*1.0
        wordcounts = {}
        echo('   %d total words' % total)
        for w in words:
            wordcounts[w] = wordcounts.get(w, 0) + 1
        echo(', %d unique' % len(wordcounts.keys()))
        new = 0
        for w,count in wordcounts.iteritems():
            if not w in totals:
                totals[w] = 0
                new += 1
            totals[w] += count
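            # weight = sqrt(term frequency), i.e. sqrt(count / total words in this doc)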
            db.run('insert into WordWeights (docid, word, weight) '
                   '  values (?,?,?)', doc.id, w, (count/total)**.5)
        echo(', %d new\n' % new)
    print ' %d total unique words' % len(totals)
    print 'Saving words'
    for word,count in totals.iteritems():
        db.run('insert into Words (word, total) values (?,?)', word, count)
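
The weight stored per (doc, word) pair is sqrt(count/total): the square root of the word's term frequency in that document, so frequent words count for more, but only sub-linearly. A minimal standalone sketch of the same computation (word_weights and the sample text are made up for this note, not part of the project):

import re
from math import sqrt

def word_weights(text):
    # Same tokenizer as above: word characters, optionally joined by one of
    # .'#%@ so tokens like "node.js" or "o'brien" stay in one piece.
    words = [w.lower() for w in re.findall(r"(\w+(?:[.'#%@]\w+)?)", text)]
    total = float(len(words))
    counts = {}
    for w in words:
        counts[w] = counts.get(w, 0) + 1
    # Weight = sqrt(term frequency), matching the value inserted into WordWeights.
    return dict((w, sqrt(c / total)) for w, c in counts.items())

print(word_weights("backup and restore: how to restore a backup"))
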
Example #2
File: load.py  Project: apenwarr/ekb
def _calc_related_matrix():
    print 'Deleting all relatedweights'
    db.run('delete from RelatedDocs')
    
    print 'Reading word weights'
    docs = list(Doc.search())
    docwords = {}
    for doc in docs:
        echo('.')
        l = docwords[doc] = {}
        for word,weight in db.run('select word,weight from WordWeights '
                                  '  where docid=?', doc.id):
            l[word] = weight
    print
    
    print 'Calculating related documents'
    correlations = {}
    for doc in docs:
        echo('.')
        l = correlations[doc] = {}
        for doc2 in docs:
            if doc2==doc: continue
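            # relatedness = dot product of the two docs' word-weight vectors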
            bits = (docwords[doc2].get(word,0)*weight
                      for word,weight in docwords[doc].iteritems())
            l[doc2] = sum(bits)
    print
    
    print 'Saving correlations'
    for doc in correlations:
        #print '%s:' % doc.filename
        for doc2,weight in correlations[doc].items():
            db.run('insert or replace into RelatedDocs '
                   '  (from_doc, to_doc, weight) '
                   '  values (?,?,?)', doc.id, doc2.id, weight)
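
The relatedness weight is just the dot product of the two documents' word-weight vectors, with no normalization, so two documents that share many heavily weighted words get a high score. A toy illustration (relatedness and the sample weight dicts are made up for this note):

def relatedness(weights_a, weights_b):
    # Dot product over the first vector's words; words missing from the
    # second vector contribute 0, exactly as in the generator above.
    return sum(w * weights_b.get(word, 0) for word, w in weights_a.items())

a = {'backup': 0.5, 'restore': 0.3, 'disk': 0.1}
b = {'backup': 0.4, 'disk': 0.2}
print(relatedness(a, b))   # 0.5*0.4 + 0.1*0.2 = 0.22
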
Example #3
File: load.py  Project: apenwarr/ekb
def _load_docs():
    seen = {}
    
    titlemap = {}
    for doc in Doc.search():
        if not os.path.exists(os.path.join(DOCDIR, doc.pathname)):
            print 'Deleting old document: %r %r' % (DOCDIR, doc.pathname)
            doc.delete()
        else:
            titlemap[doc.title] = doc
            
    print 'Loading all from "%s"' % DOCDIR
    for (dirpath, dirnames, filenames) in os.walk(DOCDIR):
        assert(dirpath.startswith(DOCDIR))
        for basename in filenames:
            fullpath = os.path.join(dirpath, basename)
            dirfile = fullpath[len(DOCDIR):]
            if (basename[-1] == '~'
                or basename[0] == '.' or fullpath.find('/.') >= 0
                or basename=='Makefile'):
                continue
            echo("  %s" % fullpath)

            if basename in seen:
                raise KeyError('Duplicate basename "%s"' % basename)
            seen[basename] = 1
                
            title = basename

            (title, tags, mtime, text) = parse_doc(dirfile)
            print " (tags=%s)" % repr(tags)

            while title in titlemap and titlemap[title].filename != basename:
                print ('WARNING: Duplicate title:\n  "%s"\n  "%s"'
                       % (basename, titlemap[title].filename))
                title += " [duplicate]"

            d = Doc.create(basename, dirfile, title)
            titlemap[title] = d
            d.use_latest()  # FIXME: lame: this parses a second time
            d.title = title
            d.save()
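
The os.walk loop skips editor backup files, dotfiles, anything under a hidden directory, and Makefiles before handing a path to parse_doc, and disambiguates clashing titles by appending " [duplicate]". A small standalone version of the skip filter (should_skip is a made-up name for illustration):

import os

def should_skip(fullpath):
    basename = os.path.basename(fullpath)
    return (basename.endswith('~')           # editor backup files
            or basename.startswith('.')      # dotfiles
            or '/.' in fullpath              # anything under a hidden directory
            or basename == 'Makefile')       # build files, not documents

print(should_skip('docs/notes.txt~'))      # True
print(should_skip('docs/.git/config'))     # True
print(should_skip('docs/howto/intro.md'))  # False
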