# Example #1
0
# create tag cloud
# sorteddict is indexed as a sequence of (frequency, word) pairs. Reading
# element 0 as the maximum presumes it is ordered by descending frequency
# -- TODO confirm against dh.sortFreqDict.
cloudsize = 40
if sorteddict:
    maxfreq = sorteddict[0][0]
    # The baseline is the first word just outside the cloud. When the text has
    # no more than cloudsize distinct words, fall back to the last entry
    # instead of raising IndexError.
    minfreq = sorteddict[min(cloudsize, len(sorteddict) - 1)][0]
else:
    # nothing to display; the loop below will not run
    maxfreq = minfreq = 0
freqrange = maxfreq - minfreq
tempstring = ''
resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
for k in resorteddict:
    kfreq = k[0]
    # each cloud word links to the in-page anchor of the same name
    klabel = dh.undecoratedHyperlink('#'+k[1], k[1])
    if freqrange:
        scalingfactor = (kfreq - minfreq) / float(freqrange)
    else:
        # every word has the same frequency: avoid dividing by zero and use a
        # mid-range factor (presumably the helper expects 0..1 -- confirm)
        scalingfactor = 0.5
    tempstring += dh.scaledFontSizeSpan(klabel, scalingfactor)
outstring = dh.defaultCSSDiv(tempstring) + '<br />'
 
# create KWIC listings for each item
# Every cloud word gets a heading (a named anchor plus a '[back]' link)
# followed by a <pre> block holding one KWIC line per entry in worddict.
for entry in resorteddict:
    word = entry[1]
    anchor = '<a name="%s">%s</a> ' % (word, word)
    heading = anchor + dh.undecoratedHyperlink('#', '[back]')
    pieces = [dh.defaultCSSDiv(heading, opt='font-size : 24px;'), '<p><pre>']
    for context in worddict[word]:
        pieces.append(dh.prettyPrintKWIC(context))
        pieces.append('<br />')
    pieces.append('</pre></p>')
    outstring += ''.join(pieces)

# open in Firefox
dh.wrapStringInHTML("html-to-tag-cloud-kwic", url, outstring)
#!/usr/bin/env python

# html-to-freq-4.py

import dh

# build the sorted word-frequency pairs for the page
url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'
# url = 'file:///C:/Documents%20and%20Settings/HP_Administrator/Desktop/ProgrammingHistorian/dcb-34298.html'
text = dh.webPageToText(url)
fullwordlist = dh.stripNonAlphaNum(text)
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)

# Google search link built from the words of the first five entries
keywords = [str(pair[1]) for pair in sorteddict[:5]]
gsearch = dh.keywordListToGoogleSearchLink(keywords, 'Google Search n=5')

# one line per entry below the search link, then wrap with HTML
lines = [gsearch, "<br /><br />"]
for pair in sorteddict:
    lines.append(str(pair))
    lines.append("<br />")
outstring = "".join(lines)
dh.wrapStringInHTML("html-to-freq-4", url, outstring)
        
# make directory to store downloaded pages if one doesn't exist
if os.path.exists('iroquois') == 0: os.mkdir('iroquois')

# download a local copy of each bio
urlprefix = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId='
for b in biodict:
    print "Processing bioid: " + str(b)
    url = urlprefix + str(b)
    outfile = 'iroquois/dcb-' + str(b) + '.html'
    if os.path.isfile(outfile) == 0:
        response = urllib2.urlopen(url)
        html = response.read()
        f = open(outfile, 'w')
        f.write(html)
        f.close    
        time.sleep(2)
    else:
        print "File already downloaded"
    sys.stdout.flush()

# create a page of links to local copies
# One row per bio id: a link to the saved file, four non-breaking spaces,
# the value stored in biodict for that id, and a line break.
rows = []
for bioid in biodict:
    localname = 'dcb-' + str(bioid) + '.html'
    link = dh.undecoratedHyperlink('iroquois/' + localname, str(bioid))
    rows.append(link + '&nbsp;' * 4 + biodict[bioid] + "<br />")
outstring = ''.join(rows)
dh.wrapStringInHTML("get-iroquois-bios", searchresultfile, outstring)
# html-to-kwic-2.py

import dh

# create dictionary of n-grams
n = 7
url = 'file:///C:/Documents%20and%20Settings/HP_Administrator/Desktop/ProgrammingHistorian/dcb-34298.html'
# url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'
text = dh.webPageToText(url)

# Pad both ends of the word list with n//2 '#' markers (presumably so words
# near the start and end of the text can still sit mid-window -- confirm).
padding = ['#'] * (n // 2)
fullwordlist = list(padding)
fullwordlist.extend(dh.stripNonAlphaNum(text))
fullwordlist.extend(padding)

ngrams = dh.getNGrams(fullwordlist, n)
worddict = dh.nGramsToKWICDict(ngrams)

# output KWIC and wrap with HTML
# Each entry stored under the target word becomes a Google search link whose
# text is the pretty-printed KWIC line and whose query is that entry's words
# with stopwords removed.
target = 'iroquois'
parts = ['<pre>']
# "in" replaces the deprecated dict.has_key(), which Python 3 removed
if target in worddict:
    for k in worddict[target]:
        linkname = dh.prettyPrintKWIC(k)
        keywords = dh.removeStopwords(k, dh.stopwords)
        parts.append(dh.keywordListToGoogleSearchLink(keywords, linkname))
        # NOTE(review): a '<br />' separator was disabled here; presumably the
        # <pre> block already preserves line breaks -- confirm
else:
    parts.append('Keyword not found in source')
parts.append('</pre>')
outstr = ''.join(parts)
dh.wrapStringInHTML('html-to-kwic-2', url, outstr)
# create tag cloud
# sorteddict is indexed as a sequence of (frequency, word) pairs. Reading
# element 0 as the maximum presumes it is ordered by descending frequency
# -- TODO confirm against dh.sortFreqDict.
cloudsize = 40
if sorteddict:
    maxfreq = sorteddict[0][0]
    # The baseline is the first word just outside the cloud. When the text has
    # no more than cloudsize distinct words, fall back to the last entry
    # instead of raising IndexError.
    minfreq = sorteddict[min(cloudsize, len(sorteddict) - 1)][0]
else:
    # nothing to display; the loop below will not run
    maxfreq = minfreq = 0
freqrange = maxfreq - minfreq
tempstring = ''
resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
for k in resorteddict:
    kfreq = k[0]
    # each cloud word links to the in-page anchor of the same name
    klabel = dh.undecoratedHyperlink('#'+k[1], k[1])
    if freqrange:
        scalingfactor = (kfreq - minfreq) / float(freqrange)
    else:
        # every word has the same frequency: avoid dividing by zero and use a
        # mid-range factor (presumably the helper expects 0..1 -- confirm)
        scalingfactor = 0.5
    tempstring += dh.scaledFontSizeSpan(klabel, scalingfactor)
outstring = dh.defaultCSSDiv(tempstring) + '<br />'

# create KWIC listings for each item
# Every cloud word gets a heading (a named anchor plus a '[back]' link)
# followed by a <pre> block holding one KWIC line per entry in worddict.
for entry in resorteddict:
    word = entry[1]
    anchor = '<a name="%s">%s</a> ' % (word, word)
    heading = anchor + dh.undecoratedHyperlink('#', '[back]')
    pieces = [dh.defaultCSSDiv(heading, opt='font-size : 24px;'), '<p><pre>']
    for context in worddict[word]:
        pieces.append(dh.prettyPrintKWIC(context))
        pieces.append('<br />')
    pieces.append('</pre></p>')
    outstring += ''.join(pieces)

# open in Firefox
dh.wrapStringInHTML("html-to-tag-cloud-kwic", url, outstring)
    
# html-to-tag-cloud.py

import dh

# create sorted dictionary of word-frequency pairs
# url = 'http://www.biographi.ca/EN/ShowBioPrintable.asp?BioId=34298'
url = "file:///C:/Documents%20and%20Settings/HP_Administrator/Desktop/ProgrammingHistorian/dcb-34298.html"
text = dh.webPageToText(url)
fullwordlist = dh.stripNonAlphaNum(text)
wordlist = dh.removeStopwords(fullwordlist, dh.stopwords)
dictionary = dh.wordListToFreqDict(wordlist)
sorteddict = dh.sortFreqDict(dictionary)

# create tag cloud and open in Firefox
# sorteddict is indexed as a sequence of (frequency, word) pairs. Reading
# element 0 as the maximum presumes it is ordered by descending frequency
# -- TODO confirm against dh.sortFreqDict.
cloudsize = 100
if sorteddict:
    maxfreq = sorteddict[0][0]
    # The baseline is the first word just outside the cloud. When the text has
    # no more than cloudsize distinct words, fall back to the last entry
    # instead of raising IndexError.
    minfreq = sorteddict[min(cloudsize, len(sorteddict) - 1)][0]
else:
    # nothing to display; the loop below will not run
    maxfreq = minfreq = 0
freqrange = maxfreq - minfreq
outstring = ""
resorteddict = dh.reSortFreqDictAlpha(sorteddict[:cloudsize])
for k in resorteddict:
    kfreq = k[0]
    klabel = k[1]
    if freqrange:
        scalingfactor = (kfreq - minfreq) / float(freqrange)
    else:
        # every word has the same frequency: avoid dividing by zero and use a
        # mid-range factor (presumably the helper expects 0..1 -- confirm)
        scalingfactor = 0.5
    # surrounding spaces keep adjacent spans separated in the output
    outstring += " " + dh.scaledFontHeatmapSpan(klabel, scalingfactor) + " "
dh.wrapStringInHTML("html-to-tag-cloud", url, dh.defaultCSSDiv(outstring))