def _enclosing_table(tag):
    """Walk up from *tag* and return the first ancestor named 'table'.

    Returns None when the walk runs off the top of the tree (a handful of
    images are not inside a table); that shows up as an AttributeError when
    ``.name`` is read on a node that has none.
    """
    node = tag
    try:
        while node.name != 'table':
            node = node.parent
    except AttributeError:
        return None
    return node


def _caption_from_table(table):
    """Build one caption string from the text found below *table*.

    Returns ``(caption, missed)``.  ``missed`` is True when an extra text
    field could not be merged because the module-level ``indNum`` pattern
    found nothing to split on.
    """
    cap = ''
    missed = False
    seen_text = False
    for node in table.findChildren():
        text = node.text  # read once; the original re-read it for every test
        if not text:
            continue
        if not seen_text:
            # the first non-empty text field starts the caption
            seen_text = True
            cap = text
        elif text not in cap:
            # a further text field that is neither a repeat nor a substring
            tcap = cap + text
            num = indNum.search(tcap)
            if num:
                # per the original note these usually look like CAPTION2,
                # so keep only what follows the indNum match
                cap = tcap[num.end():]
            else:
                # too much work to split these; only a few are lost
                missed = True
    return cap, missed


def _photocaption_strings(soup, tag_name):
    """Collect the leading string of every *tag_name* tagged 'photocaption'."""
    found = []
    for node in soup.findAll(tag_name, 'photocaption'):
        conts = node.contents
        if not conts:
            continue
        # make sure this is a string, or parseCaption crashes
        try:
            text = conts[0].string
            if text:
                found.append(text)
        except TypeError:
            print("that wasn't a NavigableString")
    return found


def scrapeCaps(soup, date):
    """Collect the raw caption strings from one parsed page.

    soup -- parsed page; must offer ``findAll`` and tags whose ``attrs``
            iterate as (name, value) pairs, as the original indexing did
    date -- page date, compared against ``dt.date(2007, 9, 4)`` to decide
            whether the div-based fallback applies

    Returns ``(captions, missed)``: the list of caption strings and a flag
    that is True when at least one table held extra text that could not be
    merged into its caption.

    Prints progress markers for the fallback searches and calls
    ``dan.danpause()`` when no caption is found at all.
    """
    captions = []
    missed = False
    ppic = re.compile('/i/partypictures/')

    for img in soup.findAll('img'):
        for key, src in img.attrs:
            if key != 'src' or not ppic.match(src):
                continue
            table = _enclosing_table(img)
            if table is None:
                # image is not inside a table: give up on this image
                break
            cap, lost = _caption_from_table(table)
            missed = missed or lost
            if cap:
                captions.append(cap)

    ## ^^ ABOVE COVERS VAST MAJORITY OF CASES ^^
    # A handful of pages have the captions in alternate tables.
    # These fallbacks only run when the table search found nothing.
    if not captions:
        switchdate = dt.date(2007, 9, 4)
        ## later, easier method of finding captions
        if date > switchdate:
            print(" >> DIV HUNT << ")
            captions.extend(_photocaption_strings(soup, 'div'))
        ## trying a hunt for td labelled with photocaption
        if not captions:
            print(" >> EARLY TIMES << ")
            captions.extend(_photocaption_strings(soup, 'td'))
        ## final warning if nothing found
        if not captions:
            print(" NONE FOUND")
            dan.danpause()

    return captions, missed
def parseCaption(caption):
    """Return the names of the people mentioned in a photo caption.

    The caption is cut at the first ``findAt`` match and at the first
    ' in ', split into chunks with ``fpunkt``, stripped of Sr./Jr./Dr.
    markers, split again on ' with ' and ' and ', and then filtered so only
    chunks that look like multi-word names survive.

    Relies on names defined elsewhere in the module: the compiled patterns
    ``findAt``, ``fpunkt``, ``fsr``, ``fjr``, ``fdr``, ``fwith``, ``fand``,
    ``fthe`` and ``ffrnd``, plus ``parser.HumanName`` and ``dan.danpause``.

    caption -- caption text (byte or unicode string)

    Returns None when the caption is not between 2 and 249 characters, has
    fewer than four words after trimming, or yields fewer than two chunks.
    Otherwise returns the list of accepted names, which may hold fewer than
    two entries if chunks were rejected in the last step.

    Prints are single-argument calls so the output is the same text the
    print statements produced under Python 2.
    """
    debug = False   # flip to True for a trace of every stage
    rejects = []    # discarded text; only reported in the debug trace
    names = []

    if debug:
        print(" ")
        print(caption)

    # ignore short/long descriptions
    if not 1 < len(caption) < 250:
        return None

    ## split on 'AT' and 'IN'
    # throw away everything after "so-and-so AT the ball"
    at_match = findAt.search(caption)
    if at_match:
        rejects.append(caption[at_match.start():])
        caption = caption[:at_match.start()].strip()
    if ' in ' in caption:
        tmp = caption.split(' in ')
        rejects.append(tmp[1])
        caption = tmp[0]
    if debug:
        print("After at/in :  %s" % (caption,))

    ### skip if less than 4 words
    if len(caption.split()) < 4:
        if debug:
            print(" nWords < 4")
        return None

    # each chunk is a set of words
    chunks = fpunkt.split(caption)

    ### fix Jr. / Sr. / Dr.
    # Results are written back into the list.  The Sr./Jr. loops used to
    # rebind the loop variable only, so they had no effect, and the Sr. loop
    # split with the Jr. pattern.
    for ind, chunk in enumerate(chunks):
        if fsr.search(chunk):
            print(chunk)
            chunks[ind] = ''.join(fsr.split(chunk)).strip()
            print(chunks[ind])
    for ind, chunk in enumerate(chunks):
        if fjr.search(chunk):
            print(chunk)
            chunks[ind] = ''.join(fjr.split(chunk)).strip()
            print(chunks[ind])
    for ind, chunk in enumerate(chunks):
        if fdr.search(chunk):
            print("")
            print("Dr. Sub")
            print(chunk)
            chunks[ind] = ''.join(fdr.split(chunk)).strip()
            print(chunks)
    if debug:
        print(chunks)

    ### split at "Bob WITH Kate"
    for ind, chunk in enumerate(chunks):
        if fwith.search(chunk):
            # Swap the chunk for all of its pieces.  This is a no-op when the
            # literal ' with ' is absent (pieces[1] used to raise IndexError)
            # and keeps a third or later piece instead of dropping it.
            chunks[ind:ind + 1] = chunk.split(' with ')

    ### handling 'AND' ###
    # checks first word of split (implies a list, and Bob)
    for ind, chunk in enumerate(chunks):
        if fand.match(chunk):
            # drop the first five characters, i.e. the leading "and"
            chunk = chunk[5:]
            chunks[ind] = chunk
        # separating "Husb and Wife Smith", etc
        if ' and ' in chunk:
            temp = chunk.split(' and ')
            name1 = temp[0].strip()
            name2 = temp[1].strip()
            human1 = parser.HumanName(name1)
            human2 = parser.HumanName(name2)
            if debug:
                print("ind =  %s ; temp =  %s" % (ind, temp))
                print("name1 =  %s ; name2 =  %s" % (name1, name2))
                print("human1 =  %s" % (human1,))
                print("human2 =  %s" % (human2,))
            # "Husband and Wife Smith": lend the shared last name to the
            # first person when the first half has none of its own
            if not human1.last:
                first_person = name1 + ' ' + human2.last
            else:
                first_person = name1
            chunks[ind] = first_person
            chunks.insert(ind + 1, temp[-1])

    ### check for capitalized words to see if this is names ###
    cutList = []
    for chunk in chunks:
        nWords = len(chunk.split())
        if nWords:
            # count per character: str(chunk) raised UnicodeEncodeError on
            # non-ASCII unicode captions under Python 2
            nCaps = sum(1 for ch in chunk if ch.isupper())
            if (nWords - nCaps) > 1:
                cutList.append(chunk)
                if debug:
                    print("no caps")
    rejects.extend(cutList)
    for cut in cutList:
        chunks.remove(cut)
    if debug:
        print("")
        print("after capitals :")
        print(" reject :  %s" % (rejects,))
        print(" keep : %s" % (chunks,))
        dan.danpause()

    ### cut chunks with 'The'
    # filter in one pass so repeated identical chunks are all removed
    rejects.extend(chunk for chunk in chunks if fthe.search(chunk))
    chunks = [chunk for chunk in chunks if not fthe.search(chunk)]
    if debug:
        print(' cutting "the" :  %s' % (chunks,))

    ### upon exit ###
    if len(chunks) <= 1:
        # need more than one person
        return None
    for chunk in chunks:
        chunk = chunk.strip()
        if len(chunk.split()) > 1:
            # make sure it doesn't say "friend"
            if not ffrnd.search(chunk):
                # condense multispaces
                names.append(re.sub(r'\s+', ' ', chunk))
            else:
                print("rej :  %s" % (chunk,))
                rejects.append(chunk)
    return names
def _enclosing_table(tag):
    """Walk up from *tag* and return the first ancestor named 'table'.

    Returns None when the walk runs off the top of the tree (a handful of
    images are not inside a table); that shows up as an AttributeError when
    ``.name`` is read on a node that has none.
    """
    node = tag
    try:
        while node.name != 'table':
            node = node.parent
    except AttributeError:
        return None
    return node


def _caption_from_table(table):
    """Build one caption string from the text found below *table*.

    Returns ``(caption, missed)``.  ``missed`` is True when an extra text
    field could not be merged because the module-level ``indNum`` pattern
    found nothing to split on.
    """
    cap = ''
    missed = False
    seen_text = False
    for node in table.findChildren():
        text = node.text  # read once; the original re-read it for every test
        if not text:
            continue
        if not seen_text:
            # the first non-empty text field starts the caption
            seen_text = True
            cap = text
        elif text not in cap:
            # a further text field that is neither a repeat nor a substring
            tcap = cap + text
            num = indNum.search(tcap)
            if num:
                # per the original note these usually look like CAPTION2,
                # so keep only what follows the indNum match
                cap = tcap[num.end():]
            else:
                # too much work to split these; only a few are lost
                missed = True
    return cap, missed


def _photocaption_strings(soup, tag_name):
    """Collect the leading string of every *tag_name* tagged 'photocaption'."""
    found = []
    for node in soup.findAll(tag_name, 'photocaption'):
        conts = node.contents
        if not conts:
            continue
        # make sure this is a string, or parseCaption crashes
        try:
            text = conts[0].string
            if text:
                found.append(text)
        except TypeError:
            print("that wasn't a NavigableString")
    return found


def scrapeCaps(soup, date):
    """Collect the raw caption strings from one parsed page.

    soup -- parsed page; must offer ``findAll`` and tags whose ``attrs``
            iterate as (name, value) pairs, as the original indexing did
    date -- page date, compared against ``dt.date(2007, 9, 4)`` to decide
            whether the div-based fallback applies

    Returns ``(captions, missed)``: the list of caption strings and a flag
    that is True when at least one table held extra text that could not be
    merged into its caption.

    Prints progress markers for the fallback searches and calls
    ``dan.danpause()`` when no caption is found at all.
    """
    captions = []
    missed = False
    ppic = re.compile('/i/partypictures/')

    for img in soup.findAll('img'):
        for key, src in img.attrs:
            if key != 'src' or not ppic.match(src):
                continue
            table = _enclosing_table(img)
            if table is None:
                # image is not inside a table: give up on this image
                break
            cap, lost = _caption_from_table(table)
            missed = missed or lost
            if cap:
                captions.append(cap)

    ## ^^ ABOVE COVERS VAST MAJORITY OF CASES ^^
    # A handful of pages have the captions in alternate tables.
    # These fallbacks only run when the table search found nothing.
    if not captions:
        switchdate = dt.date(2007, 9, 4)
        ## later, easier method of finding captions
        if date > switchdate:
            print(" >> DIV HUNT << ")
            captions.extend(_photocaption_strings(soup, 'div'))
        ## trying a hunt for td labelled with photocaption
        if not captions:
            print(" >> EARLY TIMES << ")
            captions.extend(_photocaption_strings(soup, 'td'))
        ## final warning if nothing found
        if not captions:
            print(" NONE FOUND")
            dan.danpause()

    return captions, missed
def parseCaption(caption):
    """Return the names of the people mentioned in a photo caption.

    The caption is cut at the first ``findAt`` match and at the first
    ' in ', split into chunks with ``fpunkt``, stripped of Sr./Jr./Dr.
    markers, split again on ' with ' and ' and ', and then filtered so only
    chunks that look like multi-word names survive.

    Relies on names defined elsewhere in the module: the compiled patterns
    ``findAt``, ``fpunkt``, ``fsr``, ``fjr``, ``fdr``, ``fwith``, ``fand``,
    ``fthe`` and ``ffrnd``, plus ``parser.HumanName`` and ``dan.danpause``.

    caption -- caption text (byte or unicode string)

    Returns None when the caption is not between 2 and 249 characters, has
    fewer than four words after trimming, or yields fewer than two chunks.
    Otherwise returns the list of accepted names, which may hold fewer than
    two entries if chunks were rejected in the last step.

    Prints are single-argument calls so the output is the same text the
    print statements produced under Python 2.
    """
    debug = False   # flip to True for a trace of every stage
    rejects = []    # discarded text; only reported in the debug trace
    names = []

    if debug:
        print(" ")
        print(caption)

    # ignore short/long descriptions
    if not 1 < len(caption) < 250:
        return None

    ## split on 'AT' and 'IN'
    # throw away everything after "so-and-so AT the ball"
    at_match = findAt.search(caption)
    if at_match:
        rejects.append(caption[at_match.start():])
        caption = caption[:at_match.start()].strip()
    if ' in ' in caption:
        tmp = caption.split(' in ')
        rejects.append(tmp[1])
        caption = tmp[0]
    if debug:
        print("After at/in :  %s" % (caption,))

    ### skip if less than 4 words
    if len(caption.split()) < 4:
        if debug:
            print(" nWords < 4")
        return None

    # each chunk is a set of words
    chunks = fpunkt.split(caption)

    ### fix Jr. / Sr. / Dr.
    # Results are written back into the list.  The Sr./Jr. loops used to
    # rebind the loop variable only, so they had no effect, and the Sr. loop
    # split with the Jr. pattern.
    for ind, chunk in enumerate(chunks):
        if fsr.search(chunk):
            print(chunk)
            chunks[ind] = ''.join(fsr.split(chunk)).strip()
            print(chunks[ind])
    for ind, chunk in enumerate(chunks):
        if fjr.search(chunk):
            print(chunk)
            chunks[ind] = ''.join(fjr.split(chunk)).strip()
            print(chunks[ind])
    for ind, chunk in enumerate(chunks):
        if fdr.search(chunk):
            print("")
            print("Dr. Sub")
            print(chunk)
            chunks[ind] = ''.join(fdr.split(chunk)).strip()
            print(chunks)
    if debug:
        print(chunks)

    ### split at "Bob WITH Kate"
    for ind, chunk in enumerate(chunks):
        if fwith.search(chunk):
            # Swap the chunk for all of its pieces.  This is a no-op when the
            # literal ' with ' is absent (pieces[1] used to raise IndexError)
            # and keeps a third or later piece instead of dropping it.
            chunks[ind:ind + 1] = chunk.split(' with ')

    ### handling 'AND' ###
    # checks first word of split (implies a list, and Bob)
    for ind, chunk in enumerate(chunks):
        if fand.match(chunk):
            # drop the first five characters, i.e. the leading "and"
            chunk = chunk[5:]
            chunks[ind] = chunk
        # separating "Husb and Wife Smith", etc
        if ' and ' in chunk:
            temp = chunk.split(' and ')
            name1 = temp[0].strip()
            name2 = temp[1].strip()
            human1 = parser.HumanName(name1)
            human2 = parser.HumanName(name2)
            if debug:
                print("ind =  %s ; temp =  %s" % (ind, temp))
                print("name1 =  %s ; name2 =  %s" % (name1, name2))
                print("human1 =  %s" % (human1,))
                print("human2 =  %s" % (human2,))
            # "Husband and Wife Smith": lend the shared last name to the
            # first person when the first half has none of its own
            if not human1.last:
                first_person = name1 + ' ' + human2.last
            else:
                first_person = name1
            chunks[ind] = first_person
            chunks.insert(ind + 1, temp[-1])

    ### check for capitalized words to see if this is names ###
    cutList = []
    for chunk in chunks:
        nWords = len(chunk.split())
        if nWords:
            # count per character: str(chunk) raised UnicodeEncodeError on
            # non-ASCII unicode captions under Python 2
            nCaps = sum(1 for ch in chunk if ch.isupper())
            if (nWords - nCaps) > 1:
                cutList.append(chunk)
                if debug:
                    print("no caps")
    rejects.extend(cutList)
    for cut in cutList:
        chunks.remove(cut)
    if debug:
        print("")
        print("after capitals :")
        print(" reject :  %s" % (rejects,))
        print(" keep : %s" % (chunks,))
        dan.danpause()

    ### cut chunks with 'The'
    # filter in one pass so repeated identical chunks are all removed
    rejects.extend(chunk for chunk in chunks if fthe.search(chunk))
    chunks = [chunk for chunk in chunks if not fthe.search(chunk)]
    if debug:
        print(' cutting "the" :  %s' % (chunks,))

    ### upon exit ###
    if len(chunks) <= 1:
        # need more than one person
        return None
    for chunk in chunks:
        chunk = chunk.strip()
        if len(chunk.split()) > 1:
            # make sure it doesn't say "friend"
            if not ffrnd.search(chunk):
                # condense multispaces
                names.append(re.sub(r'\s+', ' ', chunk))
            else:
                print("rej :  %s" % (chunk,))
                rejects.append(chunk)
    return names
from data group by camis ) group by zipcode having ncami > 100 ''' # len 208 # order by count(camis) desc cur.execute(q1) zgroup = cur.fetchall() print " Parsing zipcode results : " parseZgroup(zgroup) print danpause() ##################################################################### if True: q2 = ''' select boroname, count(camis), sum(score), sum(score*score) from ( select boroname, camis, score, max(inspdate) from data join boro on boro.bid = data.boro group by camis )
from data group by camis ) group by zipcode having ncami > 100 ''' # len 208 # order by count(camis) desc cur.execute(q1) zgroup = cur.fetchall() print " Parsing zipcode results : " parseZgroup(zgroup) print danpause() ##################################################################### if True: q2 = ''' select boroname, count(camis), sum(score), sum(score*score) from ( select boroname, camis, score, max(inspdate) from data join boro on boro.bid = data.boro group by camis ) group by boroname