Пример #1
0
def scrapeCaps(soup, date):
	"""Collect the photo captions found in a parsed page.

	Every <img> whose src starts with '/i/partypictures/' is traced up to
	its enclosing <table>, and the text of that table's descendants is
	merged into one caption.  If that yields nothing, elements of class
	'photocaption' are tried instead: <div> for pages dated after
	2007-09-04, then <td>.

	Args:
		soup: parsed page exposing the BeautifulSoup findAll API; img.attrs
			is read as a sequence of (name, value) pairs.
		date: page date, compared against dt.date(2007, 9, 4).

	Returns:
		(captions, missed): the list of caption strings, and True when at
		least one multi-part caption could not be merged.
	"""
	captions = []
	missed = False
	ppic = re.compile('/i/partypictures/')

	for img in soup.findAll('img'):
		# start from None for every image so an <img> without a src
		# cannot inherit the src of the previous image
		src = None
		for attr in img.attrs:
			if attr[0] == 'src':
				src = attr[1]
		if not src or not ppic.match(src):
			continue

		# a handful of images aren't inside a table: skip just that image
		# and keep scanning the rest of the page
		tab = _enclosingTable(img)
		if tab is None:
			continue

		cap, lost = _tableCaption(tab)
		missed = missed or lost
		if cap:
			captions.append(cap)

	## ^^ ABOVE COVERS VAST MAJORITY OF CASES ^^

	# A handful of pages keep their captions in other elements.
	if not captions:
		switchdate = dt.date(2007, 9, 4)

		# later, easier layout: captions live in <div class="photocaption">
		if date > switchdate:
			print(" >> DIV HUNT << ")
			captions.extend(_photocaptionStrings(soup, 'div'))

		# otherwise hunt for <td class="photocaption">
		if not captions:
			print("  >> EARLY TIMES << ")
			captions.extend(_photocaptionStrings(soup, 'td'))

	# final warning if nothing found
	if not captions:
		print(" NONE FOUND")
		dan.danpause()

	return captions, missed


def _enclosingTable(tag):
	# Walk up the parent chain to the nearest <table>; None when the chain
	# runs out first (the parent of the root is None, which has no .name).
	try:
		while tag.name != 'table':
			tag = tag.parent
	except AttributeError:
		return None
	return tag


def _tableCaption(tab):
	# Merge the text of a table's descendants into a single caption.
	# Returns (caption, missed); missed is True when an extra text field
	# could not be reconciled with the caption built so far.
	cap = ''
	seenText = False
	missed = False

	for td in tab.findChildren():
		text = td.text
		if not text:
			continue

		if not seenText:
			seenText = True
			cap = text

		# further text fields are often a repeat or a substring of the
		# caption; only act on genuinely new text
		elif text not in cap:
			tcap = cap + text
			num = indNum.search(tcap)

			# such fields usually look like CAPTION2, so keep what follows
			# the indNum match
			if num:
				cap = tcap[num.end():]

			# too much work to split these, only lose around 15
			else:
				missed = True

	return cap, missed


def _photocaptionStrings(soup, tagName):
	# Return the leading string of every <tagName class="photocaption">.
	found = []
	for tag in soup.findAll(tagName, 'photocaption'):
		conts = tag.contents
		if conts:
			# make sure this is a string, or parseCaption crashes
			try:
				if conts[0].string:
					found.append(conts[0].string)
			except TypeError:
				print("that wasn't a NavigableString")
	return found
Пример #2
0
def parseCaption(caption):
	"""Return the names of the people mentioned in a photo caption.

	The caption is cut at the first findAt match and at the first ' in ',
	split into chunks by fpunkt, cleaned of whatever fsr, fjr and fdr match
	(Jr. / Sr. / Dr. according to the original notes), split further on
	' with ' and ' and ', and then filtered: chunks with too few capital
	letters, chunks matching fthe and chunks matching ffrnd are dropped.

	The regexes named above and parser.HumanName come from module scope.

	Args:
		caption: caption text.

	Returns:
		None if the caption has fewer than 2 or more than 249 characters,
		or fewer than 4 words once trimmed; otherwise a (possibly empty)
		list of name strings.
	"""
	debug = False
	rejects = []	# discarded pieces, only reported in the debug output

	if debug:
		print(" ")
		print(caption)

	# ignore short/long descriptions
	if not (1 < len(caption) < 250):
		return None

	## split on 'AT' and 'IN'
	# throw away everything after "so-and-so AT the ball"
	atMatch = findAt.search(caption)
	if atMatch:
		rejects.append(caption[atMatch.start():])
		caption = caption[:atMatch.start()].strip()
	if ' in ' in caption:
		tmp = caption.split(' in ')
		rejects.append(tmp[1])
		caption = tmp[0]
	if debug:
		print("After at/in :  %s" % (caption,))

	### skip if less than 4 words
	if len(caption.split()) < 4:
		if debug:
			print(" nWords < 4")
		return None

	# each chunk is a set of words
	chunks = fpunkt.split(caption)

	### fix Jr. / Sr. / Dr.
	# The cleaned text is written back into the list: rebinding the loop
	# variable would leave chunks untouched.  Each pattern is used both
	# to detect and to split.
	for suffix in (fsr, fjr):
		for ind, chunk in enumerate(chunks):
			if suffix.search(chunk):
				print(chunk)
				chunks[ind] = ''.join(suffix.split(chunk)).strip()
				print(chunks[ind])

	for ind, chunk in enumerate(chunks):
		if fdr.search(chunk):
			print("")
			print("Dr. Sub")
			print(chunk)
			chunks[ind] = ''.join(fdr.split(chunk)).strip()
			print(chunks)

	if debug:
		print(chunks)

	### split at "Bob WITH Kate"
	# the inserted piece is visited by the next iteration of this loop
	for ind, chunk in enumerate(chunks):
		if fwith.search(chunk):
			pieces = chunk.split(' with ')
			# fwith may match text that the literal split does not
			# find; leave such a chunk alone instead of raising
			if len(pieces) > 1:
				chunks[ind] = pieces[0]
				chunks.insert(ind + 1, pieces[1])

	### handling 'AND' ###
	for ind, chunk in enumerate(chunks):
		# a leading "and" implies the tail of a list (", and Bob")
		if fand.match(chunk):
			chunk = chunk[5:]	# everything after the "and "
			chunks[ind] = chunk

		# separating "Husb and Wife Smith", etc
		if ' and ' in chunk:
			temp = chunk.split(' and ')
			name1 = temp[0].strip()
			name2 = temp[1].strip()
			human1 = parser.HumanName(name1)
			human2 = parser.HumanName(name2)

			if debug:
				print("ind =  %s ; temp =  %s" % (ind, temp))
				print("name1 =  %s ;  name2 =  %s" % (name1, name2))
				print("human1 =  %s" % (human1,))
				print("human2 =  %s" % (human2,))

			# "Husband and Wife Smith": lend the shared last name
			if not human1.last:
				name1 = name1 + ' ' + human2.last

			chunks[ind] = name1
			chunks.insert(ind + 1, temp[-1])

	### check for capitalized words to see if this is names ###
	cutList = []
	for chunk in chunks:
		nWords = len(chunk.split())
		if nWords:
			# count capitals per character; no str() conversion, which
			# raises on non-ASCII unicode text under Python 2
			nCaps = sum(1 for ch in chunk if ch.isupper())
			if (nWords - nCaps) > 1:
				cutList.append(chunk)
				if debug:
					print("no caps")

	rejects.append(cutList)
	chunks = [chunk for chunk in chunks if chunk not in cutList]

	if debug:
		print("")
		print("after capitals :")
		print("  reject :  %s" % (rejects,))
		print("  keep : %s" % (chunks,))
		dan.danpause()

	### cut chunks with 'The'
	# filter instead of remove(): every matching chunk goes, including
	# repeated ones
	kept = []
	cutList = []
	for chunk in chunks:
		if fthe.search(chunk):
			if chunk not in cutList:
				cutList.append(chunk)
		else:
			kept.append(chunk)
	rejects.append(cutList)
	chunks = kept

	if debug:
		print(' cutting "the" :  %s' % (chunks,))

	### upon exit ###
	names = []
	if len(chunks) > 1:		# need more than one person
		for chunk in chunks:
			chunk = chunk.strip()
			if len(chunk.split()) > 1:
				# make sure it doesn't say "friend"
				if not ffrnd.search(chunk):
					# condense multispaces
					names.append(re.sub(r'\s+', ' ', chunk))
				else:
					print("rej :  %s" % (chunk,))
					rejects.append(chunk)

	return names
Пример #3
0
def scrapeCaps(soup, date):
    """Collect the photo captions found in a parsed page.

    Every <img> whose src starts with '/i/partypictures/' is traced up to
    its enclosing <table>, and the text of that table's descendants is
    merged into one caption.  If that yields nothing, elements of class
    'photocaption' are tried instead: <div> for pages dated after
    2007-09-04, then <td>.

    Args:
        soup: parsed page exposing the BeautifulSoup findAll API; img.attrs
            is read as a sequence of (name, value) pairs.
        date: page date, compared against dt.date(2007, 9, 4).

    Returns:
        (captions, missed): the list of caption strings, and True when at
        least one multi-part caption could not be merged.
    """
    captions = []
    missed = False
    ppic = re.compile('/i/partypictures/')

    for img in soup.findAll('img'):
        # start from None for every image so an <img> without a src
        # cannot inherit the src of the previous image
        src = None
        for attr in img.attrs:
            if attr[0] == 'src':
                src = attr[1]
        if not src or not ppic.match(src):
            continue

        # a handful of images aren't inside a table: skip just that image
        # and keep scanning the rest of the page
        tab = _enclosingTable(img)
        if tab is None:
            continue

        cap, lost = _tableCaption(tab)
        missed = missed or lost
        if cap:
            captions.append(cap)

    ## ^^ ABOVE COVERS VAST MAJORITY OF CASES ^^

    # A handful of pages keep their captions in other elements.
    if not captions:
        switchdate = dt.date(2007, 9, 4)

        # later, easier layout: captions live in <div class="photocaption">
        if date > switchdate:
            print(" >> DIV HUNT << ")
            captions.extend(_photocaptionStrings(soup, 'div'))

        # otherwise hunt for <td class="photocaption">
        if not captions:
            print("  >> EARLY TIMES << ")
            captions.extend(_photocaptionStrings(soup, 'td'))

    # final warning if nothing found
    if not captions:
        print(" NONE FOUND")
        dan.danpause()

    return captions, missed


def _enclosingTable(tag):
    # Walk up the parent chain to the nearest <table>; None when the chain
    # runs out first (the parent of the root is None, which has no .name).
    try:
        while tag.name != 'table':
            tag = tag.parent
    except AttributeError:
        return None
    return tag


def _tableCaption(tab):
    # Merge the text of a table's descendants into a single caption.
    # Returns (caption, missed); missed is True when an extra text field
    # could not be reconciled with the caption built so far.
    cap = ''
    seenText = False
    missed = False

    for td in tab.findChildren():
        text = td.text
        if not text:
            continue

        if not seenText:
            seenText = True
            cap = text

        # further text fields are often a repeat or a substring of the
        # caption; only act on genuinely new text
        elif text not in cap:
            tcap = cap + text
            num = indNum.search(tcap)

            # such fields usually look like CAPTION2, so keep what follows
            # the indNum match
            if num:
                cap = tcap[num.end():]

            # too much work to split these, only lose around 15
            else:
                missed = True

    return cap, missed


def _photocaptionStrings(soup, tagName):
    # Return the leading string of every <tagName class="photocaption">.
    found = []
    for tag in soup.findAll(tagName, 'photocaption'):
        conts = tag.contents
        if conts:
            # make sure this is a string, or parseCaption crashes
            try:
                if conts[0].string:
                    found.append(conts[0].string)
            except TypeError:
                print("that wasn't a NavigableString")
    return found
Пример #4
0
def parseCaption(caption):
    """Return the names of the people mentioned in a photo caption.

    The caption is cut at the first findAt match and at the first ' in ',
    split into chunks by fpunkt, cleaned of whatever fsr, fjr and fdr match
    (Jr. / Sr. / Dr. according to the original notes), split further on
    ' with ' and ' and ', and then filtered: chunks with too few capital
    letters, chunks matching fthe and chunks matching ffrnd are dropped.

    The regexes named above and parser.HumanName come from module scope.

    Args:
        caption: caption text.

    Returns:
        None if the caption has fewer than 2 or more than 249 characters,
        or fewer than 4 words once trimmed; otherwise a (possibly empty)
        list of name strings.
    """
    debug = False
    rejects = []  # discarded pieces, only reported in the debug output

    if debug:
        print(" ")
        print(caption)

    # ignore short/long descriptions
    if not (1 < len(caption) < 250):
        return None

    ## split on 'AT' and 'IN'
    # throw away everything after "so-and-so AT the ball"
    atMatch = findAt.search(caption)
    if atMatch:
        rejects.append(caption[atMatch.start():])
        caption = caption[:atMatch.start()].strip()
    if ' in ' in caption:
        tmp = caption.split(' in ')
        rejects.append(tmp[1])
        caption = tmp[0]
    if debug:
        print("After at/in :  %s" % (caption,))

    ### skip if less than 4 words
    if len(caption.split()) < 4:
        if debug:
            print(" nWords < 4")
        return None

    # each chunk is a set of words
    chunks = fpunkt.split(caption)

    ### fix Jr. / Sr. / Dr.
    # The cleaned text is written back into the list: rebinding the loop
    # variable would leave chunks untouched.  Each pattern is used both
    # to detect and to split.
    for suffix in (fsr, fjr):
        for ind, chunk in enumerate(chunks):
            if suffix.search(chunk):
                print(chunk)
                chunks[ind] = ''.join(suffix.split(chunk)).strip()
                print(chunks[ind])

    for ind, chunk in enumerate(chunks):
        if fdr.search(chunk):
            print("")
            print("Dr. Sub")
            print(chunk)
            chunks[ind] = ''.join(fdr.split(chunk)).strip()
            print(chunks)

    if debug:
        print(chunks)

    ### split at "Bob WITH Kate"
    # the inserted piece is visited by the next iteration of this loop
    for ind, chunk in enumerate(chunks):
        if fwith.search(chunk):
            pieces = chunk.split(' with ')
            # fwith may match text that the literal split does not
            # find; leave such a chunk alone instead of raising
            if len(pieces) > 1:
                chunks[ind] = pieces[0]
                chunks.insert(ind + 1, pieces[1])

    ### handling 'AND' ###
    for ind, chunk in enumerate(chunks):
        # a leading "and" implies the tail of a list (", and Bob")
        if fand.match(chunk):
            chunk = chunk[5:]  # everything after the "and "
            chunks[ind] = chunk

        # separating "Husb and Wife Smith", etc
        if ' and ' in chunk:
            temp = chunk.split(' and ')
            name1 = temp[0].strip()
            name2 = temp[1].strip()
            human1 = parser.HumanName(name1)
            human2 = parser.HumanName(name2)

            if debug:
                print("ind =  %s ; temp =  %s" % (ind, temp))
                print("name1 =  %s ;  name2 =  %s" % (name1, name2))
                print("human1 =  %s" % (human1,))
                print("human2 =  %s" % (human2,))

            # "Husband and Wife Smith": lend the shared last name
            if not human1.last:
                name1 = name1 + ' ' + human2.last

            chunks[ind] = name1
            chunks.insert(ind + 1, temp[-1])

    ### check for capitalized words to see if this is names ###
    cutList = []
    for chunk in chunks:
        nWords = len(chunk.split())
        if nWords:
            # count capitals per character; no str() conversion, which
            # raises on non-ASCII unicode text under Python 2
            nCaps = sum(1 for ch in chunk if ch.isupper())
            if (nWords - nCaps) > 1:
                cutList.append(chunk)
                if debug:
                    print("no caps")

    rejects.append(cutList)
    chunks = [chunk for chunk in chunks if chunk not in cutList]

    if debug:
        print("")
        print("after capitals :")
        print("  reject :  %s" % (rejects,))
        print("  keep : %s" % (chunks,))
        dan.danpause()

    ### cut chunks with 'The'
    # filter instead of remove(): every matching chunk goes, including
    # repeated ones
    kept = []
    cutList = []
    for chunk in chunks:
        if fthe.search(chunk):
            if chunk not in cutList:
                cutList.append(chunk)
        else:
            kept.append(chunk)
    rejects.append(cutList)
    chunks = kept

    if debug:
        print(' cutting "the" :  %s' % (chunks,))

    ### upon exit ###
    names = []
    if len(chunks) > 1:  # need more than one person
        for chunk in chunks:
            chunk = chunk.strip()
            if len(chunk.split()) > 1:
                # make sure it doesn't say "friend"
                if not ffrnd.search(chunk):
                    # condense multispaces
                    names.append(re.sub(r'\s+', ' ', chunk))
                else:
                    print("rej :  %s" % (chunk,))
                    rejects.append(chunk)

    return names
Пример #5
0
			from data
			group by camis
			)
		group by zipcode
		having ncami > 100
	'''
	# len 208
	# order by count(camis) desc

	cur.execute(q1)
	zgroup = cur.fetchall()

	print " Parsing zipcode results : "
	parseZgroup(zgroup)
	print
	danpause()

		
#####################################################################

if True:
	q2 = '''
	select boroname, count(camis), sum(score), sum(score*score)
	from 
		(
		select boroname, camis, score, max(inspdate)
			from data
				join boro
				on boro.bid = data.boro
				group by camis
		)
Пример #6
0
			from data
			group by camis
			)
		group by zipcode
		having ncami > 100
	'''
    # len 208
    # order by count(camis) desc

    cur.execute(q1)
    zgroup = cur.fetchall()

    print " Parsing zipcode results : "
    parseZgroup(zgroup)
    print
    danpause()

#####################################################################

if True:
    q2 = '''
	select boroname, count(camis), sum(score), sum(score*score)
	from 
		(
		select boroname, camis, score, max(inspdate)
			from data
				join boro
				on boro.bid = data.boro
				group by camis
		)
		group by boroname