Example #1
import datetime
import re

import soupify  # project helper that wraps BeautifulSoup (module not shown here)


# note: 'model' defaults to a template dict defined at module level elsewhere in the source file
def parse(url, model=model):
    data = soupify.soupify(url)
    # mine your data here!
    model['url'] = url
    model['ID'] = soupify.getID(url)
    model['extrDate'] = datetime.date.today().isoformat()
    # check for redirect:
    if not data:
        model['error'] = 'Redirected'
        return model
    else:
        try:
            # format arguments en masse as {'model.key':['tag',{'attr':'value','attr':'value',...}],...}
            userData = {
                'user': ['a', {
                    "class": "orange nou"
                }],
                'name': ['span', {
                    "class": "fn"
                }],
                'url': ['a', {
                    'rel': 'nofollow me'
                }],
                'aim': ['a', {
                    'rel': 'me',
                    'class': 'url'
                }],
                'isPrinted': ['a', {
                    'href': '/alumniclub'
                }]
            }
            # process arguments en masse
            for k, v in userData.iteritems():
                model[k] = data.findAll(v[0], v[1])
                # if no data was returned, do nothing. Else, grab data contents.
                if model[k]:
                    model[k] = model[k][0].contents[0]
            # field-specific post-processing
            # convert isPrinted to boolean: if we gathered alumniclub data, user has been printed. Else, not.
            if model['isPrinted']:
                model['isPrinted'] = True
            else:
                model['isPrinted'] = False
            # use regex to parse unmarked string for data we want. send data as {'model.key':'regex',...}
            userData = data.findAll(
                'span', {"style": "font-size:12px;"})[0].contents[2]
            # 'data' will mark the group we need.
            userRegex = {
                'age': 'is a (?P<data>.*) year',
                'numDesSco': 'scored (?P<data>.*) submissions',
                'numScoPri': 'helping (?P<data>.*) designs',
                'avgSco': 'of (?P<data>.*), helping',
                'joinDate': 'since (?P<data>.*), has',
                'gender': '(?P<data>girl|boy)'
            }
            for k, v in userRegex.iteritems():
                v = re.search(v, userData)
                if v:
                    model[k] = v.group('data')
                else:
                    model[k] = u''
            # field-specific post-processing
            # eliminate commas, turning "1,000" into "1000"
            for i in ['numDesSco', 'numScoPri']:
                if model[i]:
                    model[i] = model[i].replace(',', '')
            # convert age into timedelta, and subtract it from today to get an approximate birth date
            if model['age']:
                model['age'] = (datetime.datetime.today() - datetime.timedelta(
                    float(model['age']) * 365.25)).isoformat()
            # convert joinDate to a datetime object
            if model['joinDate']:
                date = model['joinDate'].replace(',', '').split(' ')
                date[0] = soupify.month[date[0]]
                model['joinDate'] = datetime.date(int(date[2]), date[0],
                                                  int(date[1])).isoformat()
            # convert the average score to a number, guarding against an empty regex match
            if model['avgSco']:
                model['avgSco'] = float(model['avgSco'])
            # convert from unicode so the resultant csv isn't full of u''
            for k in model.keys():
                if type(model[k]) is unicode:
                    model[k] = str(model[k].encode('ascii', 'ignore'))
            return model
        except Exception:
            model['issue'] = 'Issues'
            print model['ID']
            raise
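
Both examples lean on a small soupify helper module that the snippets use but never show. The sketch below is only an assumption reconstructed from how the examples call it (soupify.soupify(), soupify.getID(), and the soupify.month lookup); the real module may well differ.

# soupify.py -- hypothetical sketch of the helper module the examples rely on
import re
import urllib2

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, matching the findAll() calls above

# month-name lookup used when parsing 'joinDate'
month = {'January': 1, 'February': 2, 'March': 3, 'April': 4,
         'May': 5, 'June': 6, 'July': 7, 'August': 8,
         'September': 9, 'October': 10, 'November': 11, 'December': 12}


def getID(url):
    """Pull the numeric profile ID out of the URL (assumed to be its trailing digit run)."""
    match = re.search(r'(\d+)/?$', url)
    return match.group(1) if match else ''


def soupify(url):
    """Fetch a page and return it as a BeautifulSoup tree, or None if the request fails.

    Example #2 calls a two-argument form, soupify(url, self.model), which apparently also
    pre-fills the model dict with findAll() results; that variant is not sketched here.
    """
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None
    return BeautifulSoup(html)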
Example #2
	def parse(self,url):
		# mine your data here!
		# get data. Anything that requires no post-processing is collected here.
		model = soupify.soupify(url,self.model)
		# get some easy values
		model['url'] = url
		model['ID'] = soupify.getID(url)
		model['extrDate'] = datetime.date.today().isoformat()
		# check for redirect:
		if model['ID'] not in model['url']:
			model['error'] = 'Redirected'
			return model
		else:
			# if no redirect, commence post-processing
			try:
				# convert isPrinted to boolean: if we gathered alumniclub data, user has been printed. Else, not.
				if model['isPrinted']:
					model['isPrinted'] = True
				else:
					model['isPrinted'] = False
				# recall that 'age' stores the html containing data for several other fields
				if model['age']:
					if len(model['age'][0].contents) > 1:
						userData = model['age'][0].contents[2]
					else:
						userData = model['age'][0].contents[0]
				else:
					model['issue'] = "Empty"
					print "Empty results at ID "+model['ID']
					return model
				# use regex to parse that html for data we want. send data as {'model.key':'regex',...}
				# group 'data' will mark the group we need.
				userRegex = {'age': 'is a (?P<data>.*) year',
					'numDesSco':'scored (?P<data>.*) submissions',
					'numScoPri':'helping (?P<data>.*) designs',
					'avgSco':'of (?P<data>.*), helping',
					'joinDate':'since (?P<data>.*), has',
					'gender':'(?P<data>girl|boy)'}
				for k,v in userRegex.iteritems():
					v = re.search(v,userData)
					if v:
						model[k] = v.group('data')
					else:
						model[k] = u''
				# format returned values
				# eliminate commas, turning "1,000" into "1000"
				for i in ['numDesSco','numScoPri']:
					if model[i]:
						model[i] = model[i].replace(',','')
				# convert age into timedelta, and subtract it from today to get an approximate birth date
				if model['age']:
					model['age'] = (datetime.datetime.today() - datetime.timedelta(float(model['age'])*365.25)).isoformat()
				# convert joinDate to a datetime object
				if model['joinDate']:
					date = model['joinDate'].replace(',','').split(' ')
					date[0] = soupify.month[date[0]]
					model['joinDate'] = datetime.date(int(date[2]),date[0],int(date[1])).isoformat()
				# convert the average score to a number, guarding against an empty regex match
				if model['avgSco']:
					model['avgSco'] = float(model['avgSco'])
				# convert from unicode so the resultant csv isn't full of u''
				for k in model.keys():
					if type(model[k]) is unicode:
						model[k] = str(model[k].encode('ascii','ignore'))
				print "Successfully scraped ID "+model['ID']
				return model
			except Exception:
				model['issue'] = 'Issues'
				print "Issues encountered at ID "+model['ID']
				raise
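
Both versions pull their numeric fields out of a single blurb of text with the same trick: every pattern captures its value into a named group called data, so one loop can fill any number of model keys. A quick illustration on an invented bio string (made up for the demo, not taken from the scraped site):

import re

# invented sample of the kind of sentence the patterns target
bio = "is a 27 year old girl who, since March 3, 2009, has scored 1,204 submissions"

patterns = {'age': 'is a (?P<data>.*) year',
            'numDesSco': 'scored (?P<data>.*) submissions',
            'gender': '(?P<data>girl|boy)'}

for field, pattern in patterns.iteritems():
    match = re.search(pattern, bio)
    print field, '->', (match.group('data') if match else '')
# prints, in arbitrary dict order:
#   age -> 27
#   numDesSco -> 1,204
#   gender -> girl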
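
Finally, a hedged sketch of how the returned models might be flushed to the CSV the comments mention; the Scraper class name, URL pattern, and field list are placeholders rather than anything from the original project.

import csv

scraper = Scraper()  # hypothetical class owning the parse() method from Example #2
fields = ['ID', 'url', 'extrDate', 'user', 'name', 'aim', 'isPrinted', 'gender',
          'age', 'joinDate', 'numDesSco', 'numScoPri', 'avgSco', 'error', 'issue']

with open('users.csv', 'wb') as out:
    writer = csv.DictWriter(out, fieldnames=fields, extrasaction='ignore')
    writer.writeheader()
    for user_id in range(1, 101):
        # URL pattern is a placeholder; substitute the real profile URL scheme
        writer.writerow(scraper.parse('http://example.com/users/%d' % user_id))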