import datetime
import hashlib
import logging
import re
import sys
import urllib2

from BeautifulSoup import BeautifulSoup
from django.db.models import Model
from django.utils.encoding import smart_str

from mturk.main.models import HitGroupContent

# Helper functions such as get_group_url(), get_allhit_url(), grab_error(),
# remove_whitespaces(), strip_html() and fuse() are assumed to be defined
# elsewhere in this module.


def callback_details(data, **kwargs):
    if not isinstance(data, list):
        raise Exception('::callback_details() must be called with one list argument')
    errors = []
    # Process each record, filling in the HTML of groups that do not have it yet
    for record in data:
        status = record['HitGroupStatus']
        if status['hit_group_content'].html != '':
            continue
        group_id = status['group_id']
        if not status['hit_group_content'].group_id_hashed:
            try:
                logging.info("Downloading group details for: %s" % group_id)
                html = None
                # Download the group's preview page
                preview_html = urllib2.urlopen(get_group_url(group_id)).read()
                # Look for an iframe
                iframe_url = re.search(r"<iframe.*?src=\"(.*?)\"", preview_html)
                # Fetch the iframe source if the page embeds one; otherwise the
                # HTML must already be in the <div id="hit-wrapper"> element
                if iframe_url:
                    html = urllib2.urlopen(iframe_url.group(1)).read()
                else:
                    html = str(BeautifulSoup(preview_html).find('div', {'id': 'hit-wrapper'}))
                if html:
                    status['hit_group_content'].html = html
                preview_html = None
            except Exception:
                logging.error("Failed to process group details for %s (%s)" %
                              (group_id, sys.exc_info()[0].__name__))
                errors.append(grab_error(sys.exc_info()))
    return {'data': data, 'errors': errors}
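# A minimal illustration (hypothetical input) of the iframe extraction above:
# when the preview page embeds the HIT in an iframe, the regex captures the
# value of its src attribute; otherwise callback_details() falls back to the
# <div id="hit-wrapper"> contents.
#
#   >>> m = re.search(r"<iframe.*?src=\"(.*?)\"",
#   ...               '<iframe frameborder="0" src="http://example.com/hit">')
#   >>> m.group(1)
#   'http://example.com/hit'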
def callback_database(data, **kwargs):
    errors = []

    ######################################################################################
    # Saves a given model, reusing an already stored duplicate when one exists.
    #
    # In:
    #     model - instance of a Model object
    ######################################################################################
    def save(model):
        fields = {}
        for field in model._meta.get_all_field_names():
            # Only the possibly most defining fields are used for the lookup
            if 'id' in field and field != 'id' and field != 'first_crawl':
                try:
                    fields[field] = model.serializable_value(field)
                except Exception:
                    pass
        clazz = __import__('mturk.main.models', {}, {},
                           [model.__class__.__name__]).__getattribute__(model.__class__.__name__)
        try:
            return clazz.objects.get(**fields)
        except clazz.MultipleObjectsReturned:
            model.save()
            return model
        except clazz.DoesNotExist:
            model.save()
            return model

    ######################################################################################
    # Saves every Model object nested in the given record.
    #
    # In:
    #     fields - dictionary representing a record
    ######################################################################################
    def save_recursively(fields):
        for key in fields.keys():
            if isinstance(fields[key], Model):
                fields[key] = save(fields[key])
        return fields

    if not isinstance(data, list):
        raise Exception('::callback_database() must be called with one list argument')
    logging.info('Saving results to database (%s records)' % len(data))
    for record in data:
        try:
            if isinstance(record, dict):
                for model in record.keys():
                    try:
                        record[model] = save_recursively(record[model])
                        clazz = __import__('mturk.main.models', {}, {}, [model]).__getattribute__(model)
                        obj = clazz(**record[model])
                        try:
                            obj.save()
                        except Exception:
                            for key, value in record[model].items():
                                if isinstance(value, Model):
                                    try:
                                        # value may not have been saved by save_recursively()
                                        # because it was already in the database
                                        value.delete()
                                    except Exception:
                                        pass
                            raise Exception("Failed to save object with values:\n%s"
                                            % record[model]), None, sys.exc_info()[2]
                    except Exception:
                        errors.append(grab_error(sys.exc_info()))
            elif isinstance(record, Model):
                save(record)
        except Exception:
            errors.append(grab_error(sys.exc_info()))
    return {'data': [], 'errors': errors}
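# The dictionary records handled above are expected to look like the output of
# callback_allhit() (hypothetical values shown): the outer key names the model
# class to instantiate, and nested Model instances are persisted first by
# save_recursively().
#
#   {'HitGroupStatus': {'group_id': 'ABC123', 'hits_available': 15,
#                       'page_number': 1, 'inpage_position': 3,
#                       'hit_expiration_date': datetime.datetime(2009, 5, 1, 0, 0),
#                       'hit_group_content': <HitGroupContent instance>}}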
def callback_allhit(pages, **kwargs):
    if not isinstance(pages, list):
        raise Exception('::callback_allhit() must be called with one list argument')

    def remove_newline_fields(lst):
        # Strip every bare "\n" entry from a list of soup contents
        while True:
            try:
                lst.remove("\n")
            except ValueError:
                break
        return lst

    data = []
    errors = []
    # Process every page
    for page_number in pages:
        try:
            # Download the page
            logging.info("Downloading page: %s" % page_number)
            page_url = get_allhit_url(page_number)
            logging.debug("Downloading %s" % page_url)
            response = urllib2.urlopen(page_url)
            html = response.read()
            soup = BeautifulSoup(html)

            # Parse the HIT groups' list
            table = soup.find('table', cellpadding='0', cellspacing='5', border='0', width='100%')
            if table is None:
                # The listing occasionally comes back empty; retry up to three times
                for _ in range(3):
                    logging.warn("Soup returned an empty table for page %s. Trying once more" % page_number)
                    response = urllib2.urlopen(page_url)
                    html = response.read()
                    soup = BeautifulSoup(html)
                    table = soup.find('table', cellpadding='0', cellspacing='5', border='0', width='100%')
                    if table is not None:
                        break
            if table is None:
                logging.warn("Soup returned an empty table. This should not happen. Skipping page")
                continue

            table.contents = remove_newline_fields(table.contents)

            # Parse and fetch information about each group
            for i_group in range(0, len(table.contents)):
                logging.debug("Processing group %s on page %s" % (i_group, page_number))
                try:
                    group_html = table.contents[i_group]

                    # Title
                    title = group_html.find('a', {'class': 'capsulelink'})
                    if title is not None:
                        try:
                            title = str(title.contents[0])
                        except Exception:
                            title = unicode(title.contents[0])
                    try:
                        title = unicode(remove_whitespaces(title))
                    except Exception:
                        title = ''

                    fields = group_html.findAll('td', {'align': 'left', 'valign': 'top',
                                                       'class': 'capsule_field_text'})
                    if len(fields) == 7:

                        # Requester's name and ID
                        requester_html = remove_newline_fields(fields[0].contents)[0]
                        requester_name = unicode(requester_html.contents[0])
                        requester_id = requester_html['href']
                        start = requester_id.index('requesterId=') + 12
                        stop = requester_id.index('&state')
                        requester_id = requester_id[start:stop]

                        # HIT group expiration date
                        hit_expiration_date = remove_newline_fields(fields[1].contents)[0]
                        hit_expiration_date = remove_whitespaces(strip_html(hit_expiration_date))
                        hit_expiration_date = hit_expiration_date[:hit_expiration_date.index('(') - 2]
                        hit_expiration_date = datetime.datetime.strptime(hit_expiration_date, '%b %d, %Y')

                        # Time allotted
                        time_alloted = remove_newline_fields(fields[2].contents)[0]
                        time_alloted = remove_whitespaces(strip_html(time_alloted))
                        time_alloted = int(time_alloted[:time_alloted.index(' ')])

                        # Reward (strip the leading currency symbol)
                        reward = float(remove_newline_fields(fields[3].contents)[0][1:])

                        # HITs available
                        hits_available = int(remove_newline_fields(fields[4].contents)[0])

                        # Description
                        description = unicode(remove_newline_fields(fields[5].contents)[0])

                        # Keywords
                        keywords_raw = remove_newline_fields(fields[6].contents)
                        keywords = []
                        for keyword_html in keywords_raw:
                            try:
                                keywords.append(keyword_html.contents[0])
                            except Exception:
                                continue
                        keywords = unicode(fuse(keywords, ','))

                        # Qualifications
                        qualifications = ''
                        qfields = group_html.findAll('td', {'style': 'padding-right: 2em; white-space: nowrap;'})
                        if len(qfields) > 0:
                            qfields = [remove_whitespaces(unicode(remove_newline_fields(qfield.contents)[0]))
                                       for qfield in qfields]
                            qualifications = fuse(qfields, ', ')
                        qfields = None

                        # Occurrence date
                        occurrence_date = datetime.datetime.now()

                        # Group ID
                        group_id = group_html.find('span', {'class': 'capsulelink'})
                        group_id_hashed = False
                        if group_id is not None:
                            group_id = remove_newline_fields(group_id.contents)[0]
                            if 'href' in group_id._getAttrMap():
                                start = group_id['href'].index('groupId=') + 8
                                stop = group_id['href'].index('&')
                                group_id = group_id['href'][start:stop]
                            else:
                                # No public group URL; derive a surrogate ID by
                                # hashing the group's defining fields
                                group_id_hashed = True
                                composition = "%s;%s;%s;%s;%s;%s;%s;" % (title, requester_id,
                                                                         time_alloted, reward,
                                                                         description, keywords,
                                                                         qualifications)
                                composition = smart_str(composition)
                                group_id = hashlib.md5(composition).hexdigest()

                        # Check whether the processed content is already stored in the database
                        try:
                            logging.debug("group_id=%s; requester=%s; title=%s; desc=%s; ta=%s; reward=%s" %
                                          (group_id, requester_id, title, description, time_alloted, reward))
                            hit_group_content = HitGroupContent.objects.get(group_id=group_id,
                                                                            requester_id=requester_id,
                                                                            title=title,
                                                                            description=description,
                                                                            time_alloted=time_alloted,
                                                                            reward=reward)
                        except HitGroupContent.DoesNotExist:
                            hit_group_content = HitGroupContent(**{
                                'title': title,
                                'requester_id': requester_id,
                                'requester_name': requester_name,
                                'time_alloted': time_alloted,
                                'reward': reward,
                                'html': '',
                                'description': description,
                                'keywords': keywords,
                                'qualifications': qualifications,
                                'occurrence_date': occurrence_date,
                                'group_id': group_id,
                                'group_id_hashed': group_id_hashed
                            })

                        data.append({
                            'HitGroupStatus': {
                                'group_id': group_id,
                                'hits_available': hits_available,
                                'page_number': page_number,
                                'inpage_position': i_group + 1,
                                'hit_expiration_date': hit_expiration_date,
                                'hit_group_content': hit_group_content
                            }
                        })

                    fields = None
                    group_html = None
                except Exception:
                    logging.error("Failed to process group %s on page %s (%s)" %
                                  (i_group, page_number, sys.exc_info()[0].__name__))
                    errors.append(grab_error(sys.exc_info()))

            table = None
            soup = None
            html = None
        except Exception:
            logging.error("Failed to process page %s (%s)" % (page_number, sys.exc_info()[0].__name__))
            errors.append(grab_error(sys.exc_info()))

    return {'data': data, 'errors': errors}
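# A minimal end-to-end sketch of how these callbacks chain together
# (hypothetical usage; assumes Django settings are configured and the helper
# functions mentioned above are available): crawl the first two listing pages,
# fill in the details of the discovered groups, then persist everything.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    listing = callback_allhit([1, 2])
    details = callback_details(listing['data'])
    result = callback_database(details['data'])
    for error in listing['errors'] + details['errors'] + result['errors']:
        logging.error(error)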