Example #1
 def get_max_page(self, html=None):
     if not html:
         response = urllib2.urlopen(get_allhit_url())
         html = response.read()
     pattern = re.compile(r"<a href=\".*pageNumber=([0-9]+).{150,200}Last</a>")
     max_page = re.search(pattern, html)
     if max_page: return int(max_page.group(1))
     else:        return 1
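get_max_page() pulls the highest page number out of the "Last" pagination link using the regular expression above. Below is a minimal, self-contained sketch of the same matching logic, run against a hand-made HTML fragment; the fragment is invented for illustration and is not real mturk.com markup.

    import re

    pattern = re.compile(r"<a href=\".*pageNumber=([0-9]+).{150,200}Last</a>")

    # Hypothetical fragment: a pagination link followed by ~160 filler
    # characters and then the "Last</a>" anchor text the pattern expects.
    sample_html = '<a href="/searchbar?pageNumber=42&' + 'x' * 160 + 'Last</a>'

    match = re.search(pattern, sample_html)
    print(match.group(1) if match else 1)   # prints: 42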
Example #3
    def run(self):

        pid = Pid('mturk_crawler', True)

        logging.info('Crawler started')

        start_time = datetime.datetime.now()

        #Fetching statistical information about groups and HITs count
        logging.debug("Fetching stats")
        main_response = urllib2.urlopen(get_allhit_url())
        main_html = main_response.read()
        main_soup = BeautifulSoup(
            main_html,
            parseOnlyThese=SoupStrainer(
                text=re.compile("(^[0-9,]+ HITs|of [0-9]+ Results)")))
        main_stats = [tag for tag in main_soup]
        hits_available = -1
        groups_available = -1
        if len(main_stats) > 1:
            hits_available_tmp = main_stats[0]
            hits_available_tmp = hits_available_tmp[:hits_available_tmp.find(' ')].replace(',', '')
            hits_available = int(hits_available_tmp)
            groups_available_tmp = main_stats[1]
            groups_available_tmp = groups_available_tmp[groups_available_tmp.find('of') + 3:groups_available_tmp.find('Results') - 1]
            groups_available = int(groups_available_tmp)
        main_soup = None

        #Fetching data from every mturk.com HITs list page
        logging.debug("Allhit processing")
        result_allhit = self.process_values(
            range(1, self.get_max_page(main_html) + 1),
            callback_allhit, self.processes_count)
        self.data = result_allhit['data']
        self.append_errors(result_allhit['errors'])

        #Fetching html details for every HIT group
        logging.debug("Details processing")
        result_details = self.process_values(self.data, callback_details,
                                             self.processes_count)
        self.data = result_details['data']
        self.append_errors(result_details['errors'])

        hits_downloaded = sum(
            [hgs['HitGroupStatus']['hits_available'] for hgs in self.data])
        groups_downloaded = len(self.data)

        #Logging crawl information into the database
        success = False
        if (groups_downloaded > 0 and hits_downloaded > 0
                and float(groups_available) / groups_downloaded <= 1.5
                and float(hits_available) / hits_downloaded <= 1.5):
            success = True

        logging.debug(
            "Crawl finished with success=%s. Saving main_crawl entry" %
            success)
        crawl = Crawl(
            **{
                'start_time': start_time,
                'end_time': datetime.datetime.now(),
                'success': success,
                'hits_available': hits_available,
                'hits_downloaded': hits_downloaded,
                'groups_available': groups_available,
                'groups_downloaded': groups_downloaded,
                #'errors':               str(self.errors) # !
                'errors': ''
            })
        crawl.save()

        #Adding crawl FK
        logging.debug("Adding FKs")
        result_add_crawlfk = self.process_values(self.data,
                                                 callback_add_crawlfk,
                                                 crawl=crawl)
        self.data = result_add_crawlfk['data']
        self.append_errors(result_add_crawlfk['errors'])

        #Saving results in the database
        logging.debug("Saving results")
        result_save_database = self.process_values(self.data,
                                                   callback_database)
        self.append_errors(result_save_database['errors'])

        print self.errors

        logging.info(
            "Crawler finished %ssuccessfully in %s with %d results, %d HITs (of %d and %d) and %d errors"
            % ("" if success else "un", (datetime.datetime.now() - start_time),
               groups_downloaded, hits_downloaded, groups_available,
               hits_available, len(self.errors)))

        pid.remove_pid()
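run() decides whether a crawl was successful by comparing the counts mturk.com reported as available against what was actually downloaded. The snippet below is a standalone sketch of that heuristic (the helper name is mine, not part of the project); it also shows why the comparison has to use floating-point division under Python 2.

    def crawl_succeeded(hits_available, hits_downloaded,
                        groups_available, groups_downloaded):
        # Hypothetical helper restating the success check from run(): the
        # crawl counts as successful when something was downloaded and no
        # more than 1.5x as many HITs/groups were reported as available
        # than were actually fetched.
        if groups_downloaded <= 0 or hits_downloaded <= 0:
            return False
        return (float(groups_available) / groups_downloaded <= 1.5 and
                float(hits_available) / hits_downloaded <= 1.5)

    # With Python 2 integer division, 170 / 100 == 1, which would wrongly
    # pass the 1.5 threshold; float division gives 1.7 and fails it.
    print(crawl_succeeded(200, 190, 170, 100))   # prints: False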
Example #4
def callback_allhit(pages, **kwargs):

    if type(pages) != type([]):
        raise Exception, '::callback_allhit() must be called with one list argument'

    def remove_newline_fields(list):
        while True:
            try:    list.remove("\n")
            except: break
        return list

#    def is_soup(object):
#        soup = BeautifulSoup()
#        if type(object) == type(soup) or type(object) == type(ResultSet('')) or type(object) == type(Tag(soup, "div", [])):
#            return True
#        return False

    data = []
    errors = []

    # Processing every page
    for page_number in pages:
        try:
            # Downloading page
            logging.info("Downloading page: %s" % page_number)
            page_url = get_allhit_url(page_number)
            logging.debug("Downloading %s"  % page_url)
            response = urllib2.urlopen(page_url)
            html = response.read()
            soup = BeautifulSoup(html)

            # Parsing HIT groups' list
            table = soup.find('table', cellpadding='0', cellspacing='5', border='0', width='100%')
            if type(table) == type(None):
                
                i = 0
                while i < 3:
                    logging.warn("Soup returned an empty table for page %s. Trying once more" % page_number)
                    response = urllib2.urlopen(page_url)
                    html = response.read()
                    soup = BeautifulSoup(html)
                    table = soup.find('table', cellpadding='0', cellspacing='5', border='0', width='100%')
                    if type(table) != type(None):
                        break
                    else:
                        table = None
                        soup = None
                        html = None
                        i = i + 1
                        
                if type(table) == type(None):
                    logging.warn("Soup returned an empty table. This should not happen. Skipping page")
                    continue
                
            table.contents = remove_newline_fields(table.contents)
    
            # Parsing and fetching information about each group
            for i_group in range(0,len(table.contents)):
                logging.debug("Processing group %s on page %s" % (i_group,page_number))
                try:
                    group_html = table.contents[i_group]
    
                    # Title
                    title = group_html.find('a', {'class':'capsulelink'})
                    if type(title) != type(None):
                        try:
                            title = str(title.contents[0])
                        except:
                            title = unicode(title.contents[0])
                        try:
                            title = unicode(remove_whitespaces(title))
                        except:
                            title = ''
    
                    fields = group_html.findAll('td', {'align':'left','valign':'top','class':'capsule_field_text'})
    
                    if len(fields) == 7:
    
                        # Requester's name and ID
                        requester_html = remove_newline_fields(fields[0].contents)[0]
                        requester_name = unicode(requester_html.contents[0])
                        requester_id = requester_html['href']
                        start = requester_id.index('requesterId=')+12
                        stop = requester_id.index('&state')
                        requester_id = requester_id[start:stop]
    
                        # HIT group expiration date
                        hit_expiration_date = remove_newline_fields(fields[1].contents)[0]
                        hit_expiration_date = remove_whitespaces(strip_html(hit_expiration_date))
                        hit_expiration_date = hit_expiration_date[:hit_expiration_date.index('(')-2]
                        hit_expiration_date = datetime.datetime.strptime(hit_expiration_date, '%b %d, %Y')
    
                        # Time alloted
                        time_alloted = remove_newline_fields(fields[2].contents)[0]
                        time_alloted = remove_whitespaces(strip_html(time_alloted))
                        time_alloted = int(time_alloted[:time_alloted.index(' ')])
    
                        # Reward
                        reward = float(remove_newline_fields(fields[3].contents)[0][1:])

                        # HITs available
                        hits_available = int(remove_newline_fields(fields[4].contents)[0])
    
                        # Description
                        description = unicode(remove_newline_fields(fields[5].contents)[0])
    
                        # Keywords
                        keywords_raw = remove_newline_fields(fields[6].contents)
                        keywords = []
                        for i in range(0, len(keywords_raw)):
                            try:
                                keyword = keywords_raw[i].contents[0]
                                keywords.append(keyword)
                            except:
                                continue
                        keywords = unicode(fuse(keywords, ','))

                        # Qualification
                        qualifications = ''
                        qfields = group_html.findAll('td', {'style':'padding-right: 2em; white-space: nowrap;'})
                        
                        if len(qfields) > 0:
                            qfields = [remove_whitespaces(unicode(remove_newline_fields(qfield.contents)[0])) for qfield in qfields]
                            qualifications = fuse(qfields, ', ')
                        qfields = None
                            
                        # Occurrence date
                        occurrence_date = datetime.datetime.now()
                            
                        # Group ID
                        group_id = group_html.find('span', {'class':'capsulelink'})
                        group_id_hashed = False
                        if type(group_id) != type(None):
                            group_id = remove_newline_fields(group_id.contents)[0]
                            if 'href' in group_id._getAttrMap():
                                start = group_id['href'].index('groupId=')+8
                                stop = group_id['href'].index('&')
                                group_id = group_id['href'][start:stop]
                            else:
                                group_id_hashed = True
                                composition = "%s;%s;%s;%s;%s;%s;%s;" % (title,requester_id,
                                                                         time_alloted,reward,
                                                                         description,keywords,
                                                                         qualifications)
                                composition = smart_str(composition)
                                group_id = hashlib.md5(composition).hexdigest()

                        # Checking whether processed content is already stored in the database
                        hit_group_content = None
                        try:
                            logging.debug("group_id=%s; requester=%s; title=%s; desc=%s; ta=%s; reward=%s" % (group_id,requester_id,title,description,time_alloted,reward))
                            hit_group_content = HitGroupContent.objects.get(group_id=group_id, 
                                                                            requester_id=requester_id, 
                                                                            title=title,
                                                                            description=description,
                                                                            time_alloted=time_alloted,
                                                                            reward=reward,
                                                                            )
                        except HitGroupContent.DoesNotExist:
                            hit_group_content = HitGroupContent(**{
                                    'title': title,
                                    'requester_id': requester_id,
                                    'requester_name': requester_name,
                                    'time_alloted': time_alloted,
                                    'reward': reward,
                                    'html': '',
                                    'description': description,
                                    'keywords': keywords,
                                    'qualifications': qualifications,
                                    'occurrence_date': occurrence_date,
                                    'group_id': group_id,
                                    'group_id_hashed': group_id_hashed
                                })
    
                        data.append({
                            'HitGroupStatus': {
                                'group_id': group_id,
                                'hits_available': hits_available,
                                'page_number': page_number,
                                'inpage_position': i_group+1,
                                'hit_expiration_date': hit_expiration_date,
                                'hit_group_content': hit_group_content
                            }
                        })

                    fields = None
                    group_html = None

                except:
                    logging.error("Failed to process group %s on %s page (%s)" % (i_group,page_number,sys.exc_info()[0].__name__))
                    errors.append(grab_error(sys.exc_info()))
                    print grab_error(sys.exc_info())
        
            table = None
            soup = None
            html = None
                    
        except:
            logging.error("Failed to process page %d (%s)" % (page_number,sys.exc_info()[0].__name__))
            errors.append(grab_error(sys.exc_info()))
            print grab_error(sys.exc_info())

    return {'data':data,'errors':errors}
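Each element callback_allhit() appends to data is a one-key dictionary keyed by 'HitGroupStatus', and run() later sums the hits_available fields of those records. A small sketch of that shape and of the aggregation, using made-up values:

    import datetime

    # Invented records mimicking the structure built by callback_allhit().
    data = [
        {'HitGroupStatus': {'group_id': 'ABC123', 'hits_available': 40,
                            'page_number': 1, 'inpage_position': 1,
                            'hit_expiration_date': datetime.datetime(2011, 5, 1),
                            'hit_group_content': None}},
        {'HitGroupStatus': {'group_id': 'DEF456', 'hits_available': 25,
                            'page_number': 1, 'inpage_position': 2,
                            'hit_expiration_date': datetime.datetime(2011, 6, 1),
                            'hit_group_content': None}},
    ]

    # The same aggregation run() performs over callback_allhit()'s output.
    hits_downloaded = sum(hgs['HitGroupStatus']['hits_available'] for hgs in data)
    groups_downloaded = len(data)
    print(hits_downloaded, groups_downloaded)   # prints: (65, 2) under Python 2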