def __init__(self):
    """Configure the per-field scrapers, keyword filters and helpers."""
    # One scraper class per field; list-valued fields may have several.
    self._scrapers = {
        'name': [OrgNameScraper],
        'address': [OrgAddressScraper],
        'types': [OrgTypeScraper],
        'phone_numbers': [USPhoneNumberScraper, IndianPhoneNumberScraper],
        'emails': [EmailScraper],
        'contacts': [ContactScraper],
        'organization_url': [OrgUrlScraper],
        'partners': [OrgPartnersScraper],
        'facebook': [OrgFacebookScraper],
        'twitter': [OrgTwitterScraper],
        'keywords': [KeywordScraper],
        'page_rank_info': [PageRankScraper],
    }
    # Fields that aggregate results from every scraper into a list.
    self._multiple = [
        'types',
        'phone_numbers',
        'emails',
        'partners',
        'contacts',
    ]
    # A page must contain at least one of these phrases to be treated
    # as an organization homepage (see check_valid_org).
    self._required_words = [
        'prostitution',
        'sex trafficking',
        'child labor',
        'child labour',
        'slavery',
        'human trafficking',
        'brothel',
        'child trafficking',
        'anti trafficking',
        'social justice',
    ]
    # Compiled once: used to blank out punctuation before keyword matching.
    self._punctuation = re.compile('[' + re.escape(string.punctuation) + ']')
    self.org_dao = OrganizationDAO  # stored as a class, instantiated per use
    self.url_frontier = URLFrontier()
def __init__(self):
    """Wire up scrapers, keyword lists and persistence collaborators."""
    # Scraper classes keyed by the organization field they populate.
    scrapers = {}
    scrapers['name'] = [OrgNameScraper]
    scrapers['address'] = [OrgAddressScraper]
    scrapers['types'] = [OrgTypeScraper]
    scrapers['phone_numbers'] = [USPhoneNumberScraper,
                                 IndianPhoneNumberScraper]
    scrapers['emails'] = [EmailScraper]
    scrapers['contacts'] = [ContactScraper]
    scrapers['organization_url'] = [OrgUrlScraper]
    scrapers['partners'] = [OrgPartnersScraper]
    scrapers['facebook'] = [OrgFacebookScraper]
    scrapers['twitter'] = [OrgTwitterScraper]
    scrapers['keywords'] = [KeywordScraper]
    scrapers['page_rank_info'] = [PageRankScraper]
    self._scrapers = scrapers
    # Fields collected as lists rather than single values.
    self._multiple = ['types', 'phone_numbers', 'emails', 'partners',
                      'contacts']
    # Trafficking-related phrases a homepage must mention to be accepted.
    self._required_words = ['prostitution', 'sex trafficking',
                            'child labor', 'child labour', 'slavery',
                            'human trafficking', 'brothel',
                            'child trafficking', 'anti trafficking',
                            'social justice']
    # Pre-built pattern matching any single punctuation character.
    self._punctuation = re.compile('[{0}]'.format(
        re.escape(string.punctuation)))
    self.org_dao = OrganizationDAO  # class reference; instantiated on demand
    self.url_frontier = URLFrontier()
class OrganizationScraper():
    """A class that scrapes an Organization from a given page."""

    def __init__(self):
        # Maps each organization field to the scraper class(es) that fill it.
        self._scrapers = {
            'name': [OrgNameScraper],
            'address': [OrgAddressScraper],
            'types': [OrgTypeScraper],
            'phone_numbers': [USPhoneNumberScraper, IndianPhoneNumberScraper],
            'emails': [EmailScraper],
            'contacts': [ContactScraper],
            'organization_url': [OrgUrlScraper],
            'partners': [OrgPartnersScraper],
            'facebook': [OrgFacebookScraper],
            'twitter': [OrgTwitterScraper],
            'keywords': [KeywordScraper],
            'page_rank_info': [PageRankScraper],
        }
        # Fields collected as lists (results of every scraper concatenated).
        self._multiple = ['types', 'phone_numbers', 'emails', 'partners',
                          'contacts']
        # A homepage must mention at least one of these phrases to qualify.
        self._required_words = ['prostitution', 'sex trafficking',
                                'child labor', 'child labour', 'slavery',
                                'human trafficking', 'brothel',
                                'child trafficking', 'anti trafficking',
                                'social justice']
        # Compiled once; strips punctuation before keyword matching.
        self._punctuation = re.compile('[%s]' % re.escape(string.punctuation))
        self.org_dao = OrganizationDAO  # class reference, instantiated per use
        self.url_frontier = URLFrontier()

    def parse(self, response):
        """Scrape an organization from *response*.

        Arguments:
            response (Response): Scrapy Response object for the page.

        Returns:
            A populated ScrapedOrganization if the page passes
            check_valid_org, otherwise None.
        """
        organization = None
        if self.check_valid_org(response):
            organization = ScrapedOrganization()
            # Collect each field of the organization model.
            # NOTE: replaced py2-only iterkeys() with plain dict iteration,
            # and dropped the unreachable `elif field == 'contacts'` branch
            # ('contacts' is in self._multiple, so the first branch always
            # handled it).
            for field in self._scrapers:
                if field in self._multiple:
                    # List-valued field (e.g. phone_numbers): concatenate
                    # the output of every registered scraper.
                    organization[field] = []
                    for scraper in self._scrapers[field]:
                        organization[field] += scraper().parse(response)
                else:
                    # Single-valued field (e.g. name): first scraper only.
                    results = self._scrapers[field][0]().parse(response)
                    if results:
                        organization[field] = (results[0]
                                               if isinstance(results, list)
                                               else results)
                    else:
                        organization[field] = None
        return organization

    def check_valid_org(self, response):
        """
        Checks if the current page is a valid page for an organization's
        homepage.

        Arguments:
            response (Response): Scrapy Response object of the page that
                is to be scraped.

        Returns:
            True if it's a valid organization page or already in the
            database. False if it's not the homepage.
        """
        # If already in database, then valid.
        url = OrgUrlScraper().parse(response)
        org_dto = self.org_dao().find(organization_url=url)
        if org_dto:
            return True

        # If not homepage, return False and queue the homepage for scraping.
        home_url_obj = urlparse(response.request.url)
        # FIX: the original used `is not '/'`, an identity comparison with a
        # string literal that only works via CPython interning; use != instead.
        if home_url_obj.path and home_url_obj.path != '/':
            home_url = home_url_obj.scheme + '://' + home_url_obj.netloc + '/'
            home_domain = UrlUtility.get_domain(home_url)
            # datetime(1, 1, 1) acts as a "never visited" sentinel.
            meta = URLMetadata(url=home_url, domain=home_domain,
                               last_visited=datetime(1, 1, 1))
            self.url_frontier.put_url(meta)
            return False

        # This is the homepage: scrape its text for required keywords.
        hxs = HtmlXPathSelector(response)
        site_text = hxs.select('//html//text()').extract()
        site_text = [element.strip() for element in site_text
                     if element.strip() != '']
        for word in self._required_words:
            for sentence in site_text:
                sentence = self._punctuation.sub(' ', sentence)
                if word in sentence.lower():
                    return True
        # No keyword found and organization not already known.
        return False
def __init__(self):
    """Create the URL frontier and the DAO helpers this object relies on."""
    # Frontier of URLs awaiting a visit.
    self.frontier = URLFrontier()
    # Data-access objects for each persisted entity type.
    self.contact_dao = ContactDAO()
    self.org_dao = OrganizationDAO()
    self.pub_dao = PublicationDAO()
    self.url_dao = URLMetadataDAO()
class OrganizationScraper():
    """A class that scrapes an Organization from a given page."""

    def __init__(self):
        # Scraper class(es) responsible for each organization field.
        self._scrapers = {
            'name': [OrgNameScraper],
            'address': [OrgAddressScraper],
            'types': [OrgTypeScraper],
            'phone_numbers': [USPhoneNumberScraper, IndianPhoneNumberScraper],
            'emails': [EmailScraper],
            'contacts': [ContactScraper],
            'organization_url': [OrgUrlScraper],
            'partners': [OrgPartnersScraper],
            'facebook': [OrgFacebookScraper],
            'twitter': [OrgTwitterScraper],
            'keywords': [KeywordScraper],
            'page_rank_info': [PageRankScraper],
        }
        # Fields whose value is a list built from every scraper's output.
        self._multiple = ['types', 'phone_numbers', 'emails', 'partners',
                          'contacts']
        # Keyword filter: a homepage must mention one of these phrases.
        self._required_words = ['prostitution', 'sex trafficking',
                                'child labor', 'child labour', 'slavery',
                                'human trafficking', 'brothel',
                                'child trafficking', 'anti trafficking',
                                'social justice']
        # Compiled once; blanks punctuation before keyword matching.
        self._punctuation = re.compile('[%s]' % re.escape(string.punctuation))
        self.org_dao = OrganizationDAO  # class reference, instantiated per use
        self.url_frontier = URLFrontier()

    def parse(self, response):
        """Scrape an organization from *response*.

        Arguments:
            response (Response): Scrapy Response object for the page.

        Returns:
            A populated ScrapedOrganization if the page passes
            check_valid_org, otherwise None.
        """
        organization = None
        if self.check_valid_org(response):
            organization = ScrapedOrganization()
            # Collect each field of the organization model.
            # NOTE: replaced py2-only iterkeys() with plain dict iteration,
            # and dropped the unreachable `elif field == 'contacts'` branch
            # ('contacts' is in self._multiple, so the first branch always
            # handled it).
            for field in self._scrapers:
                if field in self._multiple:
                    # List-valued field: concatenate every scraper's output.
                    organization[field] = []
                    for scraper in self._scrapers[field]:
                        organization[field] += scraper().parse(response)
                else:
                    # Single-valued field: first registered scraper only.
                    results = self._scrapers[field][0]().parse(response)
                    if results:
                        organization[field] = (results[0]
                                               if isinstance(results, list)
                                               else results)
                    else:
                        organization[field] = None
        return organization

    def check_valid_org(self, response):
        """
        Checks if the current page is a valid page for an organization's
        homepage.

        Arguments:
            response (Response): Scrapy Response object of the page that
                is to be scraped.

        Returns:
            True if it's a valid organization page or already in the
            database. False if it's not the homepage.
        """
        # If already in database, then valid.
        url = OrgUrlScraper().parse(response)
        org_dto = self.org_dao().find(organization_url=url)
        if org_dto:
            return True

        # If not homepage, return False and queue the homepage for scraping.
        home_url_obj = urlparse(response.request.url)
        # FIX: the original used `is not '/'`, an identity comparison with a
        # string literal that only works via CPython interning; use != instead.
        if home_url_obj.path and home_url_obj.path != '/':
            home_url = home_url_obj.scheme + '://' + home_url_obj.netloc + '/'
            home_domain = UrlUtility.get_domain(home_url)
            # datetime(1, 1, 1) acts as a "never visited" sentinel.
            meta = URLMetadata(url=home_url, domain=home_domain,
                               last_visited=datetime(1, 1, 1))
            self.url_frontier.put_url(meta)
            return False

        # This is the homepage: scrape its text for required keywords.
        hxs = HtmlXPathSelector(response)
        site_text = hxs.select('//html//text()').extract()
        site_text = [element.strip() for element in site_text
                     if element.strip() != '']
        for word in self._required_words:
            for sentence in site_text:
                sentence = self._punctuation.sub(' ', sentence)
                if word in sentence.lower():
                    return True
        # No keyword found and organization not already known.
        return False