def __init__(self):
    """Set up the scraper table, keyword lists, punctuation regex and helpers.

    Attributes initialised here:
        _scrapers: maps a field name to the list of scraper classes
            registered for that field.
        _multiple: list of field names (presumably the fields that may
            hold more than one value -- confirm against the code that
            reads it).
        _required_words: list of trafficking-related words and phrases
            (presumably used to decide whether a page is relevant --
            confirm against the code that reads it).
        _punctuation: compiled regex matching any single character from
            ``string.punctuation``.
        org_dao: the ``OrganizationDAO`` class itself (not an instance).
        url_frontier: a new ``URLFrontier`` instance.
    """
    # Field name -> scraper classes. Only 'phone_numbers' registers more
    # than one scraper class.
    self._scrapers = {
        'name': [OrgNameScraper],
        'address': [OrgAddressScraper],
        'types': [OrgTypeScraper],
        'phone_numbers': [USPhoneNumberScraper, IndianPhoneNumberScraper],
        'emails': [EmailScraper],
        'contacts': [ContactScraper],
        'organization_url': [OrgUrlScraper],
        'partners': [OrgPartnersScraper],
        'facebook': [OrgFacebookScraper],
        'twitter': [OrgTwitterScraper],
        'keywords': [KeywordScraper],
        'page_rank_info': [PageRankScraper]
    }

    self._multiple = [
        'types',
        'phone_numbers',
        'emails',
        'partners',
        'contacts'
    ]

    self._required_words = [
        'prostitution',
        'sex trafficking',
        'child labor',
        'child labour',
        'slavery',
        'human trafficking',
        'brothel',
        'child trafficking',
        'anti trafficking',
        'social justice'
    ]

    # Escape the punctuation characters so they are taken literally inside
    # the character class.
    escaped_punctuation = re.escape(string.punctuation)
    self._punctuation = re.compile('[%s]' % escaped_punctuation)

    # NOTE(review): the DAO class is stored without being instantiated;
    # presumably callers instantiate it on use -- confirm before changing.
    self.org_dao = OrganizationDAO
    self.url_frontier = URLFrontier()
def __init__(self):
    """Create the URL frontier and one instance of each DAO.

    Attributes initialised here:
        frontier: a new ``URLFrontier`` instance.
        contact_dao: a new ``ContactDAO`` instance.
        org_dao: a new ``OrganizationDAO`` instance.
        pub_dao: a new ``PublicationDAO`` instance.
        url_dao: a new ``URLMetadataDAO`` instance.
    """
    self.frontier = URLFrontier()

    # Data-access objects, constructed in the original order in case the
    # constructors have side effects.
    self.contact_dao = ContactDAO()
    self.org_dao = OrganizationDAO()
    self.pub_dao = PublicationDAO()
    self.url_dao = URLMetadataDAO()