def link(cls):
    """
    Link documents -> institutions.

    Builds an index of domain -> [(regex, institution), ...] from every
    institution's seed URL, then links each document to the institution
    whose regex produces the longest match against the document's
    syllabus URL.
    """
    # Map domain -> [(regex, inst), ...]
    domain_to_inst = defaultdict(list)

    for inst in ServerSide(Institution.select()):

        domain = parse_domain(inst.url)
        regex = seed_to_regex(inst.url)

        domain_to_inst[domain].append((regex, inst))

    for doc in query_bar(Document.select()):

        try:

            # TODO: Get rid of @property.
            url = doc.syllabus.url
            domain = parse_domain(url)

            # Find institutions with matching URLs.
            matches = []
            for pattern, inst in domain_to_inst[domain]:

                match = pattern.search(url)

                if match:
                    matches.append((match.group(), inst))

            if matches:

                # Link to the institution with the longest match.
                # max() is O(n) vs. sorting's O(n log n), and — like a
                # stable descending sort — it keeps the first of any
                # equal-length ties.
                best_match, best_inst = max(
                    matches,
                    key=lambda m: len(m[0]),
                )

                cls.create(
                    institution=best_inst,
                    document=doc,
                )

        except Exception as e:
            # Best-effort linking: skip documents that fail (e.g. no
            # syllabus URL), but surface the error for the operator.
            print(e)
def ingest_world(
    cls,
    package='osp.institutions',
    path='data/world.csv',
):
    """
    Insert world universities.

    Reads the packaged world-universities CSV and inserts every non-US
    row, skipping rows that collide with an existing record.
    """
    for row in read_csv(package, path):

        # US rows are handled separately by the USA ingest.
        if row['country'] == 'US':
            continue

        # Normalize the URL.
        url = row['url'].strip()
        domain = parse_domain(url)

        # Clean the fields.
        name = row['name'].strip()
        country = row['country'].strip()

        try:
            cls.create(
                name=name,
                url=url,
                domain=domain,
                state=None,
                country=country,
            )
        except IntegrityError:
            # Duplicate record — leave the existing row in place.
            pass
def ingest_usa(
    cls,
    package='osp.institutions',
    path='data/usa.csv',
):
    """
    Insert US universities.

    Reads the packaged US-universities CSV and inserts every USA row,
    skipping rows that collide with an existing record.
    """
    for row in read_csv(package, path):

        # Only ingest rows flagged as USA.
        if row['e_country'] != 'USA':
            continue

        # Normalize the URL.
        url = row['web_url'].strip()
        domain = parse_domain(url)

        # Clean the fields.
        name = row['biz_name'].strip()
        state = row['e_state'].strip()

        try:
            cls.create(
                name=name,
                url=url,
                domain=domain,
                state=state,
                country='US',
            )
        except IntegrityError:
            # Duplicate record — leave the existing row in place.
            pass
def domain(self):
    """
    Get the parsed domain of the syllabus' URL.

    Returns:
        str: The top-level domain.
    """
    url = self.url
    return parse_domain(url)
def test_parse_domain(url, domain):
    """The parsed domain of `url` should equal the expected `domain`."""
    parsed = parse_domain(url)
    assert parsed == domain