def tf_idfer(self):
    """Fill ``self.idf`` and ``self.tf_idf_vectors`` from the built vocabulary.

    First pass: for every ``(term, docs)`` entry of ``self.vocab`` store
    ``log(len(self.apps) / len(docs))`` in ``self.idf[term]``.

    Second pass: for every ``(name, document)`` entry of ``self.documents``
    store ``[vector, norm]`` in ``self.tf_idf_vectors[name]``, where
    ``vector`` is whatever ``self.vectorize(document)`` returns and ``norm``
    is its Euclidean length.

    Progress is reported through ``progressbar`` in both passes.
    """
    vocab_size = len(self.vocab)
    corpus_size = len(self.apps)
    for position, (term, docs) in enumerate(self.vocab.items(), start=1):
        progressbar(position / vocab_size, 'idfing')
        self.idf[term] = math.log(corpus_size / len(docs))

    document_count = len(self.documents)
    for position, (name, document) in enumerate(self.documents.items(), start=1):
        progressbar(position / document_count, 'tf-idfing')
        weights = self.vectorize(document)
        # Keep the norm next to the vector so it is computed only once.
        norm = math.sqrt(sum(weight ** 2 for weight in weights))
        self.tf_idf_vectors[name] = [weights, norm]
def find_all_apps(url, num_apps):
    """Collect up to ``num_apps`` app links by crawling outward from ``url``.

    The links returned by ``find_app_from_url(url)`` seed the result set.
    Each collected link is then expanded through ``find_app_from_url`` until
    the set holds at least ``num_apps`` entries or no unexpanded links are left.

    Args:
        url: Starting page handed to ``find_app_from_url``.
        num_apps: Number of app links wanted.

    Returns:
        A list of at most ``num_apps`` links. It is shorter when the crawl
        runs out of new links before reaching the target. Order is arbitrary
        because the links are accumulated in a set.
    """
    apps = set(find_app_from_url(url))
    expanded = set()
    while len(apps) < num_apps:
        pending = apps - expanded
        if not pending:
            # Every known app has been expanded without reaching the target;
            # looping again could only repeat the same fetches forever.
            break
        progressbar(len(apps) / num_apps)
        for app in pending:
            # Expand each app once instead of re-fetching it on every pass.
            expanded.add(app)
            apps.update(find_app_from_url(app))
            progressbar(len(apps) / num_apps)
            if len(apps) >= num_apps:
                break
    return list(apps)[:num_apps]
def build_vocab(self):
    """Fetch every app page and build the document store and vocabulary.

    For each entry of ``self.apps`` the page is fetched with ``get_page``,
    split into ``(doc, name)`` by ``filter_out_description`` and turned into
    terms by ``self.process_document``. The terms are stored in
    ``self.documents[name]``, and ``name`` is appended to ``self.vocab[term]``
    once per distinct term of the document.

    Finally ``self.idx_map`` is rebuilt so each vocabulary term maps to its
    position in ``self.vocab``'s iteration order.
    """
    total = len(self.apps)
    for position, app in enumerate(self.apps, start=1):
        doc, name = filter_out_description(get_page(app))
        terms = self.process_document(doc)
        progressbar(position / total, name)
        self.documents[name] = terms
        # De-duplicate so a document is listed at most once per term.
        for term in set(terms):
            # Append in place; rebuilding the list with ``+`` copied the
            # whole posting list on every insertion.
            self.vocab.setdefault(term, []).append(name)

    # Map each term in vocabulary to an index
    self.idx_map = {term: idx for idx, term in enumerate(self.vocab)}
def query(self, query, k):
    """Rank the indexed documents by cosine similarity to ``query``.

    Args:
        query: Raw query text; it is run through ``self.process_document``
            and ``self.q_vectorize`` to obtain the query vector.
        k: Maximum number of results to return.

    Returns:
        ``[[-1, 'Not found']]`` when the query vector sums to a falsy value.
        Otherwise a list of at most ``k`` ``(score, name)`` tuples taken from
        the top of the descending ranking, with falsy scores removed.
    """
    terms = self.process_document(query)
    q_vector = self.q_vectorize(terms)
    if not sum(q_vector):
        return [[-1, 'Not found']]

    q_norm = math.sqrt(sum(v ** 2 for v in q_vector))
    total = len(self.tf_idf_vectors)
    results = []
    for position, (name, (vector, norm)) in enumerate(self.tf_idf_vectors.items(), start=1):
        progressbar(position / total)
        denominator = q_norm * norm
        if denominator:
            dot = sum(q * d for q, d in zip(q_vector, vector))
            sim = dot / denominator
        else:
            # A document with a zero-length vector cannot match anything;
            # score it 0 instead of dividing by zero.
            sim = 0.0
        results.append([sim, name])

    # Sorting the [score, name] pairs keeps the original tie-break on name.
    ranked = sorted(results, reverse=True)[:k]
    return [(score, app) for score, app in ranked if score]
def scrape_courses(urls):
    """Gather course, schedule and program-course rows for every program URL.

    Args:
        urls: Sequence of ``(url, programID)`` pairs. Pairs whose ``url`` is
            falsy are skipped, but still advance the progress bar.

    Returns:
        A ``(courses, schedules, program_courses)`` tuple. Each element is
        the concatenation of the matching ``get_courses`` results, with every
        row prefixed by its 0-based position in that list.
    """
    print("Reading all 7-9 courses")
    course_rows, schedule_rows, program_course_rows = [], [], []
    for position, (url, programID) in enumerate(urls, start=1):
        progressbar(position / len(urls))
        if not url:
            continue
        courses, schedule, program_courses = get_courses(url, programID)
        course_rows.extend(courses)
        schedule_rows.extend(schedule)
        program_course_rows.extend(program_courses)
    print()

    def numbered(rows):
        # Prefix each row with a running index that acts as its ID.
        return [[index] + row for index, row in enumerate(rows)]

    schedule_rows = numbered(schedule_rows)
    course_rows = numbered(course_rows)
    program_course_rows = numbered(program_course_rows)
    return course_rows, schedule_rows, program_course_rows
def scrape_programs():
    """Scrape programs, their fields, master profiles and profile courses.

    Returns:
        A ``(programs, fields, profiles, course_profile, urls_all_courses)``
        tuple:

        * ``programs``: ``(program ID, name, href)`` per program.
        * ``fields``: ``(field ID, field name, program ID)``; field IDs run
          from 1 across all programs.
        * ``profiles``: ``(profile ID, master[0], field ID, master[1])``;
          profile IDs run from 1. ``master[1]`` is later handed to
          ``get_profile_courses``.
        * ``course_profile``: ``(row ID, course[0], profile ID, course[1])``;
          row IDs run from 1.
        * ``urls_all_courses``: ``[second value of get_fields, program ID]``
          per program (presumably the URL of the program's full course
          list; confirm against ``get_fields``).
    """
    utbildningar = get_utbildningar(url_utbildningar)  # The educations

    programs = []
    urls_all_courses = []
    for title, info in utbildningar.items():
        programs.append((info['ID'], title, info['href']))
        field_map, all_courses = get_fields(info['href'])
        utbildningar[title]['fields'] = field_map
        urls_all_courses.append([all_courses, info['ID']])

    fields = []
    profiles = []
    next_field_id = 1
    next_profile_id = 1
    for info in utbildningar.values():
        program_id = info['ID']
        for field_name, masters in info['fields'].items():
            fields.append((next_field_id, field_name, program_id))
            for master in masters:
                profiles.append(
                    (next_profile_id, master[0], next_field_id, master[1]))
                next_profile_id += 1
            next_field_id += 1

    print("Reading all profiles")
    course_profile = []
    profile_count = len(profiles)
    for position, (profile_id, _name, _field_id, link) in enumerate(profiles, start=1):
        progressbar(position / profile_count)
        for course in get_profile_courses(link):
            # Row IDs are consecutive, so the next one is the current size + 1.
            row_id = len(course_profile) + 1
            course_profile.append((row_id, course[0], profile_id, course[1]))
    print()

    return programs, fields, profiles, course_profile, urls_all_courses