Пример #1
0
    def tf_idfer(self):
        """Fill ``self.idf`` and ``self.tf_idf_vectors`` from the collected corpus.

        For every term in ``self.vocab`` the inverse document frequency is
        stored as ``log(len(self.apps) / len(docs))``, where ``docs`` is the
        value stored for that term in ``self.vocab``.  Afterwards each entry
        of ``self.documents`` is passed through ``self.vectorize`` and stored
        in ``self.tf_idf_vectors`` as ``[vector, euclidean_norm]``.

        Progress is reported through ``progressbar`` for both passes.
        """
        vocab_size = len(self.vocab)
        for position, (term, docs) in enumerate(self.vocab.items(), start=1):
            progressbar(position / vocab_size, 'idfing')
            self.idf[term] = math.log(len(self.apps) / len(docs))

        document_count = len(self.documents)
        for position, (name, document) in enumerate(self.documents.items(), start=1):
            progressbar(position / document_count, 'tf-idfing')
            weights = self.vectorize(document)
            # Keep the norm next to the vector so it is computed only once here.
            norm = math.sqrt(sum(w ** 2 for w in weights))
            self.tf_idf_vectors[name] = [weights, norm]
Пример #2
0
def find_all_apps(url, num_apps):
    """Collect up to ``num_apps`` apps by following links starting at ``url``.

    The crawl is seeded with ``find_app_from_url(url)`` and then repeatedly
    expands the apps found so far.  Every app is expanded at most once, and
    the crawl stops as soon as either ``num_apps`` apps are known or there
    is nothing left to expand, so it always terminates.

    Args:
        url: Starting point handed to ``find_app_from_url``.
        num_apps: Maximum number of apps to return.

    Returns:
        A list with at most ``num_apps`` apps.  The apps are held in a set
        while crawling, so the order of the result is arbitrary.
    """
    apps = set(find_app_from_url(url))
    visited = set()

    while len(apps) < num_apps:
        progressbar(len(apps) / num_apps)

        # Snapshot of the apps not expanded yet; ``apps`` grows while we
        # iterate, so we must not iterate over it directly.
        pending = apps - visited
        if not pending:
            # Nothing new to crawl: fewer than ``num_apps`` apps are
            # reachable, so stop instead of looping forever.
            break

        for app in pending:
            visited.add(app)
            apps.update(find_app_from_url(app))
            progressbar(len(apps) / num_apps)
            if len(apps) >= num_apps:
                break

    return list(apps)[:num_apps]
Пример #3
0
 def build_vocab(self):
     """Build ``self.documents``, ``self.vocab`` and ``self.idx_map``.

     For every app in ``self.apps`` the page is fetched with ``get_page``,
     reduced to ``(doc, name)`` by ``filter_out_description`` and turned
     into terms by ``self.process_document``.  The terms are stored in
     ``self.documents[name]``, and ``name`` is appended once to the list
     kept in ``self.vocab`` for every distinct term of the document.

     Finally ``self.idx_map`` maps each vocabulary term to its position in
     the iteration order of ``self.vocab``.
     """
     for i, app in enumerate(self.apps):
         doc, name = filter_out_description(get_page(app))
         terms = self.process_document(doc)
         progressbar((i+1)/len(self.apps), name)
         self.documents[name] = terms
         # Deduplicate so a document is listed at most once per term.
         for term in set(terms):
             # Append in place; rebuilding the list with ``+`` on every
             # insertion made this quadratic in the posting-list length.
             self.vocab.setdefault(term, []).append(name)

     # Map each term in vocabulary to an index
     self.idx_map = {term: idx for idx, term in enumerate(self.vocab)}
Пример #4
0
    def query(self, query, k):
        """Rank the stored documents by cosine similarity to ``query``.

        The query text is turned into terms with ``self.process_document``
        and into a vector with ``self.q_vectorize``.  Each entry of
        ``self.tf_idf_vectors`` (``name -> [vector, norm]``) is scored with
        ``dot(q_vector, vector) / (|q_vector| * norm)``.

        Args:
            query: Raw query text.
            k: Number of top-ranked candidates to keep before zero scores
                are dropped.

        Returns:
            ``[[-1, 'Not found']]`` when the components of the query vector
            sum to zero; otherwise a list of ``(score, name)`` tuples in
            descending order, taken from the ``k`` best candidates and
            excluding any whose score is zero.
        """
        terms = self.process_document(query)
        q_vector = self.q_vectorize(terms)

        if not sum(q_vector):
            return [[-1, 'Not found']]

        q_distance = math.sqrt(sum(v**2 for v in q_vector))
        results = []

        total = len(self.tf_idf_vectors)
        for position, (name, (vector, distance)) in enumerate(self.tf_idf_vectors.items(), start=1):
            progressbar(position / total)
            if not distance:
                # A zero-norm document vector would divide by zero below.
                # Its similarity is zero and would be filtered out of the
                # result anyway, so skip it.
                continue
            dot = sum(q_vector[j] * weight for j, weight in enumerate(vector))
            results.append([dot / (q_distance * distance), name])

        return [(score, app) for score, app in sorted(results, reverse=True)[:k] if score]
Пример #5
0
def scrape_courses(urls):
    """Gather courses, schedules and program-course rows for all given URLs.

    Args:
        urls: Sequence of ``(url, programID)`` pairs.  Pairs whose ``url``
            is falsy are skipped (progress is still reported for them).

    Returns:
        A tuple ``(all_courses, all_schedules, all_program_courses)``.
        Each element is a list of the rows returned by ``get_courses``,
        with a running zero-based index prepended to every row.
    """
    print("Reading all 7-9 courses")

    course_rows, schedule_rows, program_course_rows = [], [], []
    for position, (url, programID) in enumerate(urls, start=1):
        progressbar(position / len(urls))
        if not url:
            continue
        courses, schedule, program_courses = get_courses(url, programID)
        course_rows.extend(courses)
        schedule_rows.extend(schedule)
        program_course_rows.extend(program_courses)
    print()

    def numbered(rows):
        # Prefix every row with its position in the combined list.
        return [[index] + row for index, row in enumerate(rows)]

    numbered_schedules = numbered(schedule_rows)
    numbered_courses = numbered(course_rows)
    numbered_program_courses = numbered(program_course_rows)

    return numbered_courses, numbered_schedules, numbered_program_courses
Пример #6
0
def scrape_programs():
    """Scrape programs, their fields, profiles and the courses of each profile.

    The programs come from ``get_utbildningar(url_utbildningar)``; for each
    one ``get_fields`` supplies its fields and a value that is passed on
    unchanged in ``urls_all_courses``.  Fields and profiles receive running
    one-based IDs across all programs, as do the course/profile rows.

    Returns:
        A tuple ``(programs, fields, profiles, course_profile,
        urls_all_courses)`` where

        * ``programs`` holds ``(program_id, name, href)`` tuples,
        * ``fields`` holds ``(field_id, field, program_id)`` tuples,
        * ``profiles`` holds ``(profile_id, master[0], field_id, master[1])``
          tuples,
        * ``course_profile`` holds ``(row_id, course[0], profile_id,
          course[1])`` tuples,
        * ``urls_all_courses`` holds ``[all_courses, program_id]`` lists.
    """
    utbildningar = get_utbildningar(url_utbildningar)

    # First pass: register each program and fetch its fields.
    programs = []
    urls_all_courses = []
    for program_name, info in utbildningar.items():
        programs.append((info['ID'], program_name, info['href']))
        program_fields, all_courses = get_fields(info['href'])
        utbildningar[program_name]['fields'] = program_fields
        urls_all_courses.append([all_courses, info['ID']])

    # Second pass: flatten fields and profiles; IDs are one-based and
    # follow the insertion order, so they equal the list length plus one.
    fields = []
    profiles = []
    for info in utbildningar.values():
        program_id = info['ID']
        for field_name, masters in info['fields'].items():
            field_id = len(fields) + 1
            fields.append((field_id, field_name, program_id))
            for master in masters:
                profiles.append((len(profiles) + 1, master[0], field_id, master[1]))

    print("Reading all profiles")
    course_profile = []
    profile_total = len(profiles)
    for position, (profile_id, _name, _field_id, link) in enumerate(profiles, start=1):
        progressbar(position / profile_total)
        for course in get_profile_courses(link):
            row_id = len(course_profile) + 1
            course_profile.append((row_id, course[0], profile_id, course[1]))
    print()

    return programs, fields, profiles, course_profile, urls_all_courses