def kmeans(self, k):
    # Initialise the k centroids with the first k vectors.
    centroids = [self.vectors[i] for i in range(k)]
    iterations = 5  # fixed number of Lloyd iterations
    j = 0
    for iterate in range(iterations):
        update_progress(iterate, iterations)
        # Assignment step: put every vector in its nearest centroid's cluster.
        cluster_members = [[] for _ in centroids]
        for v in self.vectors:
            cluster_members[self.belongs_to_cluster(v, centroids)].append(v)
        # Update step: recompute each centroid, then accumulate the
        # within-cluster sum of squared errors (SSE).
        j = 0  # reset so the returned value reflects the final assignment
        for i in range(k):
            centroids[i] = self.average_of_vectors(cluster_members[i])
            # Optional debug: report the member most similar to the centroid.
            # labels = [self.similarity(u, centroids[i]) for u in cluster_members[i]]
            # print('most representative of cluster', i, ':',
            #       cluster_members[i][labels.index(max(labels))])
            for member in cluster_members[i]:
                j += sum((a - b) ** 2 for a, b in zip(member, centroids[i]))
    return j
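# A minimal, self-contained illustration of the SSE term accumulated in
# kmeans() above: for one member vector, the squared Euclidean distance to
# its centroid. The vectors here are made-up values, not data from the class.
member = [1.0, 2.0, 3.0]
centroid = [0.0, 2.0, 1.0]
squared_error = sum((a - b) ** 2 for a, b in zip(member, centroid))
assert squared_error == 5.0  # (1-0)^2 + (2-2)^2 + (3-1)^2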
import json
import os


def index(elastic, json_dir='./resources/jsonFiles'):
    # Index every JSON file in json_dir into the 'researchgate' index,
    # using the file's position in the listing as its document id.
    json_files = [f for f in os.listdir(json_dir) if f.endswith('.json')]
    for i, js in enumerate(json_files):
        with open(os.path.join(json_dir, js)) as json_file:
            data = json.load(json_file)
        elastic.index(index='researchgate', doc_type='articles', id=i, body=data)
        update_progress(i, len(json_files))
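# Usage sketch for index(), assuming the official elasticsearch-py client
# and a node reachable on localhost; the host address is an assumption,
# while 'researchgate' matches the index name used above.
from elasticsearch import Elasticsearch

es = Elasticsearch(['localhost:9200'])  # assumed local, unsecured node
index(es)  # indexes every .json file under ./resources/jsonFiles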
def crawl(self):
    # Requires module-level `import json` plus the Downloader class.
    # Seed the crawl: download each starting URL, record its link
    # neighbourhood, and persist the parsed app as JSON.
    for start_link in self.starting_urls:
        app = Downloader(start_link).get_app_from_link()
        self.links_visited.add(start_link)
        self.apps.append(app)
        self.depth_links.append(app.in_links)
        self.depth_links.append(app.out_links)
        path = './resources/jsonFiles/item_pipeline_0_' + app.uid + '.json'
        with open(path, 'w') as outfile:
            json.dump(app.__dict__, outfile)
    self.num_docs_crawled = 1
    while self.num_docs_crawled < self.num_docs_to_be_crawled:
        # Expand the in-link frontier at the current depth, visiting at
        # most in_degree unvisited links.
        current_in_links = []
        current_out_links = []
        count = 0
        for link in self.depth_links[self.current_depth]:
            if link not in self.links_visited and count < self.in_degree:
                current_app = Downloader(link).get_app_from_link()
                if current_app == 0:  # download/parse failed; skip the link
                    continue
                current_in_links.extend(current_app.in_links)
                current_out_links.extend(current_app.out_links)
                path = ('./resources/jsonFiles/item_pipeline_'
                        + str(self.num_docs_crawled) + '_' + current_app.uid + '.json')
                with open(path, 'w') as outfile:
                    json.dump(current_app.__dict__, outfile)
                update_progress(self.num_docs_crawled, self.num_docs_to_be_crawled)
                self.num_docs_crawled += 1
                self.apps.append(current_app)
                self.links_visited.add(link)
                count += 1
        self.depth_links.append(current_in_links)
        self.depth_links.append(current_out_links)
        self.current_depth += 1
        # Expand the out-link frontier the same way, capped by out_degree.
        current_in_links = []
        current_out_links = []
        count = 0
        for link in self.depth_links[self.current_depth]:
            if link not in self.links_visited and count < self.out_degree:
                current_app = Downloader(link).get_app_from_link()
                if current_app == 0:  # download/parse failed; skip the link
                    continue
                current_in_links.extend(current_app.in_links)
                current_out_links.extend(current_app.out_links)
                path = ('./resources/jsonFiles/item_pipeline_'
                        + str(self.num_docs_crawled) + '_' + current_app.uid + '.json')
                with open(path, 'w') as outfile:
                    json.dump(current_app.__dict__, outfile)
                update_progress(self.num_docs_crawled, self.num_docs_to_be_crawled)
                self.num_docs_crawled += 1
                self.apps.append(current_app)
                self.links_visited.add(link)
                count += 1
        self.current_depth += 1
        self.depth_links.append(current_in_links)
        self.depth_links.append(current_out_links)
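# Self-contained sketch of the frontier bookkeeping in crawl(): every
# expansion pass appends one list of newly seen in-links and one of newly
# seen out-links, while current_depth walks depth_links one index at a
# time, so passes alternate between in-link and out-link frontiers. The
# link strings below are illustrative only.
depth_links = [['seed_in_1'], ['seed_out_1']]  # appended while seeding
current_depth = 0
frontier = depth_links[current_depth]  # first pass expands the in-links
depth_links.append(['new_in_1'])       # in-links gathered during the pass
depth_links.append(['new_out_1'])      # out-links gathered during the pass
current_depth += 1                     # second pass expands seed out-links
assert depth_links[current_depth] == ['seed_out_1']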