def sk_kmeans(core, k_min=2, k_max=10):
    """Cluster all documents of a Solr core with k-means and score each k.

    Fetches every document from *core*, builds a feature DataFrame
    (missing features filled with 0), fits KMeans for each cluster count
    in ``range(k_min, k_max)``, and records the silhouette score of every
    clustering.

    :param core: name of the Solr core to read documents from
    :param k_min: smallest number of clusters to try (default 2, as before)
    :param k_max: exclusive upper bound on cluster counts (default 10)
    :return: ``str`` of a dict mapping k -> silhouette score
    """
    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)
    docs = solrInstance.query_iterator(query="*:*", start=0)
    # Wrap each doc in a Vector keyed by its id; stream the feature dicts
    # straight into the DataFrame instead of building a second list.
    list_of_points = [Vector(doc['id'], doc) for doc in docs]
    df = pd.DataFrame(point.features for point in list_of_points)
    df = df.fillna(0)  # absent features count as zero, not NaN
    silhouettes = {}
    for k in range(k_min, k_max):
        kmeans = KMeans(
            n_clusters=k,
            init='k-means++',
            max_iter=300,  # k-means convergence
            n_init=10,     # find global minima
            n_jobs=-2,     # parallelize
        )
        labels = kmeans.fit_predict(df)
        silhouettes[k] = silhouette_score(df, labels)
    return str(silhouettes)
def sk_kmeans(core):  # , kval=3
    """Run k-means over every document in a Solr core for k = 2..9.

    Queries all documents from *core*, assembles their feature vectors
    into a DataFrame (missing features become 0), fits a KMeans model
    for each cluster count, and collects the silhouette score of every
    clustering.

    :param core: name of the Solr core to query
    :return: ``str`` of a dict mapping k -> silhouette score
    """
    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)
    list_of_points = []
    for doc in solrInstance.query_iterator(query="*:*", start=0):
        list_of_points.append(Vector(doc['id'], doc))
    feature_dicts = (point.features for point in list_of_points)
    df = pd.DataFrame(feature_dicts).fillna(0)
    silhouettes = {}
    for k in range(2, 10):
        model = KMeans(n_clusters=k,
                       init='k-means++',
                       max_iter=300,  # k-means convergence
                       n_init=10,     # find global minima
                       n_jobs=-2)     # parallelize
        cluster_labels = model.fit_predict(df)
        silhouettes[k] = silhouette_score(df, cluster_labels)
    return str(silhouettes)
# NOTE(review): this line is whitespace-mangled (an entire file region
# collapsed onto one physical line) and it begins MID-FUNCTION: the leading
# `o.write("\n") count += 1 return count` is the tail of a store-and-count
# routine whose `def` lies outside this view — presumably store_stream,
# since the __main__ block below calls store_stream(docs, filename); TODO
# confirm against the full file before reformatting. The rest of the line
# holds read_stream() (yields one json.loads(...) per line of a JSON-lines
# file) and a __main__ script that dumps matching Solr doc ids to disk,
# replays them through remove_last_modified(), and posts the updates back.
# Left byte-identical because the truncated leading fragment cannot be
# safely reconstructed from here.
o.write("\n") count += 1 return count def read_stream(filename): ''' Reads json line stream :param filename: path to json line :return: doc stream ''' with open(filename) as inf: for l in inf: yield json.loads(l) if __name__ == '__main__': url = "http://imagecat.dyndns.org:8983/solr/imagecatdev" filename = "docs.docs.jsonl" solr = Solr(url) docs = solr.query_iterator("lastModified:[1960-01-01T00:00:00Z TO 2005-12-31T00:00:00Z]", rows=1000, fl='id') count = store_stream(docs, filename) print("Wrote %d docs to %s" % (count, filename)) docs = read_stream(filename) updates = remove_last_modified(docs) count, success = solr.post_iterator(updates, False) print(success) print(count)
def read_stream(filename):
    """Yield one decoded JSON document per line of *filename*.

    :param filename: path to a JSON-lines file
    :return: generator of parsed documents
    """
    with open(filename) as inf:
        for line in inf:
            yield json.loads(line)


if __name__ == '__main__':
    url = "http://imagecat.dyndns.org:8983/solr/imagecatdev"
    filename = "docs.docs.jsonl"
    solr = Solr(url)
    # Dump the matching doc ids to disk first, then replay them from the
    # file as update commands against the same core.
    docs = solr.query_iterator(
        "lastModified:[1960-01-01T00:00:00Z TO 2005-12-31T00:00:00Z]",
        rows=1000, fl='id')
    count = store_stream(docs, filename)
    print("Wrote %d docs to %s" % (count, filename))
    docs = read_stream(filename)
    updates = remove_last_modified(docs)
    count, success = solr.post_iterator(updates, False)
    print(success)
    print(count)
# NOTE(review): whitespace-mangled line that begins MID-FUNCTION: the
# leading `u['phonenumbers'] = ... else: ... continue yield u` is the tail
# of an update-generator whose `def` (and enclosing loop/if) lie outside
# this view — presumably fix_phonenumbers, which the __main__ block below
# calls; TODO confirm against the full file. The fragment copies
# ner_phone_number_ts_md into an atomic-update 'set' on phonenumbers and
# nulls the old field. The rest of the line holds read_stream() (yields
# json.loads(...) per line) and a __main__ script that streams matching
# docs from Solr, generates fixes, posts them in batches of 1000, and
# commits. Left byte-identical because the truncated leading fragment
# cannot be safely reconstructed from here.
u['phonenumbers'] = {'set': d['ner_phone_number_ts_md']} u['ner_phone_number_ts_md'] = {'set': None} else: print("Error: Skipped") continue yield u def read_stream(filename): ''' Reads json line stream :param filename: path to json line :return: doc stream ''' with open(filename) as inf: for l in inf: yield json.loads(l) if __name__ == '__main__': url = "http://127.0.0.1:8983/solr/imagecatdev" solr = Solr(url) docs = solr.query_iterator("ner_phone_number_t_md:* OR ner_phone_number_ts_md:*", rows=1000, fl='id,ner_phone_number_t_md,ner_phone_number_ts_md', sort="indexedAt asc") updates = fix_phonenumbers(docs) count, success = solr.post_iterator(updates, False, buffer_size=1000) solr.commit() print(success) print(count)
# NOTE(review): whitespace-mangled line that begins MID-FUNCTION: the
# leading `print("Error: Skipped") continue yield u` is the tail of an
# update-generator whose `def` and enclosing loop lie outside this view —
# presumably fix_phonenumbers, which the __main__ block below calls; TODO
# confirm against the full file. The remainder holds read_stream() (yields
# json.loads(...) per line of a JSON-lines file) and a __main__ script
# that streams docs matching the phone-number query from Solr, runs them
# through fix_phonenumbers(), posts the updates back in batches of 1000,
# and commits. Left byte-identical because the truncated leading fragment
# cannot be safely reconstructed from here.
print("Error: Skipped") continue yield u def read_stream(filename): ''' Reads json line stream :param filename: path to json line :return: doc stream ''' with open(filename) as inf: for l in inf: yield json.loads(l) if __name__ == '__main__': url = "http://127.0.0.1:8983/solr/imagecatdev" solr = Solr(url) docs = solr.query_iterator( "ner_phone_number_t_md:* OR ner_phone_number_ts_md:*", rows=1000, fl='id,ner_phone_number_t_md,ner_phone_number_ts_md', sort="indexedAt asc") updates = fix_phonenumbers(docs) count, success = solr.post_iterator(updates, False, buffer_size=1000) solr.commit() print(success) print(count)