def run_on_documents(func_page, view_name, start_key, end_key, row_count=0, page_size=500):
    couch_page = CouchPaginator(db, view_name, page_size, start_key=start_key, end_key=end_key, include_docs=True)

    while couch_page:
        func_page(couch_page)
        row_count += page_size
        logger.info("%i. getting new page" % (row_count))

        if couch_page.has_next:
            couch_page = CouchPaginator(db, view_name, page_size, start_key=couch_page.next, end_key=end_key, include_docs=True)
        else:
            couch_page = None

    print "number items = ", row_count

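# A minimal sketch of how run_on_documents can be driven; print_ids_page is a
# hypothetical example callback, and the view name and key range below are
# assumptions modeled on the views queried elsewhere in this module.
def print_ids_page(couch_page):
    # func_page callbacks receive one CouchPaginator page per call
    for row in couch_page:
        print row.id

# example invocation (assumed keys):
# run_on_documents(print_ids_page, "queues/by_type_and_id",
#     ["item", "000000000"], ["item", "zzzzzzzzzz"])
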
def fix_github_year():
    from totalimpact import item, tiredis
    from couch_paginator import CouchPaginator

    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"), db=0)

    view_name = "queues/by_alias"
    row_count = 0
    page_size = 500
    start_key = ["url", "https://github.0000000"]
    end_key = ["url", "https://github.zzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            doc = row.doc
            print row.id
            try:
                # the first four characters of create_date are the year
                doc["biblio"]["year"] = doc["biblio"]["create_date"][0:4]
                db.save(doc)
            except KeyError:
                pass
            row_count += 1
            print "."

        logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "number items = ", row_count

def dedup_merged_collections():
    from couch_paginator import CouchPaginator

    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500
    start_key = ["collection", "000"]
    end_key = ["collection", "00zz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            dedup_collections(row.id)
            row_count += 1

        logger.info("%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

def delete_test_collections():
    from couch_paginator import CouchPaginator

    view_name = "queues/latest-collections"
    row_count = 0
    page_size = 500
    start_key = [1, "000000000"]
    end_key = [1, "9999999"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    number_deleted = 0
    number_items = 0
    try:
        while page:
            for row in page:
                row_count += 1
                collection = row.doc
                logger.info(u"deleting test collection {cid}:{title}".format(
                    cid=collection["_id"], title=collection["title"]))
                number_deleted += 1
                number_items += len(collection["alias_tiids"])
                #db.delete(collection)

            logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
            if page.has_next:
                page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
            else:
                page = None
    except TypeError:
        pass

    print "number deleted = ", number_deleted
    print "number items = ", number_items

def update_github():
    from totalimpact import item, tiredis
    from couch_paginator import CouchPaginator

    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"), db=0)

    view_name = "queues/by_alias"
    row_count = 0
    page_size = 500
    start_key = ["url", "https://github.0000000"]
    end_key = ["url", "https://github.zzzzzzzz"]

    # only row ids are needed here, so skip fetching the docs themselves
    page = CouchPaginator(db, view_name, page_size, include_docs=False, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            tiid = row.id
            item.start_item_update([tiid], myredis, db, sleep_in_seconds=0.05)
            row_count += 1
            print "."

        logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=False)
        else:
            page = None

    print "number items = ", row_count

def put_snaps_in_items():
    # imported under a different name so the module doesn't shadow the item doc below
    from totalimpact import item as item_module
    from couch_paginator import CouchPaginator

    logger.debug(u"running put_snaps_in_items() now.")
    starttime = time.time()

    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500
    start_key = ["metric_snap", "000000000"]
    end_key = ["metric_snap", "zzzzzzzzzzzzzzzzzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            if "metric_snap" not in row.key[0]:
                # not a metric_snap, so skip it
                continue
            row_count += 1
            snap = row.doc
            item = db.get(snap["tiid"])
            if item:
                saving = True
                while saving:
                    try:
                        updated_item = item_module.add_snap_data(item, snap)
                        # keep whichever last-modified date is newer
                        snap_last_modified = snap["created"]
                        item_last_modified = item["last_modified"]
                        updated_item["last_modified"] = max(snap_last_modified, item_last_modified)
                        logger.info(u"now on snap row %i, saving item %s back to db, deleting snap %s" % (
                            row_count, updated_item["_id"], snap["_id"]))
                        db.save(updated_item)
                        #db.delete(snap)
                        saving = False
                    except couchdb.http.ResourceConflict:
                        logger.warning(u"couch conflict. trying again")
            else:
                logger.warning(u"now on snap row %i, couldn't get item %s for snap %s" % (
                    row_count, snap["tiid"], snap["_id"]))

        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    logger.info(u"updated {rows} rows in {elapsed} seconds".format(
        rows=row_count, elapsed=round(time.time() - starttime)))

def run_on_documents(func_page, view_name, start_key, end_key, skip_till_key, row_count=0, page_size=500):
    couch_page = CouchPaginator(couch_db, view_name, page_size, start_key=start_key, end_key=end_key, include_docs=True)
    start_time = datetime.datetime.now()

    print "starting to loop through first {page_size} from {start_key} to {end_key}".format(
        page_size=page_size, start_key=start_key, end_key=end_key)

    while couch_page:
        func_page(couch_page, skip_till_key)
        row_count += page_size
        logger.info("%i. getting new page" % (row_count))

        elapsed_time = datetime.datetime.now() - start_time
        number_db_threads = max(1, threading.active_count() - 1)
        print "\n****** {timestamp} {start_key} took {elapsed_seconds} seconds to do {row_count} docs, so {minutes_per_10k} minutes per 10k docs per thread, {total}mins total *****".format(
            timestamp=datetime.datetime.now().isoformat(),
            start_key=start_key,
            row_count=row_count,
            elapsed_seconds=elapsed_time.seconds,
            minutes_per_10k=(elapsed_time.seconds) * 10000 / (row_count * 60),
            total=((elapsed_time.seconds) * 10000 / (row_count * 60)) / number_db_threads)

        if couch_page.has_next:
            couch_page = CouchPaginator(couch_db, view_name, page_size, start_key=couch_page.next, end_key=end_key, include_docs=True)
        else:
            couch_page = None

    print "number items = ", row_count
    elapsed_time = datetime.datetime.now() - start_time
    print "took {elapsed_time} to do {row_count}".format(
        row_count=row_count, elapsed_time=elapsed_time)
    db.session.remove()

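# A minimal sketch for the variant above, whose callbacks also receive the
# skip_till_key passthrough; resume_page is a hypothetical example callback and
# the keys below are assumptions in the style of the other functions here.
def resume_page(couch_page, skip_till_key):
    # skip rows at or below the resume point, process the rest
    for row in couch_page:
        if row.id > skip_till_key:
            print row.id

# example invocation (assumed keys):
# run_on_documents(resume_page, "queues/by_type_and_id",
#     ["item", "000000000"], ["item", "zzzzzzzzzz"], skip_till_key="")
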
def lowercase_aliases():
    from couch_paginator import CouchPaginator

    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500
    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    number_edited = 0
    while page:
        for row in page:
            item = row.doc
            row_count += 1
            if not item:
                continue
            if "aliases" in item:
                orig_aliases_dict = item["aliases"]
                lowercase_aliases_dict = {}
                for orig_namespace in orig_aliases_dict:
                    lowercase_namespace = orig_namespace.lower()
                    if lowercase_namespace == "doi":
                        # dois are case-insensitive, so lowercase the values too
                        lowercase_aliases_dict[lowercase_namespace] = [doi.lower() for doi in orig_aliases_dict[orig_namespace]]
                    else:
                        lowercase_aliases_dict[lowercase_namespace] = orig_aliases_dict[orig_namespace]

                if orig_aliases_dict != lowercase_aliases_dict:
                    print "\ndifference detected \n{orig}\n{lower}\n".format(
                        orig=orig_aliases_dict, lower=lowercase_aliases_dict)
                    item["aliases"] = lowercase_aliases_dict
                    number_edited += 1
                    db.save(item)
                else:
                    logger.info(u".")

        print "number edited = ", number_edited
        print "number items = ", row_count
        logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "number edited = ", number_edited
    print "number items = ", row_count

def ensure_all_metric_values_are_ints():
    # casts every raw metric value to an int, except F1000's "Yes" strings
    from couch_paginator import CouchPaginator

    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500
    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzzzzzzzzzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            row_count += 1
            item = row.doc
            save_me = False
            try:
                metric_names = item["metrics"].keys()
                for metric_name in metric_names:
                    raw = item["metrics"][metric_name]["values"]["raw"]
                    if "scopus:citations" in metric_name:
                        logger.info(item["metrics"][metric_name])
                    if isinstance(raw, basestring):
                        if raw != "Yes":
                            logger.info(u"casting to int")
                            item["metrics"][metric_name]["values"]["raw"] = int(raw)
                            raw = int(raw)
                            save_me = True
                    if not raw:
                        # drop zero-valued metrics entirely
                        logger.info(u"removing a zero")
                        del item["metrics"][metric_name]
                        save_me = True
                if save_me:
                    logger.info(u"saving")
                    db.save(item)
            except KeyError:
                pass

        logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

def delete_all_delicious_and_facebook():
    from couch_paginator import CouchPaginator

    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500
    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzzzzzzzzzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    number_saved = 0
    while page:
        for row in page:
            row_count += 1
            item = row.doc
            try:
                if item["metrics"]:
                    metric_names = item["metrics"].keys()
                    save_me = False
                    for metric_name in metric_names:
                        if metric_name.startswith("facebook") or metric_name.startswith("delicious"):
                            logger.info(u"{tiid} deleting {metric_name}".format(
                                metric_name=metric_name, tiid=row.id))
                            del item["metrics"][metric_name]
                            save_me = True
                    if save_me:
                        db.save(item)
                        number_saved += 1
            except (KeyError, TypeError):
                pass

        logger.info(u"number saved {num}".format(num=number_saved))
        logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

def delete_orphan_items():
    from couch_paginator import CouchPaginator

    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500
    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    number_deleted = 0
    date_deleted = collections.defaultdict(int)
    while page:
        for row in page:
            row_count += 1
            tiid = row.key[1]
            item = row.doc
            tiid_in_collection_response = db.view("tiids_in_collections/tiids_in_collections", include_docs=False, key=tiid)
            tiid_in_collection = tiid_in_collection_response.rows
            print tiid_in_collection
            if len(tiid_in_collection) > 0:
                logger.info(u"\nitem {tiid} is in a collection, not deleting".format(tiid=tiid))
            else:
                logger.info(u"\nitem {tiid} is not in a collection, deleting.".format(tiid=tiid))
                try:
                    #db.delete(item)
                    number_deleted += 1
                    date_deleted[item["created"][0:10]] += 1
                except (TypeError, couchdb.http.ResourceNotFound):
                    # happens sometimes if already deleted
                    pass

        logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "number of items deleted", number_deleted
    print date_deleted

def merge_collections_for_profile():
    from couch_paginator import CouchPaginator

    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500
    start_key = ["user", "00000000000"]
    end_key = ["user", "zzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            row_count += 1
            user_doc = row.doc

            if "profile_collection" in user_doc:
                # already updated
                if not user_doc["colls"]:
                    user_doc["profile_collection"] = None
                    print "updating profile_collection with None because no collections", row.id
                    db.save(user_doc)
                continue

            alias_tiid_tuples = []
            print "\nstill needs a profile_collection:", row.id,
            print user_doc
            try:
                my_collections = user_doc["colls"]
                for coll in my_collections:
                    collection_doc = db.get(coll)
                    alias_tiids = collection_doc["alias_tiids"]
                    alias_tiid_tuples += alias_tiids.items()

                profile_collection = None
                if len(my_collections) == 1:
                    profile_collection = collection_doc["_id"]
                    print "only one collection so merged collection not needed"
                elif len(my_collections) > 1:
                    merged_collection = make_similar_collection(collection_doc, alias_tiid_tuples)
                    # save the new merged collection
                    del collection_doc["_rev"]
                    try:
                        db.save(merged_collection)
                        print "saved merged collection", merged_collection["_id"]
                    except couchdb.http.ResourceConflict:
                        print "didn't save new merged collection because of document conflict... maybe already saved"
                    profile_collection = merged_collection["_id"]

                print profile_collection
                user_doc["profile_collection"] = profile_collection
                db.save(user_doc)
                print "saved user_doc with updated profile collection"
            except KeyError:
                raise

        logger.info("%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

def merge_collections_for_profile():
    from couch_paginator import CouchPaginator

    view_name = "queues/by_type_and_id"
    row_count = 0
    sql_statement_count = 0
    page_size = 500
    start_key = ["user", "00000000000"]
    end_key = ["user", "zzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    email_data_strings = []
    while page:
        for row in page:
            row_count += 1
            user_doc = row.doc
            rowdata = {}
            rowdata["email"] = user_doc["_id"]

            if not user_doc["profile_collection"]:
                # not migrating this doc because it has no collections
                continue

            rowdata["collection_id"] = user_doc["profile_collection"]
            try:
                rowdata["created"] = user_doc["created"]
            except KeyError:
                rowdata["created"] = datetime.datetime(2013, 1, 1).isoformat()
            rowdata["password_hash"] = default_password_hash
            rowdata["url_slug"] = "user" + str(50000 + row_count)
            rowdata["given_name"] = "Firstname"
            rowdata["surname"] = "Lastname"

            insert_unless_error(insert_string('"user"', rowdata.keys()), [rowdata])
            sql_statement_count += 1

            # pull information together to send out surveymonkey email
            profile_id = user_doc["profile_collection"]
            email = user_doc["_id"]
            profile_doc = db.get(profile_id)
            my_collections = user_doc["colls"]
            title = profile_doc["title"]
            if len(my_collections) > 1:
                title = ""
                for cid in my_collections:
                    coll_doc = db.get(cid)
                    collection_title = coll_doc["title"]
                    if collection_title != "My Collection":
                        title += "*" + collection_title
            try:
                collections_string = str(";".join(my_collections.keys()))
            except UnicodeEncodeError:
                print "UnicodeEncodeError on ", email, "so setting collections to blank"
                collections_string = ""

            email_data_strings += [u"{url_slug}|{profile_id}|{len_profile}|{email}|{created}|{title}|{collections_string}".format(
                url_slug=rowdata["url_slug"],
                profile_id=profile_id,
                email=email,
                len_profile=len(profile_doc["alias_tiids"]),
                created=rowdata["created"],
                title=title,
                collections_string=collections_string)]

        logger.info("%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "Number of rows: ", row_count
    print "Number of sql statements: ", sql_statement_count
    print "\n\n\n"
    for line in email_data_strings:
        print line

def remove_unused_item_doc_keys():
    from couch_paginator import CouchPaginator

    view_name = "queues/by_type_and_id"
    row_count = 0
    page_size = 500
    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzz"]

    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    number_edited = 0
    while page:
        for row in page:
            item = row.doc
            edited = False
            row_count += 1
            try:
                if "providers_run" in item:
                    del item["providers_run"]
                    edited = True
                if "providersRunCounter" in item:
                    del item["providersRunCounter"]
                    edited = True
                if "providersWithMetricsCount" in item:
                    del item["providersWithMetricsCount"]
                    edited = True
                if "created" in item["aliases"]:
                    del item["aliases"]["created"]
                    edited = True
                if "last_modified" in item["aliases"]:
                    del item["aliases"]["last_modified"]
                    edited = True
                if "h1" in item["biblio"]:
                    h1_orig = item["biblio"]["h1"]
                    h1_updated = item["biblio"]["h1"].strip()
                    if h1_updated:
                        if h1_updated != h1_orig:
                            item["biblio"]["h1"] = h1_updated
                            edited = True
                    else:
                        del item["biblio"]["h1"]
                        edited = True
            except TypeError:
                # item is sometimes NoneType
                pass

            if edited:
                print row.id
                print row.doc.keys(), row.doc["aliases"].keys(), row.doc["biblio"].keys()
                print item.keys(), item["aliases"].keys(), item["biblio"].keys()
                logger.info(u"saving modified item {tiid}\n".format(tiid=item["_id"]))
                number_edited += 1
                db.save(item)
            else:
                logger.info(u".")

        print "number edited = ", number_edited
        print "number items = ", row_count
        logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "number edited = ", number_edited
    print "number items = ", row_count