def run_on_documents(func_page,
                     view_name,
                     start_key,
                     end_key,
                     row_count=0,
                     page_size=500):
    couch_page = CouchPaginator(db,
                                view_name,
                                page_size,
                                start_key=start_key,
                                end_key=end_key,
                                include_docs=True)

    while couch_page:
        func_page(couch_page)
        row_count += page_size

        logger.info("%i. getting new page" % (row_count))
        if couch_page.has_next:
            couch_page = CouchPaginator(db,
                                        view_name,
                                        page_size,
                                        start_key=couch_page.next,
                                        end_key=end_key,
                                        include_docs=True)
        else:
            couch_page = None

    print "number items = ", row_count
Example #2
def fix_github_year():
    from totalimpact import item, tiredis
    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"), db=0)

    view_name = "queues/by_alias"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500
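    # CouchDB view keys sort roughly lexicographically, so this pair of compound
    # keys brackets every ["url", ...] alias that starts with "https://github."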
    start_key = ["url", "https://github.0000000"]
    end_key = ["url", "https://github.zzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            doc = row.doc
            print row.id
            try:
                doc["biblio"]["year"] = doc["biblio"]["create_date"][0:4]
                db.save(doc)
            except KeyError:
                pass
            row_count += 1
            print "."
        logger.info(u"%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "number items = ", row_count
def dedup_merged_collections():
    from totalimpact import item, tiredis

    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500
    start_key = ["collection", "000"]
    end_key = ["collection", "00zz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    number_of_collections = {}
    size_of_collections = {}
    size_of_profile_collections = {}

    while page:
        for row in page:
            dedup_collections(row.id)
            #doc = row.doc

            row_count += 1
        logger.info("%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None
Example #4
def delete_test_collections():
    view_name = "queues/latest-collections"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500

    start_key = [1, "000000000"]
    end_key = [1, "9999999"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)
    number_deleted = 0
    number_items = 0

    while page:
        for row in page:
            row_count += 1
            collection = row.doc
            try:
                logger.info(u"deleting test collection {cid}:{title}".format(
                    cid=collection["_id"], title=collection["title"]))
                number_deleted += 1
                number_items += len(collection["alias_tiids"])
                #db.delete(collection)   # uncomment to actually delete; as written this is a dry run
            except TypeError:  # row.doc can be None; skip the row instead of aborting the whole run
                pass
        logger.info(u"%i. getting new page, last id was %s" % (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "number deleted = ", number_deleted
    print "number items = ", number_items
Example #5
def update_github():
    from totalimpact import item, tiredis
    myredis = tiredis.from_url(os.getenv("REDISTOGO_URL"), db=0)

    view_name = "queues/by_alias"
    view_rows = db.view(view_name, include_docs=False)
    row_count = 0
    page_size = 500
    start_key = ["url", "https://github.0000000"]
    end_key = ["url", "https://github.zzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=False, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            tiid = row.id
            item.start_item_update([tiid], myredis, db, sleep_in_seconds=0.05)                        
            row_count += 1
            print "."
        logger.info(u"%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=False)  # ids only, matching the first page
        else:
            page = None

    print "number items = ", row_count
Example #6
def put_snaps_in_items():
    from totalimpact import item as item_module  # imported once, up front, so the item doc below can't shadow it

    logger.debug(u"running put_snaps_in_items() now.")
    starttime = time.time()
    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500

    start_key = ["metric_snap", "000000000"]
    end_key = ["metric_snap", "zzzzzzzzzzzzzzzzzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    #for row in view_rows[startkey:endkey]:
    while page:
        for row in page:
            if not "metric_snap" in row.key[0]:
                #print "not a metric_snap so skipping", row.key
                continue
            #print row.key
            row_count += 1
            snap = row.doc
            item_doc = db.get(snap["tiid"])  # named item_doc so it doesn't clobber the item module

            if item_doc:
                saving = True
                while saving:
                    try:
                        updated_item = item_module.add_snap_data(item_doc, snap)

                        # give the updated item the newer of the two last-modified dates
                        snap_last_modified = snap["created"]
                        item_last_modified = item_doc["last_modified"]
                        updated_item["last_modified"] = max(snap_last_modified, item_last_modified)
                        
                        logger.info(u"now on snap row %i, saving item %s back to db, deleting snap %s" % 
                            (row_count, updated_item["_id"], snap["_id"]))

                        db.save(updated_item)
                        #db.delete(snap)
                        saving = False
                    except couchdb.http.ResourceConflict:
                        logger.warning(u"couch conflict.  trying again")
                        pass
            else:
                logger.warning(u"now on snap row %i, couldn't get item %s for snap %s" % 
                    (row_count, snap["tiid"], snap["_id"]))

        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    logger.info(u"updated {rows} rows in {elapsed} seconds".format(
        rows=row_count, elapsed=round(time.time() - starttime)
    ))
Example #7
def run_on_documents(func_page,
                     view_name,
                     start_key,
                     end_key,
                     skip_till_key,
                     row_count=0,
                     page_size=500):
    couch_page = CouchPaginator(couch_db,
                                view_name,
                                page_size,
                                start_key=start_key,
                                end_key=end_key,
                                include_docs=True)
    start_time = datetime.datetime.now()

    print "starting to loop through first {page_size} from {start_key} to {end_key}".format(
        page_size=page_size, start_key=start_key, end_key=end_key)

    while couch_page:
        func_page(couch_page, skip_till_key)
        row_count += page_size

        logger.info("%i. getting new page" % (row_count))
        elapsed_time = datetime.datetime.now() - start_time
        number_db_threads = max(1, threading.active_count() - 1)  # every thread but the main one
        print "\n****** {timestamp} {start_key} took {elapsed_seconds} seconds to do {row_count} docs, so {minutes_per_10k} minutes per 10k docs per thread, {total}mins total *****".format(
            timestamp=datetime.datetime.now().isoformat(),
            start_key=start_key,
            row_count=row_count,
            elapsed_seconds=elapsed_time.seconds,
            minutes_per_10k=(elapsed_time.seconds) * 10000 / (row_count * 60),
            total=((elapsed_time.seconds) * 10000 /
                   (row_count * 60)) / number_db_threads)

        if couch_page.has_next:
            couch_page = CouchPaginator(couch_db,
                                        view_name,
                                        page_size,
                                        start_key=couch_page.next,
                                        end_key=end_key,
                                        include_docs=True)
        else:
            couch_page = None

    print "number items = ", row_count
    elapsed_time = datetime.datetime.now() - start_time

    print "took {elapsed_time} to do {row_count}".format(
        row_count=row_count, elapsed_time=elapsed_time)

    db.session.remove()
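
The threading.active_count() bookkeeping above suggests this driver is run from several threads at once; here is a sketch of such a fan-out (the key ranges and the noop_page callback are illustrative assumptions).

import threading

def noop_page(couch_page, skip_till_key):
    # hypothetical func_page callback; real per-row work goes here
    for row in couch_page:
        pass

ranges = [("000", "7zz"), ("800", "zzz")]
threads = [threading.Thread(target=run_on_documents,
                            args=(noop_page, "queues/by_type_and_id",
                                  ["item", lo], ["item", hi], None))
           for (lo, hi) in ranges]
for t in threads:
    t.start()
for t in threads:
    t.join()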
Example #8
def lowercase_aliases():
    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500
    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)
    number_edited = 0

    while page:
        for row in page:
            item = row.doc
            row_count += 1
            if not item:
                continue
            if "aliases" in item:
                orig_aliases_dict = item["aliases"]

                lowercase_aliases_dict = {}
                for orig_namespace in orig_aliases_dict:
                    lowercase_namespace = orig_namespace.lower()
                    if lowercase_namespace == "doi":
                        lowercase_aliases_dict[lowercase_namespace] = [doi.lower() for doi in orig_aliases_dict[orig_namespace]]
                    else:
                        lowercase_aliases_dict[lowercase_namespace] = orig_aliases_dict[orig_namespace]

                if orig_aliases_dict != lowercase_aliases_dict:
                    print "\ndifference detected \n{orig}\n{lower}\n".format(
                        orig=orig_aliases_dict, lower=lowercase_aliases_dict)
                    item["aliases"] = lowercase_aliases_dict
                    number_edited += 1
                    db.save(item)
                else:
                    logger.info(u".")
                        
        print "number edited = ", number_edited
        print "number items = ", row_count
        logger.info(u"%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "number edited = ", number_edited
    print "number items = ", row_count
Example #9
def ensure_all_metric_values_are_ints():
    # cast string metric values to ints (F1000's "Yes" values are the exception and stay strings)
    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500

    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzzzzzzzzzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    while page:
        for row in page:
            row_count += 1
            item = row.doc
            save_me = False
            try:
                metric_names = item["metrics"].keys()
                for metric_name in metric_names:
                    raw = item["metrics"][metric_name]["values"]["raw"]
                    if "scopus:citations" in metric_name:
                        logger.info(item["metrics"][metric_name])
                    if isinstance(raw, basestring):
                        if raw != "Yes":
                            logger.info(u"casting to int")
                            #logger.info(item)
                            item["metrics"][metric_name]["values"]["raw"] = int(raw)
                            raw = int(raw)
                            save_me=True
                    if not raw:
                        logger.info(u"removing a zero")
                        #logger.info(item)
                        del item["metrics"][metric_name]
                        save_me=True
                if save_me:
                    logger.info(u"saving")
                    db.save(item)
                else:
                    #logger.info(u"%i." %row_count)
                    pass
            except KeyError:
                pass
        logger.info(u"%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None
Example #10
def delete_all_delicious_and_facebook():
    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500

    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzzzzzzzzzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    number_saved = 0
    while page:
        for row in page:
            row_count += 1
            item = row.doc
            try:
                if item["metrics"]:
                    metric_names = item["metrics"].keys()
                    save_me = False
                    for metric_name in metric_names:
                        if metric_name.startswith("facebook") or metric_name.startswith("delicious"):
                            logger.info(u"{tiid} deleting {metric_name}".format(
                                metric_name=metric_name,
                                tiid=row.id))
                            del item["metrics"][metric_name]
                            save_me=True
                    if save_me:                    
                        db.save(item)
                        number_saved += 1
            except (KeyError, TypeError):
                pass
        logger.info(u"number saved {num}".format(
            num=number_saved))
        logger.info(u"%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None
Example #11
def delete_orphan_items():
    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500

    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)
    number_deleted = 0
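    # tally deletions by each item's creation date (first 10 chars, YYYY-MM-DD)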
    date_deleted = collections.defaultdict(int)

    while page:
        for row in page:
            row_count += 1
            tiid = row.key[1]
            item = row.doc

            tiid_in_collection_response = db.view("tiids_in_collections/tiids_in_collections", include_docs=False, key=tiid)
            tiid_in_collection = tiid_in_collection_response.rows
            print tiid_in_collection
            if len(tiid_in_collection) > 0:
                logger.info(u"\nitem {tiid} is in a collection, not deleting".format(tiid=tiid))
            else:
                logger.info(u"\nitem {tiid} is not in a collection, deleting.".format(tiid=tiid))
                try:
                    #db.delete(item)   # uncomment to actually delete; the counts below still track the dry run
                    number_deleted += 1
                    date_deleted[item["created"][0:10]] += 1
                except (TypeError, couchdb.http.ResourceNotFound):  # happens sometimes if already deleted
                    pass

        logger.info(u"%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None
        print "number of items deleted", number_deleted
        print date_deleted
Example #12
def merge_collections_for_profile():
    from totalimpact import item, tiredis

    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500
    start_key = ["user", "00000000000"]
    end_key = ["user", "zzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db,
                          view_name,
                          page_size,
                          include_docs=True,
                          start_key=start_key,
                          end_key=end_key)

    while page:
        for row in page:
            row_count += 1
            user_doc = row.doc

            if "profile_collection" in user_doc:
                #already updated
                if not user_doc["colls"]:
                    user_doc["profile_collection"] = None
                    print "updating profile_collection with None because no collections", row.id
                    db.save(user_doc)
                continue

            alias_tiid_tuples = []

            print "\nstill needs a profile_collection:", row.id,
            print user_doc

            try:
                my_collections = user_doc["colls"]
                for coll in my_collections:
                    collection_doc = db.get(coll)
                    alias_tiids = collection_doc["alias_tiids"]
                    alias_tiid_tuples += alias_tiids.items()

                profile_collection = None
                if (len(my_collections) == 1):
                    profile_collection = collection_doc["_id"]
                    print "only one collection so merged collection not needed"
                elif (len(my_collections) > 1):
                    merged_collection = make_similar_collection(
                        collection_doc, alias_tiid_tuples)

                    #save new collection
                    del collection_doc["_rev"]
                    try:
                        db.save(merged_collection)
                        print "saved merged collection", merged_collection[
                            "_id"]
                    except couchdb.http.ResourceConflict:
                        print "didn't save new merged collection because of document conflict... maybe already saved"

                    profile_collection = merged_collection["_id"]

                print profile_collection
                user_doc["profile_collection"] = profile_collection
                db.save(user_doc)
                print "saved user_doc with updated profile collection"
            except KeyError:
                raise
        logger.info("%i. getting new page, last id was %s" %
                    (row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db,
                                  view_name,
                                  page_size,
                                  start_key=page.next,
                                  end_key=end_key,
                                  include_docs=True)
        else:
            page = None
Example #13
def merge_collections_for_profile():
    from totalimpact import item, tiredis

    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    sql_statement_count = 0
    page_size = 500
    start_key = ["user", "00000000000"]
    end_key = ["user", "zzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)

    email_data_strings = []

    while page:
        for row in page:

            row_count += 1
            user_doc = row.doc

            rowdata = {}
            rowdata["email"] = user_doc["_id"]
            if not user_doc["profile_collection"]:
                #print "not migrating this doc because it has no collections"
                continue
            rowdata["collection_id"] = user_doc["profile_collection"]
            try:
                rowdata["created"] = user_doc["created"]
            except KeyError:
                rowdata["created"] = datetime.datetime(2013, 1, 1).isoformat()                
            rowdata["password_hash"] = default_password_hash
            rowdata["url_slug"] = "user" + str(50000 + row_count)
            rowdata["given_name"] = "Firstname"
            rowdata["surname"] = "Lastname"

            insert_unless_error(insert_string('"user"', rowdata.keys()), [rowdata])
            sql_statement_count += 1

            # pull information together to send out surveymonkey email
            profile_id = user_doc["profile_collection"]
            email = user_doc["_id"]
            profile_doc = db.get(profile_id)
            my_collections = user_doc["colls"]

            title = profile_doc["title"]
            if (len(my_collections) > 1):
                title = ""
                for cid in my_collections:
                    coll_doc = db.get(cid)
                    collection_title = coll_doc["title"]
                    if collection_title != "My Collection":
                        title += "*" + collection_title

            try:
                collections_string = str(";".join(my_collections.keys()))
            except UnicodeEncodeError:
                print "UnicodeEncodeError on ", email, "so setting collections to blank"
                collections_string = ""

            email_data_strings += [u"{url_slug}|{profile_id}|{len_profile}|{email}|{created}|{title}|{collections_string}".format(
                url_slug=rowdata["url_slug"],
                profile_id=profile_id,
                email=email,
                len_profile=len(profile_doc["alias_tiids"]),
                created=rowdata["created"],
                title=title,
                collections_string=collections_string)]

        logger.info("%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "Number of rows: ", row_count
    print "Number of sql statements: ", sql_statement_count

    print "\n\n\n"
    for line in email_data_strings:
        print line
Example #14
def remove_unused_item_doc_keys():
    view_name = "queues/by_type_and_id"
    view_rows = db.view(view_name, include_docs=True)
    row_count = 0
    page_size = 500
    start_key = ["item", "000000000"]
    end_key = ["item", "zzzzzzzzzz"]

    from couch_paginator import CouchPaginator
    page = CouchPaginator(db, view_name, page_size, include_docs=True, start_key=start_key, end_key=end_key)
    number_edited = 0

    while page:
        for row in page:
            item = row.doc
            edited = False
            row_count += 1
            try:
                if "providers_run" in item:
                    del item["providers_run"]
                    edited = True
                if "providersRunCounter" in item:
                    del item["providersRunCounter"]
                    edited = True
                if "providersWithMetricsCount" in item:
                    del item["providersWithMetricsCount"]
                    edited = True
                if "created" in item["aliases"]:
                    del item["aliases"]["created"]
                    edited = True
                if "last_modified" in item["aliases"]:
                    del item["aliases"]["last_modified"]
                    edited = True
                if "h1" in item["biblio"]:
                    h1_orig = item["biblio"]["h1"]
                    h1_updated = item["biblio"]["h1"].strip()
                    if h1_updated:
                        if h1_updated != h1_orig:
                            item["biblio"]["h1"] = h1_updated    
                            edited = True
                    else:                        
                        del item["biblio"]["h1"]
                        edited = True
            except TypeError:  #item sometimes NoneType
                pass

            if edited:
                print row.id
                print row.doc.keys(), row.doc["aliases"].keys(), row.doc["biblio"].keys()
                print item.keys(), item["aliases"].keys(), item["biblio"].keys()
                logger.info(u"saving modified item {tiid}\n".format(
                    tiid=item["_id"]))
                number_edited += 1
                db.save(item)
            else:
                logger.info(u".")

        print "number edited = ", number_edited
        print "number items = ", row_count
        logger.info(u"%i. getting new page, last id was %s" %(row_count, row.id))
        if page.has_next:
            page = CouchPaginator(db, view_name, page_size, start_key=page.next, end_key=end_key, include_docs=True)
        else:
            page = None

    print "number edited = ", number_edited
    print "number items = ", row_count