def journals_applications_provenance(outfile_applications, outfile_accounts, outfile_reapps, conn): with codecs.open(outfile_applications, "wb", "utf-8") as f, codecs.open( outfile_accounts, "wb", "utf-8") as g, codecs.open(outfile_reapps, "wb", "utf-8") as h: out_applications = csv.writer(f) out_applications.writerow([ "Journal ID", "Journal Created", "Journal Reapplied", "Application ID", "Application Last Updated", "Application Status", "Published Diff", "Latest Edit Recorded", "Latest Accepted Recorded" ]) out_accounts = csv.writer(g) out_accounts.writerow([ "Journal ID", "Journal Created", "Journal Reapplied", "In DOAJ", "Missing Account ID" ]) out_reapps = csv.writer(h) out_reapps.writerow([ "Journal ID", "Journal Created", "Journal Reapplied", "Application ID", "Application Created", "Application Last Updated", "Application Last Manual Update", "Application Status", "Published Diff" ]) counter = 0 for result in esprit.tasks.scroll(conn, "journal", keepalive="45m"): counter += 1 journal = Journal(**result) print counter, journal.id # first figure out if there is a broken related application issns = journal.bibjson().issns() applications = Suggestion.find_by_issn(issns) latest = None for application in applications: if latest is None: latest = application if application.last_updated_timestamp > latest.last_updated_timestamp: latest = application if latest is None: continue jcreated = journal.created_timestamp reapp = journal.last_update_request print counter, journal.id, reapp if reapp is not None: jcreated = datetime.strptime(reapp, "%Y-%m-%dT%H:%M:%SZ") jcreated = adjust_timestamp(jcreated, JOURNAL_TIMEZONE_CUTOFF) app_lustamp = adjust_timestamp(latest.last_updated_timestamp, APP_TIMEZONE_CUTOFF) # app_man_lustamp = latest.last_manual_update_timestamp # no need to adjust this one app_man_lustamp = adjust_timestamp( latest.last_manual_update_timestamp, APP_TIMEZONE_CUTOFF) td = jcreated - app_lustamp mtd = jcreated - app_man_lustamp diff = td.total_seconds() mdiff = mtd.total_seconds() # was the journal created after the application by greater than the threshold? if diff > THRESHOLD: last_edit = "" last_accept = "" edit_query = deepcopy(PROV_QUERY) edit_query["query"]["bool"]["must"][0]["term"][ "resource_id.exact"] = latest.id edit_query["query"]["bool"]["must"][1]["term"][ "action.exact"] = "edit" provs = Provenance.q2obj(q=edit_query) if len(provs) > 0: last_edit = provs[0].last_updated accept_query = deepcopy(PROV_QUERY) accept_query["query"]["bool"]["must"][0]["term"][ "resource_id.exact"] = latest.id accept_query["query"]["bool"]["must"][1]["term"][ "action.exact"] = "status:accepted" provs = Provenance.q2obj(q=accept_query) if len(provs) > 0: last_accept = provs[0].last_updated out_applications.writerow([ journal.id, journal.created_date, journal.last_update_request, latest.id, latest.last_updated, latest.application_status, diff, last_edit, last_accept ]) # was the journal (in doaj) created before the application by greater than the threshold, and is it in a state other than rejected if mdiff < -1 * THRESHOLD and latest.application_status != constants.APPLICATION_STATUS_REJECTED and journal.is_in_doaj( ): out_reapps.writerow([ journal.id, journal.created_date, journal.last_update_request, latest.id, latest.created_date, latest.last_updated, latest.last_manual_update, latest.application_status, mdiff ]) # now figure out if the account is missing owner = journal.owner if owner is None: out_accounts.writerow([ journal.id, journal.created_date, journal.last_update_request, str(journal.is_in_doaj()), "NO OWNER" ]) else: acc = Account.pull(owner) if acc is None: out_accounts.writerow([ journal.id, journal.created_date, journal.last_update_request, str(journal.is_in_doaj()), owner ]) print "processed", counter, "journals"
def journals_applications_provenance(outfile_applications, outfile_accounts, outfile_reapps, conn): with codecs.open(outfile_applications, "wb", "utf-8") as f, codecs.open(outfile_accounts, "wb", "utf-8") as g, codecs.open(outfile_reapps, "wb", "utf-8") as h: out_applications = csv.writer(f) out_applications.writerow(["Journal ID", "Journal Created", "Journal Reapplied", "Application ID", "Application Last Updated", "Application Status", "Published Diff", "Latest Edit Recorded", "Latest Accepted Recorded"]) out_accounts = csv.writer(g) out_accounts.writerow(["Journal ID", "Journal Created", "Journal Reapplied", "In DOAJ", "Missing Account ID"]) out_reapps = csv.writer(h) out_reapps.writerow(["Journal ID", "Journal Created", "Journal Reapplied", "Application ID", "Application Created", "Application Last Updated", "Application Last Manual Update", "Application Status", "Published Diff"]) counter = 0 for result in esprit.tasks.scroll(conn, "journal", keepalive="45m"): counter += 1 journal = Journal(**result) print counter, journal.id # first figure out if there is a broken related application issns = journal.bibjson().issns() applications = Suggestion.find_by_issn(issns) latest = None for application in applications: if latest is None: latest = application if application.last_updated_timestamp > latest.last_updated_timestamp: latest = application if latest is None: continue jcreated = journal.created_timestamp reapp = journal.last_update_request print counter, journal.id, reapp if reapp is not None: jcreated = datetime.strptime(reapp, "%Y-%m-%dT%H:%M:%SZ") jcreated = adjust_timestamp(jcreated, JOURNAL_TIMEZONE_CUTOFF) app_lustamp = adjust_timestamp(latest.last_updated_timestamp, APP_TIMEZONE_CUTOFF) # app_man_lustamp = latest.last_manual_update_timestamp # no need to adjust this one app_man_lustamp = adjust_timestamp(latest.last_manual_update_timestamp, APP_TIMEZONE_CUTOFF) td = jcreated - app_lustamp mtd = jcreated - app_man_lustamp diff = td.total_seconds() mdiff = mtd.total_seconds() # was the journal created after the application by greater than the threshold? if diff > THRESHOLD: last_edit = "" last_accept = "" edit_query = deepcopy(PROV_QUERY) edit_query["query"]["bool"]["must"][0]["term"]["resource_id.exact"] = latest.id edit_query["query"]["bool"]["must"][1]["term"]["action.exact"] = "edit" provs = Provenance.q2obj(q=edit_query) if len(provs) > 0: last_edit = provs[0].last_updated accept_query = deepcopy(PROV_QUERY) accept_query["query"]["bool"]["must"][0]["term"]["resource_id.exact"] = latest.id accept_query["query"]["bool"]["must"][1]["term"]["action.exact"] = "status:accepted" provs = Provenance.q2obj(q=accept_query) if len(provs) > 0: last_accept = provs[0].last_updated out_applications.writerow([journal.id, journal.created_date, journal.last_update_request, latest.id, latest.last_updated, latest.application_status, diff, last_edit, last_accept]) # was the journal (in doaj) created before the application by greater than the threshold, and is it in a state other than rejected if mdiff < -1 * THRESHOLD and latest.application_status != constants.APPLICATION_STATUS_REJECTED and journal.is_in_doaj(): out_reapps.writerow([journal.id, journal.created_date, journal.last_update_request, latest.id, latest.created_date, latest.last_updated, latest.last_manual_update, latest.application_status, mdiff]) # now figure out if the account is missing owner = journal.owner if owner is None: out_accounts.writerow([journal.id, journal.created_date, journal.last_update_request, str(journal.is_in_doaj()), "NO OWNER"]) else: acc = Account.pull(owner) if acc is None: out_accounts.writerow([journal.id, journal.created_date, journal.last_update_request, str(journal.is_in_doaj()), owner]) print "processed", counter, "journals"