Example No. 1
def content_reports(fr, to, outdir):
    report = {}

    q = ContentByDate(fr, to)
    res = models.Suggestion.query(q=q.query())
    year_buckets = res.get("aggregations", {}).get("years",
                                                   {}).get("buckets", [])
    for years in year_buckets:
        ds = years.get("key_as_string")
        do = dates.parse(ds)
        year = do.year
        if year not in report:
            report[year] = {}
        country_buckets = years.get("countries", {}).get("buckets", [])
        for country in country_buckets:
            cc = country.get("key")
            cn = datasets.get_country_name(cc)
            if cn not in report[year]:
                report[year][cn] = {}
            count = country.get("doc_count")
            report[year][cn]["count"] = count

    table = _tabulate_time_entity_group(report, "Country")

    filename = "applications_by_year_by_country__" + _fft(fr) + "_to_" + _fft(
        to) + "__on_" + dates.today() + ".csv"
    outfiles = []
    outfile = os.path.join(outdir, filename)
    outfiles.append(outfile)
    with codecs.open(outfile, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        for row in table:
            writer.writerow(row)

    return outfiles
Example No. 2
def content_reports(fr, to, outdir):
    report = {}

    q = ContentByDate(fr, to)
    res = models.Suggestion.query(q=q.query())
    year_buckets = res.get("aggregations", {}).get("years", {}).get("buckets", [])
    for years in year_buckets:
        ds = years.get("key_as_string")
        do = dates.parse(ds)
        year = do.year
        if year not in report:
            report[year] = {}
        country_buckets = years.get("countries", {}).get("buckets", [])
        for country in country_buckets:
            cc = country.get("key")
            cn = datasets.get_country_name(cc)
            if cn not in report[year]:
                report[year][cn] = {}
            count = country.get("doc_count")
            report[year][cn]["count"] = count

    table = _tabulate_time_entity_group(report, "Country")

    filename = "applications_by_year_by_country__" + _fft(fr) + "_to_" + _fft(to) + "__on_" + dates.today() + ".csv"
    outfiles = []
    outfile = os.path.join(outdir, filename)
    outfiles.append(outfile)
    with codecs.open(outfile, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        for row in table:
            writer.writerow(row)

    return outfiles
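For reference, content_reports walks what looks like a date histogram aggregation with a nested per-country terms aggregation. A minimal sketch of the response shape it expects is below; the bucket values (dates, country codes, counts) are illustrative only, not taken from a real query.

# Illustrative shape of the aggregation response content_reports reads.
# Only the keys matter here; the dates, country codes and counts are made up.
res = {
    "aggregations": {
        "years": {
            "buckets": [
                {
                    "key_as_string": "2018-01-01T00:00:00Z",   # parsed with dates.parse(), year extracted
                    "countries": {
                        "buckets": [
                            {"key": "GB", "doc_count": 12},    # key is passed to datasets.get_country_name()
                            {"key": "BR", "doc_count": 7}
                        ]
                    }
                }
            ]
        }
    }
}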
Example No. 3
def applications_inconsistencies(outfile_later, outfile_missing, conn):
    with codecs.open(outfile_later, "wb", "utf-8") as f, codecs.open(outfile_missing, "wb", "utf-8") as g:

        out_later = csv.writer(f)
        out_later.writerow(["Application ID", "Application Last Updated", "Latest Provenance Recorded", "Difference"])

        out_missing = UnicodeWriter(g)
        out_missing.writerow(["Application ID", "Application Last Manual Update", "Latest Provenance Record", "ISSNs", "Title"])

        counter = 0
        for result in esprit.tasks.scroll(conn, "suggestion", keepalive="45m"):
            counter += 1
            application = Suggestion(**result)
            print counter, application.id

            # Part 1 - later provenance records exist
            latest_prov = Provenance.get_latest_by_resource_id(application.id)
            if latest_prov is not None:
                lustamp = adjust_timestamp(application.last_updated_timestamp, APP_TIMEZONE_CUTOFF)
                created = latest_prov.created_date
                pstamp = latest_prov.created_timestamp
                td = pstamp - lustamp
                diff = td.total_seconds()

                if diff > THRESHOLD:
                    out_later.writerow([application.id, application.last_updated, created, diff])

            # Part 2 - missing journals
            if application.application_status == constants.APPLICATION_STATUS_ACCEPTED:
                missing = False

                # find the matching journals by issn or by title
                matching_journals = Journal.find_by_issn(application.bibjson().issns())
                if len(matching_journals) == 0:
                    # Have another go, find by title
                    matching_journals = Journal.find_by_title(application.bibjson().title)

                # if there are no matching journals, it is missing.
                if len(matching_journals) == 0:
                    missing = True
                else:
                    # if there are matching journals, find out if any of them are in the doaj.  If none, then journal is still missing
                    those_in_doaj = len([j for j in matching_journals if j.is_in_doaj()])
                    if those_in_doaj == 0:
                        missing = True

                # if the journal is missing, record it
                if missing:
                    created = ""
                    if latest_prov is not None:
                        created = latest_prov.created_date
                    out_missing.writerow([application.id, application.last_manual_update, created, " ".join(application.bibjson().issns()), application.bibjson().title])

        print "processed", counter, "suggestions"
Example No. 4
def report_to_csv(result_generator, headers, output_map, output_file):
    with codecs.open(output_file, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(headers)

        for result in result_generator:
            record = output_map(result)
            row = []
            for h in headers:
                row.append(record[h])
            writer.writerow(row)
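report_to_csv is generic: it writes one row per result, pulling each cell from the dict returned by output_map, using the header strings as keys. A minimal usage sketch, with an entirely hypothetical generator, mapping function and file name, might look like this:

# Hypothetical usage of report_to_csv; none of these names exist in the codebase.
# output_map(result) must return a dict keyed by the same strings as `headers`,
# since report_to_csv looks up record[h] for every header, in order.
def iter_results():
    yield {"id": "abc123", "email": "someone@example.com"}

def to_record(result):
    return {
        "Account ID": result.get("id", ""),
        "Email Address": result.get("email", "")
    }

report_to_csv(iter_results(), ["Account ID", "Email Address"], to_record, "accounts.csv")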
Example No. 5
def make_csv(start, end, out):
    q = {
        "query" : {
            "bool" : {
                "must" : [
                    {"range" : {"last_updated" : {"gte" : start, "lte" : end}}},
                    {"term" : {"admin.application_status" : "rejected"}}
                ]
            }
        }
    }

    conn = esprit.raw.Connection(app.config.get("ELASTIC_SEARCH_HOST"), app.config.get("ELASTIC_SEARCH_DB"))

    with codecs.open(out, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(["ID", "Last Updated", "Is Quick Reject?", "Suggester Name", "Suggester Email", "Owner ID", "Owner Name", "Owner Email",
                         "Title", "ISSNS", "Quick Reject Note Date", "All Notes"])
        for source in esprit.tasks.scroll(conn, "suggestion", q):
            application = models.Suggestion(**source)

            qr_note = None
            for n in application.notes:
                if ": This application was rejected with the reason '" in n.get("note"):
                    qr_note = n
                    break

            owner_id = application.owner
            owner = None
            if owner_id is not None:
                owner = models.Account.pull(owner_id)

            summary = {
                "id" : application.id,
                "last_updated" : application.last_updated,
                "quick_reject_note_date" : "" if qr_note is None else qr_note.get("date"),
                "suggester_name" : application.suggester.get("name"),
                "suggester_email" : application.suggester.get("email"),
                "owner" : application.owner,
                "owner_name" : owner.name if owner is not None else "",
                "owner_email" : owner.email if owner is not None else "",
                "notes" : "\n".join([n.get("date") + " - " + n.get("note") for n in application.notes]),
                "is_quick_reject" : "True" if qr_note is not None else "",
                "title" : application.bibjson().title,
                "issns" : ",".join(application.bibjson().issns())
            }

            writer.writerow([summary["id"], summary["last_updated"], summary["is_quick_reject"], summary["suggester_name"],
                             summary["suggester_email"], summary["owner"], summary["owner_name"], summary["owner_email"],
                             summary["title"], summary["issns"], summary["quick_reject_note_date"], summary["notes"]])
Example No. 6
    def _create_article_csv(connection, file_object):
        """ Create a CSV file with the minimum information we require to find and report duplicates. """

        csv_writer = UnicodeWriter(file_object, quoting=csv.QUOTE_ALL)

        # Scroll through all articles, newest to oldest
        scroll_query = {
            "_source": [
                "id",
                "created_date",
                "bibjson.identifier",
                "bibjson.link",
                "bibjson.title",
                "admin.in_doaj"
            ],
            "query": {
                "match_all": {}
            },
            "sort": [
                {"last_updated": {"order": "desc"}}
            ]
        }

        count = 0
        for a in esprit.tasks.scroll(connection, 'article', q=scroll_query, page_size=1000, keepalive='1m'):
            row = [
                a['id'],
                a['created_date'],
                json.dumps(a['bibjson']['identifier']),
                json.dumps(a['bibjson'].get('link', [])),
                a['bibjson'].get('title', ''),
                json.dumps(a.get('admin', {}).get('in_doaj', ''))
            ]
            csv_writer.writerow(row)
            count += 1

        return count
Example No. 7
def provenance_reports(fr, to, outdir):
    pipeline = [
        ActionCounter("edit", "month"),
        ActionCounter("edit", "year"),
        StatusCounter("month"),
        StatusCounter("year")
    ]

    q = ProvenanceList(fr, to)
    for prov in models.Provenance.iterate(q.query()):
        for filt in pipeline:
            filt.count(prov)

    outfiles = []
    for p in pipeline:
        table = p.tabulate()
        outfile = os.path.join(outdir, p.filename(fr, to))
        outfiles.append(outfile)
        with codecs.open(outfile, "wb", "utf-8") as f:
            writer = UnicodeWriter(f)
            for row in table:
                writer.writerow(row)

    return outfiles
Example No. 10
                    yield publisher_account
                    break


if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    args = parser.parse_args()

    if not args.out:
        print "Please specify an output file path with the -o option"
        parser.print_help()
        exit()

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"], None, app.config["ELASTIC_SEARCH_DB"])

    with codecs.open(args.out, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(["ID", "Name", "Email", "Created", "Last Updated"])

        for account in publishers_with_journals():
            writer.writerow([
                account.id,
                account.name,
                account.email,
                account.created_date,
                account.last_updated
            ])
Example No. 11
def make_bulk_reapp_csvs():
    acc = models.Account.all()
    failed_bulk_reapps = []
    email_list_10_plus = []
    email_list_less_10 = []
    separator_list = [",", " or ", "/"]
    for a in acc:
        try:
            made, num = make_csv_for_account(a)
        except ReappCsvException as e:
            failed_bulk_reapps.append({e.account_id : e.message})
            continue

        contact = []
        emails = a.email

        for sep in separator_list:
            if isinstance(emails, basestring):
                if sep in emails:
                    emails = emails.split(sep)

        if made:
            if isinstance(emails, basestring):
                contact.append(a.id)
                contact.append(emails)
                email_list_10_plus.append(contact)
            else:
                for e in emails:
                    e = e.strip()
                    contact.append(a.id)
                    contact.append(e)
                    email_list_10_plus.append(contact)
                    contact = []

        elif num > 0:  # only add to the email list if they actually have suggestions at all
            for sep in separator_list:
                if isinstance(emails, basestring):
                    if sep in emails:
                        emails = emails.split(sep)

            if isinstance(emails, basestring):
                contact.append(a.id)
                contact.append(emails)
                email_list_less_10.append(contact)
            else:
                for e in emails:
                    e = e.strip()
                    contact.append(a.id)
                    contact.append(e)
                    email_list_less_10.append(contact)
                    contact = []

    with codecs.open('email_list_11_plus.csv', 'wb', encoding='utf-8') as csvfile:
        wr_writer = UnicodeWriter(csvfile, quoting=csv.QUOTE_ALL)
        wr_writer.writerows(email_list_10_plus)

    with codecs.open('email_list_less_11.csv', 'wb', encoding='utf-8') as csvfile:
        wr_writer = UnicodeWriter(csvfile, quoting=csv.QUOTE_ALL)
        wr_writer.writerows(email_list_less_10)

    if failed_bulk_reapps:
        print "Failed bulk reapplications"
        print failed_bulk_reapps
Example No. 12
if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    args = parser.parse_args()

    if not args.out:
        print "Please specify an output file path with the -o option"
        parser.print_help()
        exit()

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"], None, app.config["ELASTIC_SEARCH_DB"])

    with codecs.open(args.out, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(["ID", "Journal Name", "E-ISSN", "P-ISSN", "Article Count"])

        for j in esprit.tasks.scroll(conn, models.Journal.__type__, q=NOT_IN_DOAJ, page_size=100, keepalive='5m'):
            journal = models.Journal(_source=j)
            bibjson = journal.bibjson()
            issns = bibjson.issns()
            count = models.Article.count_by_issns(issns)

            if count > 0:
                writer.writerow([journal.id, bibjson.title, bibjson.get_one_identifier(bibjson.E_ISSN), bibjson.get_one_identifier(bibjson.P_ISSN), count])
Example No. 13
    parser = argparse.ArgumentParser()

    parser.add_argument("-o", "--out",
                        help="Output directory into which reports should be made (will be created if it doesn't exist)",
                        default="publisher_emails_" + dates.today())
    parser.add_argument("-e", "--email",
                        help="Send zip archived reports to email addresses configured via REPORTS_EMAIL_TO in settings",
                        action='store_true')
    args = parser.parse_args()

    outdir = args.out
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    filename = "publisher_emails_in_doaj_" + dates.today() + ".csv"
    outfile = os.path.join(outdir, filename)
    with codecs.open(outfile, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(["Account ID", "Email Address"])
        for p in publishers_with_journals():
            writer.writerow([p.id, p.email])

    if args.email:
        email_archive(outdir, "publisher_emails_in_doaj_" + dates.today())
Example No. 14
if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    args = parser.parse_args()

    if not args.out:
        print "Please specify an output file path with the -o option"
        parser.print_help()
        exit()

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"], None, app.config["ELASTIC_SEARCH_DB"])

    with codecs.open(args.out, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(["ID", "Name", "Email", "Created", "Last Updated", "Updated Since Create?", "Has Reset Token", "Reset Token Expired?"])

        for account in publishers_with_journals():

            has_reset = account.reset_token is not None
            is_expired = account.is_reset_expired() if has_reset is True else ""

            updated_since_create = account.created_timestamp < account.last_updated_timestamp

            writer.writerow([
                account.id,
                account.name,
                account.email,
                account.created_date,
                account.last_updated,
                updated_since_create,
                has_reset,
                is_expired
            ])
Example No. 15
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    args = parser.parse_args()

    if not args.out:
        print "Please specify an output file path with the -o option"
        parser.print_help()
        exit()

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"],
                                      None, app.config["ELASTIC_SEARCH_DB"])

    with codecs.open(args.out, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(
            ["ID", "Journal Name", "E-ISSN", "P-ISSN", "Article Count"])

        for j in esprit.tasks.scroll(conn,
                                     models.Journal.__type__,
                                     q=NOT_IN_DOAJ,
                                     page_size=100,
                                     keepalive='5m'):
            journal = models.Journal(_source=j)
            bibjson = journal.bibjson()
            issns = bibjson.issns()
            count = models.Article.count_by_issns(issns)

            if count > 0:
                writer.writerow([
                    journal.id, bibjson.title,
                    bibjson.get_one_identifier(bibjson.E_ISSN),
                    bibjson.get_one_identifier(bibjson.P_ISSN), count
                ])
Example No. 16
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    args = parser.parse_args()

    if not args.out:
        print "Please specify an output file path with the -o option"
        parser.print_help()
        exit()

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"],
                                      None, app.config["ELASTIC_SEARCH_DB"])

    with codecs.open(args.out, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow([
            "ID", "Name", "Email", "Created", "Last Updated",
            "Updated Since Create?", "Has Reset Token", "Reset Token Expired?"
        ])

        for account in publishers_with_journals():

            has_reset = account.reset_token is not None
            is_expired = account.is_reset_expired() if has_reset is True else ""

            updated_since_create = account.created_timestamp < account.last_updated_timestamp

            writer.writerow([
                account.id, account.name, account.email, account.created_date,
                account.last_updated, updated_since_create, has_reset,
                is_expired
            ])
Example No. 17
def make_csv(start, end, out):
    q = {
        "query": {
            "bool": {
                "must": [{
                    "range": {
                        "last_updated": {
                            "gte": start,
                            "lte": end
                        }
                    }
                }, {
                    "term": {
                        "admin.application_status": "rejected"
                    }
                }]
            }
        }
    }

    conn = esprit.raw.Connection(app.config.get("ELASTIC_SEARCH_HOST"),
                                 app.config.get("ELASTIC_SEARCH_DB"))

    with codecs.open(out, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow([
            "ID", "Last Updated", "Is Quick Reject?", "Suggester Name",
            "Suggester Email", "Owner ID", "Owner Name", "Owner Email",
            "Title", "ISSNS", "Quick Reject Note Date", "All Notes"
        ])
        for source in esprit.tasks.scroll(conn, "suggestion", q):
            application = models.Suggestion(**source)

            qr_note = None
            for n in application.notes:
                if ": This application was rejected with the reason '" in n.get("note"):
                    qr_note = n
                    break

            owner_id = application.owner
            owner = None
            if owner_id is not None:
                owner = models.Account.pull(owner_id)

            summary = {
                "id": application.id,
                "last_updated": application.last_updated,
                "quick_reject_note_date": "" if qr_note is None else qr_note.get("date"),
                "suggester_name": application.suggester.get("name"),
                "suggester_email": application.suggester.get("email"),
                "owner": application.owner,
                "owner_name": owner.name if owner is not None else "",
                "owner_email": owner.email if owner is not None else "",
                "notes": "\n".join([n.get("date") + " - " + n.get("note") for n in application.notes]),
                "is_quick_reject": "True" if qr_note is not None else "",
                "title": application.bibjson().title,
                "issns": ",".join(application.bibjson().issns())
            }

            writer.writerow([
                summary["id"], summary["last_updated"],
                summary["is_quick_reject"], summary["suggester_name"],
                summary["suggester_email"], summary["owner"],
                summary["owner_name"], summary["owner_email"],
                summary["title"], summary["issns"],
                summary["quick_reject_note_date"], summary["notes"]
            ])
Example No. 18
    def run(self):
        job = self.background_job
        params = job.params

        # Set up the files we need to run this task - a dir to place the report, and a place to write the article csv
        outdir = self.get_param(params, "outdir", "article_duplicates_" + dates.today())
        job.add_audit_message("Saving reports to " + outdir)
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        # Location for our interim CSV file of articles
        tmpdir = self.get_param(params, "tmpdir", 'tmp_article_duplicate_report')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        tmp_csvname = self.get_param(params, "article_csv", False)
        tmp_csvpath, total = self._make_csv_dump(tmpdir, tmp_csvname)

        # Initialise our reports
        global_reportfile = 'duplicate_articles_global_' + dates.today() + '.csv'
        global_reportpath = os.path.join(outdir, global_reportfile)
        f = codecs.open(global_reportpath, "wb", "utf-8")
        global_report = UnicodeWriter(f)
        header = ["article_id", "article_created", "article_doi", "article_fulltext", "article_owner", "article_issns", "article_in_doaj", "n_matches", "match_type", "match_id", "match_created", "match_doi", "match_fulltext", "match_owner", "match_issns", "match_in_doaj", "owners_match", "titles_match", "article_title", "match_title"]
        global_report.writerow(header)

        noids_reportfile = 'noids_' + dates.today() + '.csv'
        noids_reportpath = os.path.join(outdir, noids_reportfile)
        g = codecs.open(noids_reportpath, "wb", "utf-8")
        noids_report = UnicodeWriter(g)
        header = ["article_id", "article_created", "article_owner", "article_issns", "article_in_doaj"]
        noids_report.writerow(header)

        # Record the sets of duplicated articles
        global_matches = []

        a_count = 0

        articleService = DOAJ.articleService()

        # Read back in the article csv file we created earlier
        with codecs.open(tmp_csvpath, 'rb', 'utf-8') as t:
            article_reader = UnicodeReader(t)

            start = datetime.now()
            estimated_finish = ""
            for a in article_reader:
                if a_count > 1 and a_count % 100 == 0:
                    n = datetime.now()
                    diff = (n - start).total_seconds()
                    expected_total = ((diff / a_count) * total)
                    estimated_finish = dates.format(dates.after(start, expected_total))
                a_count += 1

                article = models.Article(_source={'id': a[0], 'created_date': a[1], 'bibjson': {'identifier': json.loads(a[2]), 'link': json.loads(a[3]), 'title': a[4]}, 'admin': {'in_doaj': json.loads(a[5])}})

                # Get the global duplicates
                try:
                    global_duplicates = articleService.discover_duplicates(article, owner=None, results_per_match_type=10000)
                except exceptions.DuplicateArticleException:
                    # this means the article did not have any ids that could be used for deduplication
                    owner = self._lookup_owner(article)
                    noids_report.writerow([article.id, article.created_date, owner, ','.join(article.bibjson().issns()), article.is_in_doaj()])
                    continue

                dupcount = 0
                if global_duplicates:

                    # Look up an article's owner
                    owner = self._lookup_owner(article)

                    # Deduplicate the DOI and fulltext duplicate lists
                    s = set([article.id] + [d.id for d in global_duplicates.get('doi', []) + global_duplicates.get('fulltext', [])])
                    dupcount = len(s) - 1
                    if s not in global_matches:
                        self._write_rows_from_duplicates(article, owner, global_duplicates, global_report)
                        global_matches.append(s)

                app.logger.debug('{0}/{1} {2} {3} {4} {5}'.format(a_count, total, article.id, dupcount, len(global_matches), estimated_finish))

        job.add_audit_message('{0} articles processed for duplicates. {1} global duplicate sets found.'.format(a_count, len(global_matches)))
        f.close()
        g.close()

        # Delete the transient temporary files.
        shutil.rmtree(tmpdir)

        # Email the reports if that parameter has been set.
        send_email = self.get_param(params, "email", False)
        if send_email:
            archive_name = "article_duplicates_" + dates.today()
            email_archive(outdir, archive_name)
            job.add_audit_message("email alert sent")
        else:
            job.add_audit_message("no email alert sent")
Example No. 19
                    break


if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument("-o", "--out",
                        help="Output directory into which reports should be made (will be created if it doesn't exist)",
                        default="publisher_emails_" + dates.today())
    parser.add_argument("-e", "--email",
                        help="Send zip archived reports to email addresses configured via REPORTS_EMAIL_TO in settings",
                        action='store_true')
    args = parser.parse_args()

    outdir = args.out
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    filename = "publisher_emails_in_doaj_" + dates.today() + ".csv"
    outfile = os.path.join(outdir, filename)
    with codecs.open(outfile, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(["Account ID", "Email Address"])
        for p in publishers_with_journals():
            writer.writerow([p.id, p.email])

    if args.email:
        email_archive(outdir, "publisher_emails_in_doaj_" + dates.today())
Example No. 20
def applications_inconsistencies(outfile_later, outfile_missing, conn):
    with codecs.open(outfile_later, "wb",
                     "utf-8") as f, codecs.open(outfile_missing, "wb",
                                                "utf-8") as g:

        out_later = csv.writer(f)
        out_later.writerow([
            "Application ID", "Application Last Updated",
            "Latest Provenance Recorded", "Difference"
        ])

        out_missing = UnicodeWriter(g)
        out_missing.writerow([
            "Application ID", "Application Last Manual Update",
            "Latest Provenance Record", "ISSNs", "Title"
        ])

        counter = 0
        for result in esprit.tasks.scroll(conn, "suggestion", keepalive="45m"):
            counter += 1
            application = Suggestion(**result)
            print counter, application.id

            # Part 1 - later provenance records exist
            latest_prov = Provenance.get_latest_by_resource_id(application.id)
            if latest_prov is not None:
                lustamp = adjust_timestamp(application.last_updated_timestamp,
                                           APP_TIMEZONE_CUTOFF)
                created = latest_prov.created_date
                pstamp = latest_prov.created_timestamp
                td = pstamp - lustamp
                diff = td.total_seconds()

                if diff > THRESHOLD:
                    out_later.writerow([
                        application.id, application.last_updated, created, diff
                    ])

            # Part 2 - missing journals
            if application.application_status == constants.APPLICATION_STATUS_ACCEPTED:
                missing = False

                # find the matching journals by issn or by title
                matching_journals = Journal.find_by_issn(
                    application.bibjson().issns())
                if len(matching_journals) == 0:
                    # Have another go, find by title
                    matching_journals = Journal.find_by_title(
                        application.bibjson().title)

                # if there are no matching journals, it is missing.
                if len(matching_journals) == 0:
                    missing = True
                else:
                    # if there are matching journals, find out if any of them are in the doaj.  If none, then journal is still missing
                    those_in_doaj = len(
                        [j for j in matching_journals if j.is_in_doaj()])
                    if those_in_doaj == 0:
                        missing = True

                # if the journal is missing, record it
                if missing:
                    created = ""
                    if latest_prov is not None:
                        created = latest_prov.created_date
                    out_missing.writerow([
                        application.id, application.last_manual_update,
                        created, " ".join(application.bibjson().issns()),
                        application.bibjson().title
                    ])

        print "processed", counter, "suggestions"
Example No. 21
                journal = models.Journal.pull(j)
                if journal is not None and journal.is_in_doaj():
                    yield publisher_account
                    break


if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    args = parser.parse_args()

    if not args.out:
        print "Please specify an output file path with the -o option"
        parser.print_help()
        exit()

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"],
                                      None, app.config["ELASTIC_SEARCH_DB"])

    with codecs.open(args.out, "wb", "utf-8") as f:
        writer = UnicodeWriter(f)
        writer.writerow(["ID", "Name", "Email", "Created", "Last Updated"])

        for account in publishers_with_journals():
            writer.writerow([
                account.id, account.name, account.email, account.created_date,
                account.last_updated
            ])