Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-b",
                        "--bypass-cert-verification",
                        action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-d",
                        "--offline_doaj",
                        help=ARG_HELP_STRINGS["offline_doaj"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-f",
                        "--force",
                        action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-i",
                        "--ignore-header",
                        action="store_true",
                        help=ARG_HELP_STRINGS["ignore_header"])
    parser.add_argument("-j",
                        "--force-header",
                        action="store_true",
                        help=ARG_HELP_STRINGS["force_header"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-u",
                        "--add-unknown-columns",
                        action="store_true",
                        help=ARG_HELP_STRINGS["unknown_columns"])
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-institution",
                        "--institution_column",
                        type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period",
                        "--period_column",
                        type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi",
                        "--doi_column",
                        type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro",
                        "--euro_column",
                        type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid",
                        "--is_hybrid_column",
                        type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher",
                        "--publisher_column",
                        type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title",
                        "--journal_full_title_column",
                        type=int,
                        help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-issn",
                        "--issn_column",
                        type=int,
                        help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-url",
                        "--url_column",
                        type=int,
                        help=ARG_HELP_STRINGS["url"])
    parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])

    args = parser.parse_args()
    enc = None  # CSV file encoding

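    # Log to stderr; the BufferedErrorHandler additionally collects error
    # messages and prints them again in one block when it is closed at the
    # end of the run.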
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(ANSIColorFormatter())
    bufferedHandler = BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            print "locale '{}' not found, normalized to '{}'".format(
                args.locale, norm)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            print "Using locale", loc
        except locale.Error as loce:
            print "Setting locale to " + norm + " failed: " + loce.message
            sys.exit()

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print("Encoding '{}' found in Python's codec collection " +
                  "as '{}'").format(args.encoding, codec.name)
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()

    if enc is None:
        enc = csv_analysis.enc
    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header or args.force_header

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

    doaj_offline_analysis = None
    if args.offline_doaj:
        if os.path.isfile(args.offline_doaj):
            doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj)
        else:
            oat.print_r("Error: " + args.offline_doaj + " does not seem "
                        "to be a file!")

    csv_file = open(args.csv_file, "r")
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    first_row = reader.next()
    num_columns = len(first_row)
    print "\nCSV file has {} columns.".format(num_columns)

    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

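    # Map each known column type to a CSVColumn that records its requirement
    # level and, if supplied on the command line, its index in the input file.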
    column_map = OrderedDict([
        ("institution",
         CSVColumn("institution", CSVColumn.MANDATORY,
                   args.institution_column)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY,
                             args.period_column)),
        ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)),
        ("is_hybrid",
         CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column)),
        ("publisher",
         CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column)),
        ("journal_full_title",
         CSVColumn("journal_full_title", CSVColumn.OPTIONAL,
                   args.journal_full_title_column)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE,
                                      None)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)),
        ("indexed_in_crossref",
         CSVColumn("indexed_in_crossref", CSVColumn.NONE, None)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None))
    ])

    # Do not quote the values in the 'period' and 'euro' columns
    quotemask = [True, False, False] + [True] * 14

    header = None
    if has_header:
        for row in reader:
            if not row:  # Skip empty lines
                continue
            header = row  # First non-empty row should be the header
            if args.ignore_header:
                print "Skipping header analysis due to command line argument."
                break
            else:
                print "\n    *** Analyzing CSV header ***\n"
            for (index, item) in enumerate(header):
                column_type = oat.get_column_type_from_whitelist(item)
                if column_type is not None and column_map[
                        column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    print("Found column named '{}' at index {}, " +
                          "assuming this to be the {} column.").format(
                              item, index, column_type)
            break

    print "\n    *** Starting heuristical analysis ***\n"
    for row in reader:
        if not row:  # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {"doi": [], "period": [], "euro": []}
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()]:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerical or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print("The entry in column {} looks like a " +
                          "DOI: {}").format(column_id, entry)
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print("The entry in column {} looks like a " +
                              "potential period: {}").format(column_id, entry)
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    # Plausible APC range - are there APCs above 6000€?
                    if maybe_euro >= 10 and maybe_euro <= 6000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print("The entry in column {} looks like a " +
                              "potential euro amount: {}").format(
                                  column_id, entry)
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        for column_type, candidates in column_candidates.iteritems():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print("Could not reliably identify the '" + column_type +
                      "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print "No candidate found for column '" + column_type + "'!"
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                print("Assuming column '{}' to be the '{}' " +
                      "column.").format(column_id, column_type)
                column_map[column_type].index = index
        break

    # Wrap up: Check if there are any mandatory column types left which have not
    # yet been identified - we cannot continue in that case (unless forced).
    unassigned = filter(
        lambda (k, v): v.requirement == CSVColumn.MANDATORY and v.index is None,
        column_map.iteritems())
    if unassigned:
        for item in unassigned:
            print "The {} column is still unidentified.".format(item[0])
        if header:
            print "The CSV header is:\n" + dialect.delimiter.join(header)
        if not args.force:
            print("ERROR: We cannot continue because not all mandatory " +
                  "column types in the CSV file could be automatically " +
                  "identified. There are 2 ways to fix this:")
            if not header:
                print(
                    "1) Add a header row to your file and identify the " +
                    "column(s) by assigning them an appropiate column name.")
            else:
                print(
                    "1) Identify the missing column(s) by assigning them " +
                    "a different column name in the CSV header (You can " +
                    "use the column name(s) mentioned in the message above)")
            print("2) Use command line parameters when calling this script " +
                  "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            print("WARNING: Not all mandatory column types in the CSV file " +
                  "could be automatically identified - forced to continue.")

    print "\n    *** CSV file analysis summary ***\n"

    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = "column number {} ({}) is the {} column '{}'".format(
                index, column_name, column.requirement, column.column_type)
            if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]:
                oat.print_g(msg)
            else:
                oat.print_b(msg)
        else:
            if args.add_unknown_columns:
                msg = (
                    "column number {} ({}) is an unknown column, it will be " +
                    "appended to the generated CSV file")
                oat.print_y(msg.format(index, column_name))
                if not column_name:
                    # Use a generic name
                    column_name = "unknown"
                while column_name in column_map.keys():
                    # TODO: Replace by a numerical, increasing suffix
                    column_name += "_"
                column_map[column_name] = CSVColumn(column_name,
                                                    CSVColumn.NONE, index)
            else:
                msg = (
                    "column number {} ({}) is an unknown column, it will be " +
                    "ignored")
                oat.print_y(msg.format(index, column_name))

    print ""
    for column in column_map.values():
        if column.index is None:
            msg = "The {} column '{}' could not be identified."
            print msg.format(column.requirement, column.column_type)

    # Check for unassigned optional column types. We can continue but should
    # issue a warning as all entries will need a valid DOI in this case.
    unassigned = filter(
        lambda (k, v): v.requirement == CSVColumn.OPTIONAL and v.index is None,
        column_map.iteritems())
    if unassigned:
        print("\nWARNING: Not all optional column types could be " +
              "identified. Metadata aggregation is still possible, but " +
              "every entry in the CSV file will need a valid DOI.")

    start = raw_input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = raw_input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print "\n    *** Starting metadata aggregation ***\n"

    enriched_content = []

    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue  # skip empty lines
        if not header_processed:
            header_processed = True
            enriched_content.append(column_map.keys())
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        if args.start and args.start > row_num:
            continue
        if args.end and args.end < row_num:
            continue
        print "---Processing line number " + str(row_num) + "---"
        enriched_row = oat.process_row(row, row_num, column_map, num_columns,
                                       doaj_offline_analysis,
                                       args.bypass_cert_verification)
        enriched_content.append(enriched_row)

    csv_file.close()

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True)
        writer.write_rows(enriched_content)

    if not bufferedHandler.buffer:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-O", "--offsetting_mode", help=ARG_HELP_STRINGS["offsetting"])
    parser.add_argument("-b", "--bypass-cert-verification", action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-f", "--force", action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-i", "--ignore-header", action="store_true",
                        help=ARG_HELP_STRINGS["ignore_header"])
    parser.add_argument("-j", "--force-header", action="store_true",
                        help=ARG_HELP_STRINGS["force_header"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-a", "--add-unknown-columns", action="store_true",
                        help=ARG_HELP_STRINGS["unknown_columns"])
    parser.add_argument("-d", "--dialect", choices=["excel", "excel-tab", "unix"],
                        help=ARG_HELP_STRINGS["dialect"])
    parser.add_argument("-v", "--verbose", action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-o", "--overwrite", action="store_true",
                        help=ARG_HELP_STRINGS["overwrite"])
    parser.add_argument("-u", "--update", action="store_true",
                        help=ARG_HELP_STRINGS["update"])
    parser.add_argument("-r", "--round_monetary", action="store_true",
                        help=ARG_HELP_STRINGS["round_monetary"])
    parser.add_argument("--no-crossref", action="store_true",
                        help=ARG_HELP_STRINGS["no_crossref"])
    parser.add_argument("--no-pubmed", action="store_true",
                        help=ARG_HELP_STRINGS["no_pubmed"])
    parser.add_argument("--no-doaj", action="store_true",
                        help=ARG_HELP_STRINGS["no_doaj"])
    parser.add_argument("-institution", "--institution_column", type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period", "--period_column", type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi", "--doi_column", type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro", "--euro_column", type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher", "--publisher_column", type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title", "--journal_full_title_column",
                        type=int, help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-book_title", "--book_title_column",
                        type=int, help=ARG_HELP_STRINGS["book_title"])
    parser.add_argument("-issn", "--issn_column",
                        type=int, help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-isbn", "--isbn_column",
                        type=int, help=ARG_HELP_STRINGS["isbn"])
    parser.add_argument("-backlist_oa", "--backlist_oa_column",
                        type=int, help=ARG_HELP_STRINGS["backlist_oa"])
    parser.add_argument("-additional_isbns", "--additional_isbn_columns", type=int, nargs='+',
                        help=ARG_HELP_STRINGS["additional_isbns"])
    parser.add_argument("-url", "--url_column",
                        type=int, help=ARG_HELP_STRINGS["url"])
    parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])

    args = parser.parse_args()

    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            msg = "Setting locale to {} failed: {}".format(norm, loce.message)
            oat.print_r(msg)
            sys.exit()

    enc = None # CSV file encoding
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file, enc=enc)
    if result["success"]:
        csv_analysis = result["data"]
        print(csv_analysis)
    else:
        print(result["error_msg"])
        sys.exit()

    if args.dialect:
        dialect = args.dialect
        oat.print_g('Dialect sniffing results ignored, using built-in CSV dialect "' + dialect + '"')
    else:
        dialect = csv_analysis.dialect

    if enc is None:
        enc = csv_analysis.enc
    has_header = csv_analysis.has_header or args.force_header

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

    csv_file = open(args.csv_file, "r", encoding=enc)
    reader = csv.reader(csv_file, dialect=dialect)

    first_row = next(reader)
    num_columns = len(first_row)
    print("\nCSV file has {} columns.".format(num_columns))

    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)

    if args.update and args.overwrite:
        oat.print_r("Error: Either use the -u or the -o option, not both.")
        sys.exit()

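    # Select the per-column overwrite strategy: -o sets OW_ALWAYS for every
    # column, a run without -u falls back to OW_ASK, and -u keeps the
    # defaults from OVERWRITE_STRATEGY.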
    if args.overwrite:
        for column in OVERWRITE_STRATEGY.keys():
            OVERWRITE_STRATEGY[column] = CSVColumn.OW_ALWAYS
    elif not args.update:
        for column in OVERWRITE_STRATEGY.keys():
            OVERWRITE_STRATEGY[column] = CSVColumn.OW_ASK

    additional_isbn_columns = []
    if args.additional_isbn_columns:
        for index in args.additional_isbn_columns:
            if index > num_columns:
                msg = "Error: Additional ISBN column index {} exceeds number of columns ({})."
                oat.print_r(msg.format(index, num_columns))
                sys.exit()
            else:
                additional_isbn_columns.append(index)

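    # Requirement levels are given per record type ("articles"/"books"):
    # e.g. is_hybrid is mandatory for articles but unused for books.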
    column_map = {
        "institution": CSVColumn("institution", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.institution_column, overwrite=OVERWRITE_STRATEGY["institution"]),
        "period": CSVColumn("period",{"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.period_column, overwrite=OVERWRITE_STRATEGY["period"]),
        "euro": CSVColumn("euro", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.euro_column, overwrite=OVERWRITE_STRATEGY["euro"]),
        "doi": CSVColumn("doi", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY}, args.doi_column, overwrite=OVERWRITE_STRATEGY["doi"]),
        "is_hybrid": CSVColumn("is_hybrid", {"articles": CSVColumn.MANDATORY, "books": CSVColumn.NONE}, args.is_hybrid_column, overwrite=OVERWRITE_STRATEGY["is_hybrid"]),
        "publisher": CSVColumn("publisher", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.publisher_column, overwrite=OVERWRITE_STRATEGY["publisher"]),
        "journal_full_title": CSVColumn("journal_full_title", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.journal_full_title_column, overwrite=OVERWRITE_STRATEGY["journal_full_title"]),
        "issn": CSVColumn("issn", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.issn_column, overwrite=OVERWRITE_STRATEGY["issn"]),
        "issn_print": CSVColumn("issn_print", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_print"]),
        "issn_electronic": CSVColumn("issn_electronic", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_electronic"]),
        "issn_l": CSVColumn("issn_l", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["issn_l"]),
        "license_ref": CSVColumn("license_ref", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE} , None, overwrite=OVERWRITE_STRATEGY["license_ref"]),
        "indexed_in_crossref": CSVColumn("indexed_in_crossref", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["indexed_in_crossref"]),
        "pmid": CSVColumn("pmid", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["pmid"]),
        "pmcid": CSVColumn("pmcid", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["pmcid"]),
        "ut": CSVColumn("ut", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["ut"]),
        "url": CSVColumn("url", {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE}, args.url_column, overwrite=OVERWRITE_STRATEGY["url"]),
        "doaj": CSVColumn("doaj", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["doaj"]),
        "agreement": CSVColumn("agreement", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["agreement"]),
        "book_title": CSVColumn("book_title", {"articles": CSVColumn.NONE, "books": CSVColumn.RECOMMENDED}, args.book_title_column, overwrite=OVERWRITE_STRATEGY["book_title"]),
        "backlist_oa": CSVColumn("backlist_oa", {"articles": CSVColumn.NONE, "books": CSVColumn.MANDATORY}, args.backlist_oa_column, overwrite=OVERWRITE_STRATEGY["backlist_oa"]),
        "isbn": CSVColumn("isbn", {"articles": CSVColumn.NONE, "books": CSVColumn.BACKUP}, args.isbn_column, overwrite=OVERWRITE_STRATEGY["isbn"]),
        "isbn_print": CSVColumn("isbn_print", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["isbn_print"]),
        "isbn_electronic": CSVColumn("isbn_electronic", {"articles": CSVColumn.NONE, "books": CSVColumn.NONE}, None, overwrite=OVERWRITE_STRATEGY["isbn_electronic"])
    }

    header = None
    if has_header:
        for row in reader:
            if not row: # Skip empty lines
                continue
            header = row # First non-empty row should be the header
            if args.ignore_header:
                print("Skipping header analysis due to command line argument.")
                break
            else:
                print("\n    *** Analyzing CSV header ***\n")
            for (index, item) in enumerate(header):
                if index in additional_isbn_columns:
                    msg = "Column named '{}' at index {} is designated as additional ISBN column"
                    print(msg.format(item, index))
                    continue
                column_type = oat.get_column_type_from_whitelist(item)
                if column_type is not None and column_map[column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    found_msg = ("Found column named '{}' at index {}, " +
                                 "assuming this to be the '{}' column.")
                    print(found_msg.format(item, index, column_type))
            break


    print("\n    *** Starting heuristical analysis ***\n")
    for row in reader:
        if not row: # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {
            "doi": [],
            "period": [],
            "euro": []
        }
        found_msg = "The entry in column {} looks like a potential {}: {}"
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()] + additional_isbn_columns:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerically or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print(found_msg.format(column_id, "DOI", entry))
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "year", entry))
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    if maybe_euro >= 10 and maybe_euro <= 10000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "euro amount", entry))
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        for column_type, candidates in column_candidates.items():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print("Could not reliably identify the '" + column_type +
                      "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print("No candidate found for column '" + column_type + "'!")
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                msg = "Assuming column '{}' to be the '{}' column."
                print(msg.format(column_id, column_type))
                column_map[column_type].index = index
        break

    print("\n    *** CSV file analysis summary ***\n")

    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = u"column number {} ({}) is the '{}' column ({})".format(
                index, column_name, column.column_type, column.get_req_description())
            print(msg)
        elif index in additional_isbn_columns:
            msg = u"column number {} ({}) is an additional ISBN column".format(index, column_name)
            oat.print_c(msg)
        else:
            if args.add_unknown_columns:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "appended to the generated CSV file")
                print(msg.format(index, column_name))
                if not column_name:
                    # Use a generic name
                    column_name = "unknown"
                while column_name in column_map.keys():
                    # TODO: Replace by a numerical, increasing suffix
                    column_name += "_"
                column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index)
            else:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "ignored")
                print(msg.format(index, column_name))

    print()
    for column in column_map.values():
        if column.index is None:
            msg = "The '{}' column could not be identified ({})"
            print(msg.format(column.column_type, column.get_req_description()))
    print()

    article_mand_missing = [x.column_type for x in column_map.values() if x.requirement["articles"] == CSVColumn.MANDATORY and x.index is None]
    article_back_missing = [x.column_type for x in column_map.values() if x.requirement["articles"] == CSVColumn.BACKUP and x.index is None]
    book_mand_missing = [x.column_type for x in column_map.values() if x.requirement["books"] == CSVColumn.MANDATORY and x.index is None]
    book_back_missing = [x.column_type for x in column_map.values() if x.requirement["books"] == CSVColumn.BACKUP and x.index is None]

    if article_mand_missing:
        msg = "Article enrichment is not possible - mandatory columns are missing ({})"
        oat.print_y(msg.format(", ".join(article_mand_missing)))
    elif article_back_missing:
        msg = "Article enrichment is possible, but backup columns are missing ({}) - each record will need a valid DOI"
        oat.print_b(msg.format(", ".join(article_back_missing)))
    else:
        oat.print_g("Article enrichment is possible with all backup columns in place")
    if book_mand_missing:
        msg = "Book enrichment is not possible - mandatory columns are missing ({})"
        oat.print_y(msg.format(", ".join(book_mand_missing)))
    elif book_back_missing:
        msg = "Book enrichment is possible, but backup columns are missing ({}) - each record will need a valid DOI"
        oat.print_b(msg.format(", ".join(book_back_missing)))
    else:
        oat.print_g("Book enrichment is possible with all backup columns in place")
    print()

    if article_mand_missing and book_mand_missing:
        if not args.force:
            oat.print_r("ERROR: Could not detect the minimum mandatory data set for any " + 
                  "publication type. There are 2 ways to fix this:")
            if not header:
                print("1) Add a header row to your file and identify the " +
                      "column(s) by assigning them an appropiate column name.")
            else:
                print("1) Identify the missing column(s) by assigning them " +
                      "a different column name in the CSV header (You can " +
                      "use the column name(s) mentioned in the message above)")
            print("2) Use command line parameters when calling this script " +
                  "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            oat.print_y("WARNING: Could not detect the minimum mandatory data set for any " + 
                  "publication type - forced to continue.")

    start = input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print("\n    *** Starting metadata aggregation ***\n")

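    # Enriched rows are collected per record type; each bucket starts with a
    # header row taken from the corresponding column schema.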
    enriched_content = {}
    for record_type, fields in oat.COLUMN_SCHEMAS.items():
        # add headers
        enriched_content[record_type] = {
            "count": 0,
            "content": [list(fields)]
        }

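    # Set up offline DOAB/DOAJ analysis; the dumps and the ISBN range file
    # live in a local "tempfiles" directory.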
    if not os.path.isdir("tempfiles"):
        os.mkdir("tempfiles")
    isbn_handling = oat.ISBNHandling("tempfiles/ISBNRangeFile.xml")
    doab_analysis = oat.DOABAnalysis(isbn_handling, "tempfiles/DOAB.csv", verbose=False)
    doaj_analysis = oat.DOAJAnalysis("tempfiles/DOAJ.csv")

    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue # skip empty lines
        if not header_processed:
            header_processed = True
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        if args.start and args.start > row_num:
            continue
        if args.end and args.end < row_num:
            continue
        print("---Processing line number " + str(row_num) + "---")
        result_type, enriched_row = oat.process_row(row, row_num, column_map, num_columns, additional_isbn_columns, doab_analysis, doaj_analysis,
                                                    args.no_crossref, args.no_pubmed,
                                                    args.no_doaj, args.round_monetary,
                                                    args.offsetting_mode)
        for record_type, value in enriched_content.items():
            if record_type == result_type:
                value["content"].append(enriched_row)
                value["count"] += 1
            else:
                empty_line = ["" for x in value["content"][0]]
                value["content"].append(empty_line)
    csv_file.close()

    for record_type, value in enriched_content.items():
        if value["count"] > 0:
            with open('out_' + record_type + '.csv', 'w') as out:
                writer = oat.OpenAPCUnicodeWriter(out, oat.OPENAPC_STANDARD_QUOTEMASK, 
                                                  True, True, True)
                writer.write_rows(value["content"])

    if not bufferedHandler.buffer:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-O", "--offsetting_mode", help=ARG_HELP_STRINGS["offsetting"])
    parser.add_argument("-b", "--bypass-cert-verification", action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-d", "--offline_doaj",
                        help=ARG_HELP_STRINGS["offline_doaj"])
    parser.add_argument("-D", "--offline_doaj_download",
                        help=ARG_HELP_STRINGS["offline_doaj_download"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-f", "--force", action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-i", "--ignore-header", action="store_true",
                        help=ARG_HELP_STRINGS["ignore_header"])
    parser.add_argument("-j", "--force-header", action="store_true",
                        help=ARG_HELP_STRINGS["force_header"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-u", "--add-unknown-columns", action="store_true",
                        help=ARG_HELP_STRINGS["unknown_columns"])
    parser.add_argument("-v", "--verbose", action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-o", "--overwrite", action="store_true",
                        help=ARG_HELP_STRINGS["overwrite"])
    parser.add_argument("-r", "--round_monetary", action="store_true",
                        help=ARG_HELP_STRINGS["round_monetary"])
    parser.add_argument("--no-crossref", action="store_true",
                        help=ARG_HELP_STRINGS["no_crossref"])
    parser.add_argument("--no-pubmed", action="store_true",
                        help=ARG_HELP_STRINGS["no_pubmed"])
    parser.add_argument("--no-doaj", action="store_true",
                        help=ARG_HELP_STRINGS["no_doaj"])
    parser.add_argument("-institution", "--institution_column", type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period", "--period_column", type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi", "--doi_column", type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro", "--euro_column", type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher", "--publisher_column", type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title", "--journal_full_title_column",
                        type=int, help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-issn", "--issn_column",
                        type=int, help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-url", "--url_column",
                        type=int, help=ARG_HELP_STRINGS["url"])
    parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])
    parser.add_argument("-q", "--quotemask", default="tfftttttttttttttttt",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-n", "--no-openapc-quote-rules", 
                        help=ARG_HELP_STRINGS["no_openapc_quote_rules"],
                        action="store_true", default=False)

    args = parser.parse_args()

    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)
    
    if args.offline_doaj and args.offline_doaj_download:
        oat.print_r("Error: Either use the -d or the -D option, not both.")
        sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            msg = "Setting locale to {} failed: {}".format(norm, loce.message)
            oat.print_r(msg)
            sys.exit()

    enc = None # CSV file encoding
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file, enc=enc)
    if result["success"]:
        csv_analysis = result["data"]
        print(csv_analysis)
    else:
        print(result["error_msg"])
        sys.exit()

    if enc is None:
        enc = csv_analysis.enc
    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header or args.force_header

    if enc is None:
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--enc argument")
        sys.exit()

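    # Validate the quotemask argument: it may only consist of 't'/'f'
    # characters, one per output column (t = quote, f = do not quote).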
    reduced = args.quotemask.replace("f", "").replace("t", "")
    if len(reduced) > 0:
        print("Error: A quotemask may only contain the letters 't' and "  +
              "'f'!")
        sys.exit()
    mask = [True if x == "t" else False for x in args.quotemask]

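    # -d reuses an existing local DOAJ dump, while -D downloads a fresh copy
    # to the given path first (the options are mutually exclusive, see the
    # check above).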
    doaj_offline_analysis = None
    if args.offline_doaj:
        if os.path.isfile(args.offline_doaj):
            doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj)
        else:
            oat.print_r("Error: " + args.offline_doaj + " does not seem "
                        "to be a file!")
            sys.exit()
    elif args.offline_doaj_download:
        if os.path.isfile(args.offline_doaj_download):
            oat.print_r("Error: Target file '" + args.offline_doaj_download + "' already exists!")
            sys.exit()
        doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj_download, download=True)

    csv_file = open(args.csv_file, "r", encoding=enc)
    reader = csv.reader(csv_file, dialect=dialect)

    first_row = next(reader)
    num_columns = len(first_row)
    print("\nCSV file has {} columns.".format(num_columns))

    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)

    if args.overwrite:
        ow_strategy = CSVColumn.OW_ALWAYS
    else:
        ow_strategy = CSVColumn.OW_ASK
        
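    # Two alternative column layouts: the offsetting map adds an "agreement"
    # column and does not treat "euro" as mandatory.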
    openapc_column_map = OrderedDict([
        ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column, overwrite=ow_strategy)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column, overwrite=ow_strategy)),
        ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column, overwrite=ow_strategy)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column, overwrite=ow_strategy)),
        ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column, overwrite=ow_strategy)),
        ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column, overwrite=ow_strategy)),
        ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL,
                                         args.journal_full_title_column, overwrite=ow_strategy)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column, overwrite=ow_strategy)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("issn_l", CSVColumn("issn_l", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column, overwrite=ow_strategy)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None, overwrite=ow_strategy))
    ])

    offsetting_column_map = OrderedDict([
        ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column, overwrite=ow_strategy)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column, overwrite=ow_strategy)),
        ("euro", CSVColumn("euro", CSVColumn.NONE, args.euro_column, overwrite=ow_strategy)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column, overwrite=ow_strategy)),
        ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column, overwrite=ow_strategy)),
        ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column, overwrite=ow_strategy)),
        ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL,
                                         args.journal_full_title_column, overwrite=ow_strategy)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column, overwrite=ow_strategy)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("issn_l", CSVColumn("issn_l", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column, overwrite=ow_strategy)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None, overwrite=ow_strategy)),
        ("agreement", CSVColumn("agreement", CSVColumn.NONE, None, overwrite=ow_strategy)),
    ])

    if args.offsetting_mode:
        column_map = offsetting_column_map
    else:
        column_map = openapc_column_map

    header = None
    if has_header:
        for row in reader:
            if not row: # Skip empty lines
                continue
            header = row # First non-empty row should be the header
            if args.ignore_header:
                print("Skipping header analysis due to command line argument.")
                break
            else:
                print("\n    *** Analyzing CSV header ***\n")
            for (index, item) in enumerate(header):
                column_type = oat.get_column_type_from_whitelist(item)
                if column_type is not None and column_map[column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    found_msg = ("Found column named '{}' at index {}, " +
                                 "assuming this to be the {} column.")
                    print(found_msg.format(item, index, column_type))
            break


    print("\n    *** Starting heuristical analysis ***\n")
    for row in reader:
        if not row: # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {
            "doi": [],
            "period": [],
            "euro": []
        }
        found_msg = "The entry in column {} looks like a potential {}: {}"
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()]:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerically or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print(found_msg.format(column_id, "DOI", entry))
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "year", entry))
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    if maybe_euro >= 10 and maybe_euro <= 10000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "euro amount", entry))
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        for column_type, candidates in column_candidates.items():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print("Could not reliably identify the '" + column_type +
                      "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print("No candidate found for column '" + column_type + "'!")
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                msg = "Assuming column '{}' to be the '{}' column."
                print(msg.format(column_id, column_type))
                column_map[column_type].index = index
        break

    # Wrap up: Check if there are any mandatory column types left which have not
    # yet been identified - we cannot continue in that case (unless forced).
    unassigned = [x for x in iter(column_map.items()) if x[1].requirement == CSVColumn.MANDATORY and x[1].index is None]
    if unassigned:
        for item in unassigned:
            print("The {} column is still unidentified.".format(item[0]))
        if header:
            print("The CSV header is:\n" + dialect.delimiter.join(header))
        if not args.force:
            print("ERROR: We cannot continue because not all mandatory " +
                  "column types in the CSV file could be automatically " +
                  "identified. There are 2 ways to fix this:")
            if not header:
                print("1) Add a header row to your file and identify the " +
                      "column(s) by assigning them an appropiate column name.")
            else:
                print("1) Identify the missing column(s) by assigning them " +
                      "a different column name in the CSV header (You can " +
                      "use the column name(s) mentioned in the message above)")
            print("2) Use command line parameters when calling this script " +
                  "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            print("WARNING: Not all mandatory column types in the CSV file " +
                  "could be automatically identified - forced to continue.")

    print("\n    *** CSV file analysis summary ***\n")

    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = u"column number {} ({}) is the {} column '{}'".format(
                index, column_name, column.requirement, column.column_type)
            if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]:
                oat.print_g(msg)
            else:
                oat.print_b(msg)
        else:
            if args.add_unknown_columns:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "appended to the generated CSV file")
                oat.print_y(msg.format(index, column_name))
                if not column_name:
                    # Use a generic name
                    column_name = "unknown"
                while column_name in column_map.keys():
                    # TODO: Replace by a numerical, increasing suffix
                    column_name += "_"
                column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index)
            else:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "ignored")
                oat.print_y(msg.format(index, column_name))

    print()
    for column in column_map.values():
        if column.index is None:
            msg = "The {} column '{}' could not be identified."
            print(msg.format(column.requirement, column.column_type))


    # Check for unassigned optional column types. We can continue but should
    # issue a warning as all entries will need a valid DOI in this case.
    unassigned = [x for x in column_map.items()
                  if x[1].requirement == CSVColumn.OPTIONAL and x[1].index is None]
    if unassigned:
        print ("\nWARNING: Not all optional column types could be " +
               "identified. Metadata aggregation is still possible, but " +
               "every entry in the CSV file will need a valid DOI.")

    start = input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print("\n    *** Starting metadata aggregation ***\n")

    enriched_content = []

    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue # skip empty lines
        if not header_processed:
            header_processed = True
            enriched_content.append(list(column_map.keys()))
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        if args.start and args.start > row_num:
            continue
        if args.end and args.end < row_num:
            continue
        print("---Processing line number " + str(row_num) + "---")
        enriched_row = oat.process_row(row, row_num, column_map, num_columns,
                                       args.no_crossref, args.no_pubmed,
                                       args.no_doaj, doaj_offline_analysis, args.round_monetary,
                                       args.offsetting_mode)
        enriched_content.append(enriched_row)

    csv_file.close()

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, 
                                          not args.no_openapc_quote_rules, True,
                                          True)
        writer.write_rows(enriched_content)

    if not bufferedHandler.buffer:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()