Example #1
0
def do_many(dir_path, limit=None, random_order=False, status_interval=100):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents]. Parses each .xml
    document, instantiates the associated model object, and saves the object.
    Prints/logs status updates and tracebacks instead of raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files in random order.
    :param status_interval: How often a status update will be given.
    """
    if limit:
        total = limit
    elif not random_order:
        print "Getting an initial file count ..."
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, '*.xml'))
    else:
        total = None
    # go through the files, yielding parsed files and printing status updates as we go
    count = 0
    for path in file_generator(dir_path, random_order, limit):
        # grab the fallback text from the path if it's there
        court_fallback = ''
        matches = re.compile('data/([a-z_]+?/[a-z_]+?)/').findall(path)
        if matches:
            court_fallback = matches[0]
        # try to parse/save the case and print any exceptions with full tracebacks
        try:
            parsed = parse_file(path, court_fallback=court_fallback)
            make_and_save(parsed)
        except Exception as e:
            # print simple exception summaries for known problems
            if 'mismatched tag' in str(e):
                print "Mismatched tag exception encountered in file '%s':%s" % (
                    path, str(e).split(':', 1)[1])
            elif 'Failed to get a citation' in str(e):
                print "Exception in file '%s': %s" % (path, str(e))
            else:
                # otherwise, print generic traceback
                print
                print "Exception encountered in file '%s':" % path
                print traceback.format_exc()
                print
        # status update
        count += 1
        if count % status_interval == 0:
            if total:
                print "Finished %s out of %s files." % (count, total)
            else:
                print "Finished %s files." % count
def do_many(dir_path, limit=None, random_order=False, status_interval=100):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents]. Parses each .xml
    document, instantiates the associated model object, and saves the object.
    Prints/logs status updates and tracebacks instead of raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files in random order.
    :param status_interval: How often a status update will be given.
    """
    if limit:
        total = limit
    elif not random_order:
        print "Getting an initial file count ..."
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, '*.xml'))
    else:
        total = None
    # go through the files, yielding parsed files and printing status updates as we go
    count = 0
    for path in file_generator(dir_path, random_order, limit):
        # grab the fallback text from the path if it's there
        court_fallback = ''
        matches = re.compile('data/([a-z_]+?/[a-z_]+?)/').findall(path)
        if matches:
            court_fallback = matches[0]
        # try to parse/save the case and print any exceptions with full tracebacks
        try:
            parsed = parse_file(path, court_fallback=court_fallback)
            make_and_save(parsed)
        except Exception as e:
            # print simple exception summaries for known problems
            if 'mismatched tag' in str(e):
                print "Mismatched tag exception encountered in file '%s':%s" % (path, str(e).split(':', 1)[1])
            elif 'Failed to get a citation' in str(e):
                print "Exception in file '%s': %s" % (path, str(e))
            else:
                # otherwise, print generic traceback
                print
                print "Exception encountered in file '%s':" % path
                print traceback.format_exc()
                print
        # status update
        count += 1
        if count % status_interval == 0:
            if total:
                print "Finished %s out of %s files." % (count, total)
            else:
                print "Finished %s files." % count
Example #3
0
def do_many(dir_path, limit, random_order, status_interval, log_file, newcases,
            skipdupes, skip_newcases, avoid_nocites, courtdates, startfolder,
            startfile, debug):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents].
    Parses each .xml document, instantiates the associated model object, and
    saves the object. Prints/logs status updates and tracebacks instead of
    raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run
    through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files in
    random order.
    :param status_interval: How often a status update will be given.
    :param log_file: If not None, file paths that raise Exceptions will be
    logged to this file.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.    
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case with no cite.
    :param courtdates: If true, skip cases with dates before court established.
    :param startfolder: If not None, start on startfolder
    :param startfile: If not None, start on this file (for resuming)
    """
    if limit:
        total = limit
    elif not random_order:
        print("Getting an initial file count ...")
        print
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, '*.xml'))
    else:
        total = None
    log = None
    if log_file:
        print("Logging problematic file paths to '%s' ..." % log_file)
        print
        log = logging.getLogger(__name__)
        log.setLevel(logging.INFO)
        log.addHandler(logging.FileHandler(log_file))
    # go through the files, yielding parsed files and printing status updates as
    # we go
    folders = glob(dir_path + '/*')
    folders.sort()
    count = 0

    # get earliest dates for each court
    if newcases:
        print('Only new cases: getting earliest dates by court.')
        min_dates = get_min_dates()
    else:
        min_dates = None

    if avoid_nocites:
        if newcases:
            raise Exception(
                "Cannot use both avoid_nocites and newcases options.")
        print(
            'Avoiding no cites: getting earliest dates by court with no citation.'
        )
        min_dates = get_min_nocite()

    if courtdates:
        start_dates = get_courtdates()
    else:
        start_dates = None

    # check if skipping first columbias cases

    if skip_newcases:
        skiplist = get_path_list()
    else:
        skiplist = set()

    # start/resume functionality
    if startfolder is not None:
        skipfolder = True
    else:
        skipfolder = False
    if startfile is not None:
        skipfile = True
    else:
        skipfile = False

    for folder in folders:
        if skipfolder:
            if startfolder is not None:
                checkfolder = folder.split('/')[-1]
                if checkfolder == startfolder:
                    skipfolder = False
                else:
                    continue
        print(folder)

        for path in file_generator(folder, random_order, limit):

            if skipfile:
                if startfile is not None:
                    checkfile = path.split('/')[-1]
                    if checkfile == startfile:
                        skipfile = False
                    else:
                        continue

            if path in skiplist:
                continue

            # skip cases in 'misc*' folders -- they are relatively different
            # than the other cases, so we'll deal with them later
            if 'miscellaneous_court_opinions' in path:
                continue

            print(path)

            # try to parse/save the case and print any exceptions with full
            # tracebacks
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                # log the file name
                if log:
                    log.info(path)
                # print simple exception summaries for known problems
                known = [
                    'mismatched tag', 'Failed to get a citation',
                    'Failed to find a court ID',
                    'null value in column "date_filed"', 'duplicate(s)'
                ]
                if any(k in str(e) for k in known):
                    print
                    print "Known exception in file '%s':" % path
                    print str(e)
                    print
                else:
                    # otherwise, print generic traceback
                    print
                    print "Unknown exception in file '%s':" % path
                    print traceback.format_exc()
                    print
        # status update
        count += 1
        if count % status_interval == 0:
            print
            if total:
                print "Finished %s out of %s files." % (count, total)
            else:
                print "Finished %s files." % count
            print
Example #4
0
def do_many(dir_path, limit, random_order, status_interval, log_file, 
            newcases, skipdupes, skip_newcases, avoid_nocites, courtdates,
            startfolder, startfile, debug):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents].
    Parses each .xml document, instantiates the associated model object, and
    saves the object. Prints/logs status updates and tracebacks instead of
    raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run
    through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files in
    random order.
    :param status_interval: How often a status update will be given.
    :param log_file: If not None, file paths that raise Exceptions will be
    logged to this file.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.    
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case with no cite.
    :param courtdates: If true, skip cases with dates before court established.
    :param startfolder: If not None, start on startfolder
    :param startfile: If not None, start on this file (for resuming)
    """
    if limit:
        total = limit
    elif not random_order:
        print ("Getting an initial file count ...")
        print
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, '*.xml'))
    else:
        total = None
    log = None
    if log_file:
        print ("Logging problematic file paths to '%s' ..." % log_file)
        print
        log = logging.getLogger(__name__)
        log.setLevel(logging.INFO)
        log.addHandler(logging.FileHandler(log_file))
    # go through the files, yielding parsed files and printing status updates as
    # we go
    folders = glob(dir_path+'/*')
    folders.sort()
    count = 0

    # get earliest dates for each court
    if newcases:
        print('Only new cases: getting earliest dates by court.')
        min_dates = get_min_dates()
    else:
        min_dates = None
        
    if avoid_nocites:
        if newcases:
            raise Exception("Cannot use both avoid_nocites and newcases options.")
        print('Avoiding no cites: getting earliest dates by court with no citation.')
        min_dates = get_min_nocite()
        
    if courtdates:
        start_dates = get_courtdates()
    else:
        start_dates = None

    # check if skipping first columbias cases

    if skip_newcases:
        skiplist = get_path_list()
    else:
        skiplist = set()

    # start/resume functionality
    if startfolder is not None:
        skipfolder = True
    else:
        skipfolder = False
    if startfile is not None:
        skipfile = True
    else:
        skipfile = False

    for folder in folders:
        if skipfolder:
            if startfolder is not None:
                checkfolder = folder.split('/')[-1]
                if checkfolder == startfolder:
                    skipfolder = False
                else:
                    continue
        print(folder)

        for path in file_generator(folder, random_order, limit):

            if skipfile:
                if startfile is not None:
                    checkfile = path.split('/')[-1]
                    if checkfile == startfile:
                        skipfile = False
                    else:
                        continue

            if path in skiplist:
                continue

            # skip cases in 'misc*' folders -- they are relatively different
            # than the other cases, so we'll deal with them later
            if 'miscellaneous_court_opinions' in path:
                continue

            print(path)

            # try to parse/save the case and print any exceptions with full
            # tracebacks
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                # log the file name
                if log:
                    log.info(path)
                # print simple exception summaries for known problems
                known = [
                    'mismatched tag', 'Failed to get a citation',
                    'Failed to find a court ID',
                    'null value in column "date_filed"', 'duplicate(s)'
                ]
                if any(k in str(e) for k in known):
                    print
                    print "Known exception in file '%s':" % path
                    print str(e)
                    print
                else:
                    # otherwise, print generic traceback
                    print
                    print "Unknown exception in file '%s':" % path
                    print traceback.format_exc()
                    print
        # status update
        count += 1
        if count % status_interval == 0:
            print
            if total:
                print "Finished %s out of %s files." % (count, total)
            else:
                print "Finished %s files." % count
            print
Example #5
0
def do_many(
    dir_path,
    limit,
    random_order,
    status_interval,
    newcases,
    skipdupes,
    skip_newcases,
    avoid_nocites,
    courtdates,
    startfolder,
    startfile,
    debug,
):
    """Runs through a directory of the form /data/[state]/[sub]/.../[folders]/[.xml documents].
    Parses each .xml document, instantiates the associated model object, and
    saves the object. Prints/logs status updates and tracebacks instead of
    raising exceptions.

    :param dir_path: The directory.
    :param limit: A limit on how many files to run through. If None, will run
    through all (or if random order, forever).
    :param random_order: If true, will run through the directories and files in
    random order.
    :param status_interval: How often a status update will be given.
    :param newcases: If true, skip court-years that already have data.
    :param skipdupes: If true, skip duplicates.
    :param skip_newcases: If true, skip cases imported under newcases.
    :param avoid_nocites: If true, skip cases from dates after any case with no cite.
    :param courtdates: If true, skip cases with dates before court established.
    :param startfolder: If not None, start on startfolder
    :param startfile: If not None, start on this file (for resuming)
    :param debug: Passed through to make_and_save.
    """
    if limit:
        total = limit
    elif not random_order:
        # count the .xml files up front so status updates can report progress
        # against a known total
        logger.info("Getting an initial file count...")
        total = 0
        for _, _, file_names in os.walk(dir_path):
            total += len(fnmatch.filter(file_names, "*.xml"))
    else:
        total = None
    # go through the files, yielding parsed files and printing status updates as
    # we go
    folders = glob(f"{dir_path}/*")
    folders.sort()
    count = 0

    # get earliest dates for each court
    if newcases:
        logger.info("Only new cases: getting earliest dates by court.")
        min_dates = get_min_dates()
    else:
        min_dates = None

    if avoid_nocites:
        if newcases:
            raise Exception(
                "Cannot use both avoid_nocites and newcases options.")
        logger.info("Avoiding no cites: getting earliest dates by court with "
                    "no citation.")
        min_dates = get_min_nocite()

    if courtdates:
        start_dates = get_courtdates()
    else:
        start_dates = None

    # check if skipping first columbias cases

    if skip_newcases:
        skiplist = get_path_list()
    else:
        skiplist = set()

    # start/resume functionality: keep skipping until the named start
    # folder/file is reached
    skipfolder = startfolder is not None
    skipfile = startfile is not None

    for folder in folders:
        if skipfolder:
            if startfolder is not None:
                checkfolder = folder.split("/")[-1]
                if checkfolder == startfolder:
                    skipfolder = False
                else:
                    continue
        logger.debug(folder)

        for path in file_generator(folder, random_order, limit):

            if skipfile:
                if startfile is not None:
                    checkfile = path.split("/")[-1]
                    if checkfile == startfile:
                        skipfile = False
                    else:
                        continue

            if path in skiplist:
                continue

            # skip cases in 'misc*' folders -- they are relatively different
            # than the other cases, so we'll deal with them later
            if "miscellaneous_court_opinions" in path:
                continue

            logger.debug(path)

            # try to parse/save the case and show any exceptions with full
            # tracebacks
            try:
                parsed = parse_file(path)
                make_and_save(parsed, skipdupes, min_dates, start_dates, debug)
            except Exception as e:
                logger.info(path)
                # show simple exception summaries for known problems
                known = [
                    "mismatched tag",
                    "Failed to get a citation",
                    "Failed to find a court ID",
                    'null value in column "date_filed"',
                    "duplicate(s)",
                ]
                if any(k in str(e) for k in known):
                    logger.info(f"Known exception in file '{path}':")
                    logger.info(str(e))
                else:
                    logger.info(f"Unknown exception in file '{path}':")
                    logger.info(traceback.format_exc())
            # status update -- count per *file*, not per folder, so the
            # message agrees with the file total computed above
            count += 1
            if count % status_interval == 0:
                if total:
                    logger.info(f"Finished {count} out of {total} files.")
                else:
                    logger.info(f"Finished {count} files.")