Example No. 1
    def make_citation_data(tmp_destination):
        """Because citations are paginated and because as of this moment there
        are 11M citations in the database, we cannot provide users with a bulk
        data file containing the complete objects for every citation.

        Instead of doing that, we dump our citation table with a shell command,
        which provides people with compact and reasonable data they can import.
        """
        mkdir_p(tmp_destination)

        logger.info('   - Copying the citations table to disk...')

        # This command calls the psql COPY command and requests that it dump
        # the citation table to disk as a compressed CSV.
        default_db = settings.DATABASES['default']
        os.system(
            '''PGPASSWORD="{password}" psql -c "COPY \\"search_opinionscited\\" (citing_opinion_id, cited_opinion_id) to stdout DELIMITER ',' CSV HEADER" --host {host} --dbname {database} --username {username} | gzip > {destination}'''
            .format(
                password=default_db['PASSWORD'],
                host=default_db['HOST'],
                database=default_db['NAME'],
                username=default_db['USER'],
                destination=join(tmp_destination, 'all.csv.gz'),
            ))
        logger.info('   - Table created successfully.')
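These examples lean on a small mkdir_p() helper that is not shown on this page. As a rough sketch of what it presumably does (an assumption, not the project's actual implementation), it behaves like the shell's mkdir -p: build the directory tree and ignore the error if it already exists.

import errno
import os


def mkdir_p(path):
    """Hypothetical sketch of the helper: create `path` like `mkdir -p`."""
    try:
        os.makedirs(path)
    except OSError as e:
        # Ignore "already exists"; re-raise anything else (e.g. permission errors).
        if e.errno != errno.EEXIST:
            raise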
Example No. 2
def cp_pr_file_to_bulk_dir(result_file_path, chown):
    """Copy the pagerank file to the bulk data directory for public analysis.
    """
    mkdir_p(settings.BULK_DATA_DIR)  # The dir doesn't always already exist.
    shutil.copy(result_file_path, settings.BULK_DATA_DIR)
    if chown:
        user_info = pwd.getpwnam('www-data')
        os.chown(
            settings.BULK_DATA_DIR + 'external_pagerank',
            user_info.pw_uid,
            user_info.pw_gid,
        )
Example No. 4
def get_from_ia(reporter, volume):
    """
    Download cases from internet archive via case law and write them to
    disk.

    :param reporter: (str) Requires a reporter abbreviation to identify
    cases to download as used by IA.  (Ex. T.C. => tc)
    :param volume: (int) Specific volume number of the reporter.  If blank
    function will cycle through all volumes of the reporter on IA.
    :return: None
    """

    reporter_key = ".".join(['law.free.cap', reporter])

    # Checks that the returned reporter is the requested one.
    # Ex. searching for Mich will return both Mich-app. and Mich.
    for ia_identifier in search_items(reporter_key):
        ia_key = ia_identifier['identifier']
        if ia_key.split(".")[3] != reporter:
            continue

        # Checks if we requested a specific volume of the
        # reporter and if so skips all other volumes of that reporter
        ia_volume = ia_key.split(".")[-1]
        if volume is not None:
            if volume != ia_volume:
                continue

        for item in get_files(ia_key):
            if "json.json" in item.name:
                continue

            if "json" in item.name:
                url = "https://archive.org/download/%s/%s" % (
                    ia_key, item.name)
                file_path = os.path.join(settings.MEDIA_ROOT,
                                         'harvard_corpus',
                                         '%s' % ia_key,
                                         '%s' % item.name,
                                         )
                directory = file_path.rsplit("/", 1)[0]
                if os.path.exists(file_path):
                    logger.info("Already captured: %s", url)
                    continue

                logger.info("Capturing: %s", url)
                mkdir_p(directory)
                data = requests.get(url, timeout=10).json()
                with open(file_path, 'w') as outfile:
                    json.dump(data, outfile, indent=2)
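The reporter and volume checks above rely on the shape of the Internet Archive identifier. A hypothetical identifier (assumed to follow the "law.free.cap.<reporter>.<volume>" pattern the code expects) makes the two split(".") lookups easier to follow:

ia_key = "law.free.cap.tc.100"         # hypothetical identifier, not a real item

reporter = ia_key.split(".")[3]        # "tc"  -> compared against the requested reporter
ia_volume = ia_key.split(".")[-1]      # "100" -> compared against the requested volume

Note that ia_volume is a string, so the volume != ia_volume comparison only matches when the caller passes the volume as a string as well.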
Example No. 5
def swap_archives(obj_type_str, bulk_dir, tmp_bulk_dir):
    """Swap out new archives, clobbering the old, if present"""
    tmp_gz_dir = join(tmp_bulk_dir, obj_type_str)
    final_gz_dir = join(bulk_dir, obj_type_str)
    mkdir_p(final_gz_dir)
    for f in glob.glob(join(tmp_gz_dir, '*.tar*')):
        shutil.move(f, join(final_gz_dir, os.path.basename(f)))

    # Move the info files too.
    try:
        shutil.copy2(join(tmp_gz_dir, 'info.json'),
                     join(final_gz_dir, 'info.json'))
    except IOError as e:
        if e.errno == 2:
            # No such file/directory
            pass
        else:
            raise
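The e.errno == 2 test compares against a magic number. An equivalent, slightly more readable variant (sketched here as an alternative, not the project's code) spells out the named constant:

import errno

try:
    shutil.copy2(join(tmp_gz_dir, 'info.json'),
                 join(final_gz_dir, 'info.json'))
except IOError as e:
    if e.errno != errno.ENOENT:  # ENOENT == 2, "no such file or directory"
        raise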
Example No. 6
def swap_archives(obj_type_str, bulk_dir, tmp_bulk_dir):
    """Swap out new archives, clobbering the old, if present"""
    tmp_gz_dir = join(tmp_bulk_dir, obj_type_str)
    final_gz_dir = join(bulk_dir, obj_type_str)
    mkdir_p(final_gz_dir)
    for f in glob.glob(join(tmp_gz_dir, "*.tar*")):
        shutil.move(f, join(final_gz_dir, os.path.basename(f)))

    # Move the info files too.
    try:
        shutil.copy2(join(tmp_gz_dir, "info.json"),
                     join(final_gz_dir, "info.json"))
    except IOError as e:
        if e.errno == 2:
            # No such file/directory
            pass
        else:
            raise
Example No. 7
def swap_archives(obj_type_str):
    """Swap out new archives, clobbering the old, if present"""
    mkdir_p(join(settings.BULK_DATA_DIR, obj_type_str))
    path_to_gz_files = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                            '*.tar*')
    for f in glob.glob(path_to_gz_files):
        shutil.move(
            f, join(settings.BULK_DATA_DIR, obj_type_str, os.path.basename(f)))

    # Move the info files too.
    try:
        shutil.copy2(
            join(settings.BULK_DATA_DIR, 'tmp', obj_type_str, 'info.json'),
            join(settings.BULK_DATA_DIR, obj_type_str, 'info.json'))
    except IOError as e:
        if e.errno == 2:
            # No such file/directory
            pass
        else:
            raise
Example No. 8
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ops = queryset_generator(
            Opinion.objects.exclude(
                Q(html="") | Q(html=None),
                Q(html_lawbox="") | Q(html_lawbox=None),
                Q(html_columbia="") | Q(html_columbia=None),
            ))

        for op in ops:
            content = render_to_string("simple_opinion.html", {"o": op})
            output_dir = os.path.join(
                options["output_directory"],
                str(op.cluster.date_filed.year),
                str(op.cluster.date_filed.month),
                str(op.cluster.date_filed.day),
            )
            mkdir_p(output_dir)
            output_path = os.path.join(output_dir, f"{op.pk}.html")
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(content)
Example No. 9
    def make_citation_data(tmp_destination, obj_type_str):
        """Because citations are paginated and because as of this moment there
        are 11M citations in the database, we cannot provide users with a bulk
        data file containing the complete objects for every citation.

        Instead of doing that, we dump our citation table with a shell command,
        which provides people with compact and reasonable data they can import.
        """
        mkdir_p(tmp_destination)

        print('   - Copying the citations table to disk...')

        # This command calls the psql COPY command and requests that it dump
        # the citation table to disk as a compressed CSV.
        os.system(
            '''PGPASSWORD="{password}" psql -c "COPY \\"search_opinionscited\\" (citing_opinion_id, cited_opinion_id) to stdout DELIMITER ',' CSV HEADER" -d {database} --username {username} | gzip > {destination}'''.format(
                password=settings.DATABASES['default']['PASSWORD'],
                database=settings.DATABASES['default']['NAME'],
                username=settings.DATABASES['default']['USER'],
                destination=join(tmp_destination, 'all.csv.gz'),
            )
        )
        print('   - Table created successfully.')
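The gzipped CSV this writes can be consumed with nothing but the standard library; the column names come from the COPY statement above. A minimal, hypothetical reader (the file path is illustrative):

import csv
import gzip

with gzip.open("all.csv.gz", "rt") as f:   # "rt" = read the compressed stream as text
    for row in csv.DictReader(f):          # COPY ... CSV HEADER wrote a header row
        citing = row["citing_opinion_id"]
        cited = row["cited_opinion_id"]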
Example No. 10
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ops = queryset_generator(Opinion.objects.exclude(
            Q(html='') | Q(html=None),
            Q(html_lawbox='') | Q(html_lawbox=None),
            Q(html_columbia='') | Q(html_columbia=None),
        ))

        for op in ops:
            content = render_to_string('simple_opinion.html', {
                'o': op,
            })
            output_dir = os.path.join(
                options['output_directory'],
                str(op.cluster.date_filed.year),
                str(op.cluster.date_filed.month),
                str(op.cluster.date_filed.day),
            )
            mkdir_p(output_dir)
            output_path = os.path.join(output_dir, '%s.html' % op.pk)
            with open(output_path, 'w') as f:
                f.write(content.encode('utf-8'))
Example No. 11
def swap_archives(obj_type_str):
    """Swap out new archives, clobbering the old, if present"""
    mkdir_p(join(settings.BULK_DATA_DIR, obj_type_str))
    path_to_gz_files = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                            '*.tar*')
    for f in glob.glob(path_to_gz_files):
        shutil.move(
            f,
            join(settings.BULK_DATA_DIR, obj_type_str, os.path.basename(f))
        )

    # Move the info files too.
    try:
        shutil.copy2(
            join(settings.BULK_DATA_DIR, 'tmp', obj_type_str, 'info.json'),
            join(settings.BULK_DATA_DIR, obj_type_str, 'info.json')
        )
    except IOError as e:
        if e.errno == 2:
            # No such file/directory
            pass
        else:
            raise
Example No. 12
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        ops = queryset_generator(
            Opinion.objects.exclude(
                Q(html='') | Q(html=None),
                Q(html_lawbox='') | Q(html_lawbox=None),
                Q(html_columbia='') | Q(html_columbia=None),
            ))

        for op in ops:
            content = render_to_string('simple_opinion.html', {
                'o': op,
            })
            output_dir = os.path.join(
                options['output_directory'],
                str(op.cluster.date_filed.year),
                str(op.cluster.date_filed.month),
                str(op.cluster.date_filed.day),
            )
            mkdir_p(output_dir)
            output_path = os.path.join(output_dir, '%s.html' % op.pk)
            with open(output_path, 'w') as f:
                f.write(content.encode('utf-8'))
Example No. 13
def get_from_ia(reporter, volume):
    """
    Download cases from internet archive via case law and write them to
    disk.

    :param reporter: (str) Requires a reporter abbreviation to identify
    cases to download as used by IA.  (Ex. T.C. => tc)
    :param volume: (int) Specific volume number of the reporter.  If blank
    function will cycle through all volumes of the reporter on IA.
    :return: None
    """

    logger.info("Creating IA session...")
    access_key = settings.IA_ACCESS_KEY
    secret_key = settings.IA_SECRET_KEY
    ia_session = ia.get_session(
        {"s3": {
            "access": access_key,
            "secret": secret_key,
        }})

    reporter_key = ".".join(["law.free.cap", reporter])

    # Checks that the returned reporter is the requested one.
    # Ex. searching for Mich will return both Mich-app. and Mich.
    for ia_identifier in ia_session.search_items(reporter_key):
        logger.info("Got ia identifier: %s" % ia_identifier)
        ia_key = ia_identifier["identifier"]
        if ia_key.split(".")[3] != reporter:
            continue

        # Checks if we requested a specific volume of the
        # reporter and if so skips all other volumes of that reporter
        ia_volume = ia_key.split(".")[-1]
        if volume is not None:
            if volume != ia_volume:
                continue

        ia_item = ia_session.get_item(ia_key)
        for item in ia_item.get_files():
            logger.info("Got item with name: %s" % item.name)
            if "json.json" in item.name:
                continue

            if "json" not in item.name:
                continue

            url = "https://archive.org/download/%s/%s" % (
                ia_key,
                item.name,
            )
            file_path = os.path.join(
                settings.MEDIA_ROOT,
                "harvard_corpus",
                "%s" % ia_key,
                "%s" % item.name,
            )
            directory = file_path.rsplit("/", 1)[0]
            if os.path.exists(file_path):
                logger.info("Already captured: %s", url)
                continue

            logger.info("Capturing: %s", url)
            mkdir_p(directory)
            data = requests.get(url, timeout=10).json()
            with open(file_path, "w") as outfile:
                json.dump(data, outfile, indent=2)
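The download loop assumes every request succeeds and returns JSON. A small variation (an illustration, not the project's code) that fails loudly on HTTP errors before parsing would replace the last few lines of the loop:

response = requests.get(url, timeout=10)
response.raise_for_status()          # raise requests.HTTPError on 4xx/5xx responses
data = response.json()
with open(file_path, "w") as outfile:
    json.dump(data, outfile, indent=2)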
Example No. 14
    def handle(self, *args, **options):
        courts = Court.objects.all()

        # Make the main bulk files
        kwargs_list = [
            {
                'obj_type_str': 'clusters',
                'obj_type': OpinionCluster,
                'court_attr': 'docket.court_id',
                'serializer': OpinionClusterSerializer,
            },
            {
                'obj_type_str': 'opinions',
                'obj_type': Opinion,
                'court_attr': 'cluster.docket.court_id',
                'serializer': OpinionSerializer,
            },
            {
                'obj_type_str': 'dockets',
                'obj_type': Docket,
                'court_attr': 'court_id',
                'serializer': DocketSerializer,
            },
            {
                'obj_type_str': 'courts',
                'obj_type': Court,
                'court_attr': None,
                'serializer': CourtSerializer,
            },
            {
                'obj_type_str': 'audio',
                'obj_type': Audio,
                'court_attr': 'docket.court_id',
                'serializer': AudioSerializer,
            },
            {
                'obj_type_str': 'people',
                'obj_type': Person,
                'court_attr': None,
                'serializer': PersonSerializer,
            },
            {
                'obj_type_str': 'schools',
                'obj_type': School,
                'court_attr': None,
                'serializer': SchoolSerializer,
            },
            {
                'obj_type_str': 'positions',
                'obj_type': Position,
                'court_attr': None,
                'serializer': PositionSerializer,
            },
            {
                'obj_type_str': 'retention-events',
                'obj_type': RetentionEvent,
                'court_attr': None,
                'serializer': RetentionEventSerializer,
            },
            {
                'obj_type_str': 'educations',
                'obj_type': Education,
                'court_attr': None,
                'serializer': EducationSerializer,
            },
            {
                'obj_type_str': 'politicial-affiliations',
                'obj_type': PoliticalAffiliation,
                'court_attr': None,
                'serializer': PoliticalAffiliationSerializer,
            },
        ]

        print('Starting bulk file creation with %s celery tasks...' %
              len(kwargs_list))
        for kwargs in kwargs_list:
            make_bulk_data_and_swap_it_in.delay(courts, kwargs)

        # Make the citation bulk data
        obj_type_str = 'citations'
        print(' - Creating bulk data CSV for citations...')
        tmp_destination = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
        final_destination = join(settings.BULK_DATA_DIR, obj_type_str)
        self.make_citation_data(tmp_destination, obj_type_str)
        print "   - Swapping in the new citation archives..."

        mkdir_p(join(settings.BULK_DATA_DIR, obj_type_str))
        shutil.move(
            join(tmp_destination, 'all.csv.gz'),
            join(final_destination, 'all.csv.gz'),
        )

        print('Done.\n')
Example No. 15
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr, serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(
                join(
                    settings.BULK_DATA_DIR,
                    'tmp',
                    obj_type_str,
                    court.pk,
                ))

    if last_good_date is not None:
        print "   - Incremental data found. Assuming it's good and using it..."
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print "   - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print "   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META[
            'SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           deepgetattr(item, court_attr), '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print('   - %s %s json files created.' % (i, obj_type_str))

        history.mark_success_and_save()
        return i
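deepgetattr(item, court_attr) walks a dotted attribute path such as "docket.court_id". The helper is not shown on this page; a minimal sketch of what it presumably does (an assumption, not the project's implementation):

from functools import reduce


def deepgetattr(obj, attr_path):
    """Hypothetical sketch: resolve a dotted path, e.g. deepgetattr(cluster, 'docket.court_id')."""
    return reduce(getattr, attr_path.split('.'), obj)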
Example No. 16
    def handle(self, *args, **options):
        courts = Court.objects.all()

        # Make the main bulk files
        kwargs_list = [
            {
                'obj_type_str': 'clusters',
                'obj_type': OpinionCluster,
                'court_attr': 'docket.court_id',
                'serializer': OpinionClusterSerializer,
            },
            {
                'obj_type_str': 'opinions',
                'obj_type': Opinion,
                'court_attr': 'cluster.docket.court_id',
                'serializer': OpinionSerializer,
            },
            {
                'obj_type_str': 'dockets',
                'obj_type': Docket,
                'court_attr': 'court_id',
                'serializer': DocketSerializer,
            },
            {
                'obj_type_str': 'courts',
                'obj_type': Court,
                'court_attr': None,
                'serializer': CourtSerializer,
            },
            {
                'obj_type_str': 'audio',
                'obj_type': Audio,
                'court_attr': 'docket.court_id',
                'serializer': AudioSerializer,
            },
            # has_beta_api_access
            # {
            #     'obj_type_str': 'judges',
            #     'obj_type': Judge,
            #     'court_attr': None,
            #     'serializer': JudgeSerializer,
            # },
            # {
            #     'obj_type_str': 'positions',
            #     'obj_type': Position,
            #     'court_attr': None,
            #     'serializer': PositionSerializer,
            # },
            # {
            #     'obj_type_str': 'politicians',
            #     'obj_type': Politician,
            #     'court_attr': None,
            #     'serializer': PoliticianSerializer,
            # },
            # {
            #     'obj_type_str': 'retention-events',
            #     'obj_type': RetentionEvent,
            #     'court_attr': None,
            #     'serializer': RetentionEventSerializer,
            # },
            # {
            #     'obj_type_str': 'educations',
            #     'obj_type': Education,
            #     'court_attr': None,
            #     'serializer': EducationSerializer,
            # },
            # {
            #     'obj_type_str': 'schools',
            #     'obj_type': School,
            #     'court_attr': None,
            #     'serializer': SchoolSerializer,
            # },
            # {
            #     'obj_type_str': 'careers',
            #     'obj_type': Career,
            #     'court_attr': None,
            #     'serializer': CareerSerializer,
            # },
            # {
            #     'obj_type_str': 'titles',
            #     'obj_type': Title,
            #     'court_attr': None,
            #     'serializer': TitleSerializer,
            # },
            # {
            #     'obj_type_str': 'politicial-affiliations',
            #     'obj_type': PoliticalAffiliation,
            #     'court_attr': None,
            #     'serializer': PoliticalAffiliationSerializer,
            # },
        ]

        print('Starting bulk file creation with %s celery tasks...' %
              len(kwargs_list))
        for kwargs in kwargs_list:
            make_bulk_data_and_swap_it_in.delay(courts, kwargs)

        # Make the citation bulk data
        obj_type_str = 'citations'
        print(' - Creating bulk data CSV for citations...')
        tmp_destination = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
        final_destination = join(settings.BULK_DATA_DIR, obj_type_str)
        self.make_citation_data(tmp_destination, obj_type_str)
        print "   - Swapping in the new citation archives..."

        mkdir_p(join(settings.BULK_DATA_DIR, obj_type_str))
        shutil.move(
            join(tmp_destination, 'all.csv.gz'),
            join(final_destination, 'all.csv.gz'),
        )

        print('Done.\n')
Example No. 17
    def save_to_disk(self):
        mkdir_p(self.path.rsplit('/', 1)[0])
        with open(self.path, 'w') as f:
            json.dump(self.json, f, indent=2)
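self.path.rsplit('/', 1)[0] keeps everything before the final slash, i.e. the containing directory. For POSIX-style paths like these, os.path.dirname() expresses the same thing (a hypothetical path is used for illustration):

import os

path = "/storage/bulk/clusters/info.json"               # hypothetical path
assert path.rsplit("/", 1)[0] == os.path.dirname(path)  # both yield "/storage/bulk/clusters"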
Example No. 18
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr,
                       serializer, bulk_dir):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    We deal with two kinds of bulk data. The first is jurisdiction-centric, in
    which we want to make bulk data for that particular jurisdiction, such as
    opinions or PACER data, or whatever. The second is non-jurisdiction-
    specific, like people or schools. For jurisdiction-specific data, we make
    jurisdiction directories to put the data into. Otherwise, we do not.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type of
    data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court attribute
    on an object. For example, on clusters, this is currently docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.

    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                bulk_dir,
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is not None:
        print("   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.META['SERVER_PORT'] = '443'  # Else, it's 80
        r.META['wsgi.url_scheme'] = 'https'  # Else, it's http.
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            if i % 1000 == 0:
                print("Completed %s items so far." % i)
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(bulk_dir, obj_type_str, deepgetattr(item, court_attr),
                           '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(bulk_dir, obj_type_str, '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print ('   - %s %s json files created.' % (i, obj_type_str))

        history.mark_success_and_save()
        return i
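queryset_generator(qs) exists so that millions of rows are never materialized at once; the branch above falls back to the raw queryset only for models such as Court whose primary keys are not integers. The helper is not shown on this page; a minimal sketch of the chunked-iteration pattern it presumably stands for (an assumption, not the project's implementation):

def queryset_generator(queryset, chunksize=1000):
    """Hypothetical sketch: yield rows in primary-key order, one bounded chunk at a time."""
    last_pk = 0
    queryset = queryset.order_by('pk')
    while True:
        chunk = list(queryset.filter(pk__gt=last_pk)[:chunksize])
        if not chunk:
            break
        for item in chunk:
            last_pk = item.pk
            yield item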
Example No. 19
    def save_to_disk(self):
        mkdir_p(self.path.rsplit('/', 1)[0])
        with open(self.path, 'w') as f:
            json.dump(self.json, f, indent=2)
Example No. 20
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        courts = Court.objects.all()

        kwargs_list = [
            {
                "obj_type_str": "clusters",
                "obj_class": OpinionCluster,
                "court_attr": "docket.court_id",
                "serializer": OpinionClusterSerializer,
            },
            {
                "obj_type_str": "opinions",
                "obj_class": Opinion,
                "court_attr": "cluster.docket.court_id",
                "serializer": OpinionSerializer,
            },
            {
                "obj_type_str": "dockets",
                "obj_class": Docket,
                "court_attr": "court_id",
                "serializer": DocketSerializer,
            },
            {
                "obj_type_str": "courts",
                "obj_class": Court,
                "court_attr": None,
                "serializer": CourtSerializer,
            },
            {
                "obj_type_str": "audio",
                "obj_class": Audio,
                "court_attr": "docket.court_id",
                "serializer": AudioSerializer,
            },
            {
                "obj_type_str": "people",
                "obj_class": Person,
                "court_attr": None,
                "serializer": PersonSerializer,
            },
            {
                "obj_type_str": "schools",
                "obj_class": School,
                "court_attr": None,
                "serializer": SchoolSerializer,
            },
            {
                "obj_type_str": "positions",
                "obj_class": Position,
                "court_attr": None,
                "serializer": PositionSerializer,
            },
            {
                "obj_type_str": "retention-events",
                "obj_class": RetentionEvent,
                "court_attr": None,
                "serializer": RetentionEventSerializer,
            },
            {
                "obj_type_str": "educations",
                "obj_class": Education,
                "court_attr": None,
                "serializer": EducationSerializer,
            },
            {
                "obj_type_str": "politicial-affiliations",
                "obj_class": PoliticalAffiliation,
                "court_attr": None,
                "serializer": PoliticalAffiliationSerializer,
            },
        ]

        logger.info(
            "Starting bulk file creation with %s celery tasks..."
            % len(kwargs_list)
        )
        for kwargs in kwargs_list:
            make_bulk_data_and_swap_it_in(
                courts, settings.BULK_DATA_DIR, kwargs
            )

        # Make the citation bulk data
        obj_type_str = "citations"
        logger.info(" - Creating bulk data CSV for citations...")
        tmp_destination = join(settings.BULK_DATA_DIR, "tmp", obj_type_str)
        final_destination = join(settings.BULK_DATA_DIR, obj_type_str)
        self.make_citation_data(tmp_destination)
        logger.info("   - Swapping in the new citation archives...")

        mkdir_p(final_destination)
        shutil.move(
            join(tmp_destination, "all.csv.gz"),
            join(final_destination, "all.csv.gz"),
        )

        logger.info("Done.\n")
Example No. 21
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr, serializer,
                       bulk_dir):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    We deal with two kinds of bulk data. The first is jurisdiction-centric, in
    which we want to make bulk data for that particular jurisdiction, such as
    opinions or PACER data, or whatever. The second is non-jurisdiction-
    specific, like people or schools. For jurisdiction-specific data, we make
    jurisdiction directories to put the data into. Otherwise, we do not.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type of
    data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court attribute
    on an object. For example, on clusters, this is currently docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.

    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                bulk_dir,
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is not None:
        print(
            "   - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print("   - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print("   - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META[
            "SERVER_NAME"] = "www.courtlistener.com"  # Else, it's testserver
        r.META["SERVER_PORT"] = "443"  # Else, it's 80
        r.META["wsgi.url_scheme"] = "https"  # Else, it's http.
        r.version = "v3"
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            if i % 1000 == 0:
                print("Completed %s items so far." % i)
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type="application/json; indent=2",
            )

            if court_attr is not None:
                loc = join(
                    bulk_dir,
                    obj_type_str,
                    deepgetattr(item, court_attr),
                    "%s.json" % item.pk,
                )
            else:
                # A non-jurisdiction-centric object.
                loc = join(bulk_dir, obj_type_str, "%s.json" % item.pk)

            with open(loc, "wb") as f:
                f.write(json_str)
            i += 1

        print("   - %s %s json files created." % (i, obj_type_str))

        history.mark_success_and_save()
        return i
Example No. 22
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)
        courts = Court.objects.all()

        kwargs_list = [
            {
                'obj_type_str': 'clusters',
                'obj_class': OpinionCluster,
                'court_attr': 'docket.court_id',
                'serializer': OpinionClusterSerializer,
            },
            {
                'obj_type_str': 'opinions',
                'obj_class': Opinion,
                'court_attr': 'cluster.docket.court_id',
                'serializer': OpinionSerializer,
            },
            {
                'obj_type_str': 'dockets',
                'obj_class': Docket,
                'court_attr': 'court_id',
                'serializer': DocketSerializer,
            },
            {
                'obj_type_str': 'courts',
                'obj_class': Court,
                'court_attr': None,
                'serializer': CourtSerializer,
            },
            {
                'obj_type_str': 'audio',
                'obj_class': Audio,
                'court_attr': 'docket.court_id',
                'serializer': AudioSerializer,
            },
            {
                'obj_type_str': 'people',
                'obj_class': Person,
                'court_attr': None,
                'serializer': PersonSerializer,
            },
            {
                'obj_type_str': 'schools',
                'obj_class': School,
                'court_attr': None,
                'serializer': SchoolSerializer,
            },
            {
                'obj_type_str': 'positions',
                'obj_class': Position,
                'court_attr': None,
                'serializer': PositionSerializer,
            },
            {
                'obj_type_str': 'retention-events',
                'obj_class': RetentionEvent,
                'court_attr': None,
                'serializer': RetentionEventSerializer,
            },
            {
                'obj_type_str': 'educations',
                'obj_class': Education,
                'court_attr': None,
                'serializer': EducationSerializer,
            },
            {
                'obj_type_str': 'politicial-affiliations',
                'obj_class': PoliticalAffiliation,
                'court_attr': None,
                'serializer': PoliticalAffiliationSerializer,
            },
        ]

        logger.info('Starting bulk file creation with %s celery tasks...' %
                    len(kwargs_list))
        for kwargs in kwargs_list:
            make_bulk_data_and_swap_it_in(courts, settings.BULK_DATA_DIR,
                                          kwargs)

        # Make the citation bulk data
        obj_type_str = 'citations'
        logger.info(' - Creating bulk data CSV for citations...')
        tmp_destination = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str)
        final_destination = join(settings.BULK_DATA_DIR, obj_type_str)
        self.make_citation_data(tmp_destination)
        logger.info("   - Swapping in the new citation archives...")

        mkdir_p(final_destination)
        shutil.move(
            join(tmp_destination, 'all.csv.gz'),
            join(final_destination, 'all.csv.gz'),
        )

        logger.info('Done.\n')
Example No. 23
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr,
                       serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified in the last 32 days because
    it's assumed that the bulk files are generated once per month.
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                settings.BULK_DATA_DIR,
                'tmp',
                obj_type_str,
                court.pk,
            ))

    if last_good_date is not None:
        print "   - Incremental data found. Assuming it's good and using it..."
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print "   - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print "   - No %s-type items in the DB or none that have changed. All done here." % obj_type_str
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           deepgetattr(item, court_attr), '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '%s.json' % item.pk)

            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print('   - %s %s json files created.' % (i, obj_type_str))

        history.mark_success_and_save()
        return i