def handle(self, *args, **options):
    """Identify parallel citations and save them as requested.

    This process proceeds in two phases. The first phase is to work through
    the entire corpus, identifying citations that occur very near to each
    other. These are considered parallel citations, and they are built into a
    graph data structure where citations are nodes and each parallel citation
    is an edge. The weight of each edge is determined by the number of times a
    parallel citation has been identified between two citations. This should
    solve problems like typos or other issues with our heuristic approach.

    The second phase of this process is to update the database with the high
    quality citations. This can only be done by matching the citations with
    actual items in the database and then updating them with parallel
    citations that are sufficiently likely to be good.
    """
    super(Command, self).handle(*args, **options)
    no_option = not any([options.get('doc_id'), options.get('all')])
    if no_option:
        raise CommandError("Please specify if you want all items or a "
                           "specific item.")
    if not options['update_database']:
        logger.info(
            "--update_database is not set. No changes will be made to the "
            "database."
        )

    # Update Citation object to consider similar objects equal.
    self.monkey_patch_citation()

    logger.info("## Entering phase one: Building a network object of "
                "all citations.\n")
    q = Opinion.objects.all()
    if options.get('doc_id'):
        q = q.filter(pk__in=options['doc_id'])
    count = q.count()
    opinions = queryset_generator(q, chunksize=10000)

    node_count = edge_count = completed = 0
    subtasks = []
    for o in opinions:
        subtasks.append(
            # This will call the second function with the results from the
            # first.
            get_document_citations.s(o) | identify_parallel_citations.s()
        )
        last_item = (count == completed + 1)
        if (completed % 50 == 0) or last_item:
            job = group(subtasks)
            result = job.apply_async().join()
            for citation_groups in result:
                self.add_groups_to_network(citation_groups)
            subtasks = []
        completed += 1
        if completed % 250 == 0 or last_item:
            # Only do this once in a while.
            node_count = len(self.g.nodes())
            edge_count = len(self.g.edges())
            sys.stdout.write(
                "\r Completed %s of %s. (%s nodes, %s edges)" % (
                    completed,
                    count,
                    node_count,
                    edge_count,
                )
            )
            sys.stdout.flush()

    logger.info("\n\n## Entering phase two: Saving the best edges to "
                "the database.\n\n")
    for sub_graph in nx.connected_component_subgraphs(self.g):
        self.handle_subgraph(sub_graph, options)

    logger.info("\n\n## Done. Added %s new citations." % self.update_count)

    self.do_solr(options)
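

# Illustrative sketch only, not part of the command above: one way the
# weighted citation graph described in handle()'s docstring could be built
# with networkx. The real work happens in self.add_groups_to_network(), whose
# internals are not shown here; the hypothetical helper below treats each
# group of citations found near one another as a clique and increments an
# edge's weight every time the same pair of citations is seen again.
import itertools

import networkx as nx


def build_citation_network(citation_groups):
    """Return a graph whose nodes are citations (any hashable value, e.g.
    plain citation strings) and whose edge weights count how often two
    citations were found near each other, mirroring the structure described
    in handle().
    """
    g = nx.Graph()
    for citations in citation_groups:
        # Every pair of citations appearing in the same group gets an edge.
        for a, b in itertools.combinations(citations, 2):
            if g.has_edge(a, b):
                # Seen this parallel pair before; strengthen the edge.
                g[a][b]['weight'] += 1
            else:
                g.add_edge(a, b, weight=1)
    return g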