def opinion_ids_to_names(opinion_ids: Iterable[str]) -> List[str]:
    """Map opinion resource IDs to their cluster case names.

    Args:
        opinion_ids: Opinion resource IDs to look up.

    Returns:
        Case names in the same order as the input, with "Unknown" for any
        ID that has no matching Opinion row.
    """
    op_names = []
    for op_id in opinion_ids:
        op_model = Opinion.select().where(
            Opinion.resource_id == op_id).first()
        # Missing rows degrade to a placeholder instead of raising.
        op_names.append(
            op_model.cluster.case_name if op_model is not None else "Unknown")
    # Bug fix: the original built the list but never returned it, so the
    # function always yielded None despite its List[str] annotation.
    return op_names
def join_to_clusters(
    base_citation_query: ModelSelect,
) -> Tuple[ModelSelect, ModelAlias, ModelAlias]:
    """Join a Citation query to Opinion and Cluster aliases on both sides.

    Args:
        base_citation_query: A select over the Citation model.

    Returns:
        A tuple of (joined query, citing Cluster alias, cited Cluster alias)
        so callers can reference columns from each side independently.
    """
    citing_opinion = Opinion.alias()
    cited_opinion = Opinion.alias()
    citing_cluster = Cluster.alias()
    cited_cluster = Cluster.alias()
    # Join order matters to peewee's join graph: Citation -> each Opinion
    # alias, then each Opinion alias -> its Cluster alias.
    query = base_citation_query.join_from(
        Citation, citing_opinion, on=Citation.citing_opinion)
    query = query.join_from(
        Citation, cited_opinion, on=Citation.cited_opinion)
    query = query.join_from(citing_opinion, citing_cluster)
    query = query.join_from(cited_opinion, cited_cluster)
    return query, citing_cluster, cited_cluster
def get_case_html(resource_id: int):
    """Return the stored HTML text for an opinion, aborting 404 if absent."""
    try:
        html_text = Opinion.get(resource_id=resource_id).html_text
        if not html_text:
            # An opinion row with no body text is treated the same as a
            # missing record.
            raise FileNotFoundError()
    except (Opinion.DoesNotExist, FileNotFoundError):
        abort(HTTPStatus.NOT_FOUND)
    else:
        return html_text
def search_cases(query, max_cases=25):
    """Full-text search over case names, ranked by citation count.

    Args:
        query: Raw user search string; normalized via CaseSearch.
        max_cases: Maximum number of opinions to return.

    Returns:
        A ModelSelect of up to max_cases opinions whose cluster name matches
        the query, each annotated with a ts_headline "headline" column.
    """
    ts_query = fn.to_tsquery(CaseSearch.prepare_query(query))
    headline = fn.ts_headline(
        Cluster.case_display_name(), ts_query).alias("headline")
    return (
        Opinion.select(Opinion, headline)
        .join(Cluster)
        .where(ts_match(Cluster.searchable_case_name, ts_query))
        .order_by(Cluster.citation_count.desc())
        .limit(max_cases)
    )
def nearest(resource_id):
    """Return the 50 nearest opinions to resource_id by embedding distance.

    Reads the module-level ``citing`` (resource IDs) and ``mat`` (one vector
    row per ID) arrays. Both scipy.spatial kd-trees take too much memory,
    so this does a brute-force nearest-neighbor search instead.

    Bug fix: the original subtracted the query vector from ``mat`` in place
    and then rebound the global ``mat`` to the 1-D distance array, corrupting
    the matrix for every subsequent call. Distances are now computed on a
    temporary, leaving the globals untouched.

    Args:
        resource_id: Resource ID of the query opinion (must be in citing).

    Returns:
        A list of 50 "rank: case name" strings, nearest first.
    """
    row = np.where(citing == resource_id)[0][0]
    distances = np.linalg.norm(mat - mat[row], axis=1)
    names = []
    for rank, idx in enumerate(np.argsort(distances)[:50]):
        opinion = Opinion.get(Opinion.resource_id == citing[idx])
        names.append("{}: {}".format(rank, opinion.cluster.case_name))
    return names
def get_case_clusters():
    """Spectrally cluster the requested cases.

    Reads "cases" (required, repeated) and "num_clusters" (optional) from the
    request query string.

    Returns:
        A dict of cluster name -> list of serialized opinion dicts, or a
        422 error tuple when no case IDs were supplied.
    """
    case_resource_ids = [int(c) for c in request.args.getlist("cases")]
    # 0 or absent means "let the clustering algorithm pick".
    num_clusters = int(request.args.get("num_clusters") or 0) or None
    if len(case_resource_ids) < 1:
        return "You must provide at least one case ID.", HTTPStatus.UNPROCESSABLE_ENTITY
    cluster_map = clustering.spectral_cluster(set(case_resource_ids),
                                              num_clusters=num_clusters)
    return {
        str(cluster_name): model_list_to_dicts(
            Opinion.select().where(Opinion.resource_id << opinion_ids))
        for cluster_name, opinion_ids in cluster_map.items()
    }
def ingest_opinion_data(opinions_dir):
    """Bulk-load opinion JSON files from opinions_dir into the Opinion table.

    Each *.json file is expected to contain "id", "resource_uri", and
    "cluster" keys; files that fail to parse are reported and skipped.

    Args:
        opinions_dir: Directory containing one JSON file per opinion.
    """
    opinion_records = []
    directory = os.fsencode(opinions_dir)
    for file in os.listdir(directory):
        try:
            filename = os.fsdecode(file)
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(opinions_dir, filename)
            with open(file_path, encoding="utf8") as json_file:
                opinion_data = json.load(json_file)
            cluster_uri = opinion_data["cluster"]
            # Cluster URIs end in ".../<id>/", so the numeric ID is the
            # second-to-last path segment.
            cluster_id = int(cluster_uri.split("/")[-2])
            opinion_records.append(Opinion(
                resource_id=opinion_data["id"],
                opinion_uri=opinion_data["resource_uri"],
                cluster_uri=cluster_uri,
                cluster=cluster_id,
            ))
        except Exception as e:
            # Bug fix: the bare "except:" also swallowed KeyboardInterrupt/
            # SystemExit and hid the reason the file was skipped.
            print(f"Failure on file {file}: {e}")
    with db.atomic():
        Opinion.bulk_create(opinion_records, batch_size=100)
def get_recommended_cases():
    """Return recommended opinions for the requested cases as JSON.

    Reads "cases" (required, repeated), "courts" (optional, repeated), and
    "max_cases" (optional, default 10) from the request query string.

    Returns:
        JSON for the recommended opinions, best match first, or a 422 error
        tuple when no case IDs were supplied.
    """
    case_resource_ids = frozenset(map(int, request.args.getlist("cases")))
    court_ids = frozenset(map(str, request.args.getlist("courts")))
    max_cases = int(request.args.get("max_cases") or 10)
    if len(case_resource_ids) < 1:
        return "You must provide at least one case ID.", HTTPStatus.UNPROCESSABLE_ENTITY
    recommendations = recommendation.recommendations(case_resource_ids,
                                                     max_cases,
                                                     courts=court_ids)
    matched_opinions = Opinion.select().join(Cluster).where(
        Opinion.resource_id << list(recommendations.keys()))
    # Present the strongest recommendations first.
    ranked_opinions = sorted(matched_opinions,
                             key=lambda op: recommendations[op.resource_id],
                             reverse=True)
    return model_list_to_json(ranked_opinions)
def ingest_citation_data(citations_file):
    """Bulk-load the citations CSV into the Citation table.

    Rows whose citing or cited opinion is not already present in the Opinion
    table are skipped; malformed rows are reported and skipped.

    Args:
        citations_file: Path to a CSV of (citing_id, cited_id, depth) rows.
    """
    # With only ~65,000 opinions it is feasible to hold every resource ID in
    # memory, avoiding millions of per-row existence queries against the DB.
    opinion_set = {o.resource_id for o in Opinion.select()}
    citation_records = []
    with open(citations_file) as csv_file:
        for row in csv.reader(csv_file, delimiter=","):
            try:
                values = [int(cell) for cell in row]
                if values[0] in opinion_set and values[1] in opinion_set:
                    citation_records.append(Citation(
                        citing_opinion=values[0],
                        cited_opinion=values[1],
                        depth=values[2],
                    ))
            except Exception as e:
                print(f"Failure on row {row}: {e}")
    with db.atomic():
        Citation.bulk_create(citation_records, batch_size=100)
import networkx as nx

from db.peewee.models import db, Opinion
from graph.citation_network import CitationNetwork

# Rank every opinion in the citation network by eigenvector centrality and
# print the top 100 with their resource IDs and case names.
citation_graph = CitationNetwork.construct_network()
centrality = nx.eigenvector_centrality_numpy(citation_graph)
top_opinions = [
    opinion_id
    for opinion_id, _centrality_score in sorted(
        centrality.items(), key=lambda item: item[1], reverse=True)
][:100]

db.connect()
output_str = ""
for i, opinion_id in enumerate(top_opinions):
    try:
        opinion = Opinion.get(Opinion.resource_id == opinion_id)
        output_str += f"{i + 1}: {opinion.resource_id}, {opinion.cluster.case_name}\n"
    except Opinion.DoesNotExist:
        # Bug fix: the original bare "except: pass" silently hid *every*
        # error (including KeyboardInterrupt); only graph nodes without a
        # matching DB row are skipped now.
        pass
print(output_str)
def get_case(resource_id: int):
    """Serialize the opinion with the given resource ID, aborting 404 if absent."""
    try:
        opinion = Opinion.get(resource_id=resource_id)
    except Opinion.DoesNotExist:
        abort(HTTPStatus.NOT_FOUND)
    else:
        return model_to_dict(opinion, **DEFAULT_SERIALIZATION_ARGS)