def test_citation_coverage(scotus):
    """
    Test the coverage of the citation parsing across all SCOTUS documents.
    """
    # Store outcomes
    outcome_data = []

    # Iterate over documents
    for file_name in scotus.scotus_document_list:
        # Get the document
        doc = read_xml_document(scotus.read_document(file_name))

        if doc:
            try:
                raw_value = get_text_lxml(doc.xpath(".//citation_line")[0])
                raw_value = raw_value.strip().replace("\r", "").replace("\n", "")
            except IndexError:
                raw_value = ""

            parsed_value = "; ".join(read_xml_citation_list(doc))
        else:
            raw_value = None
            parsed_value = None

        # Append to the outcome data
        outcome_data.append(
            (os.path.basename(file_name), raw_value, parsed_value))

    return outcome_data
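# Hypothetical helper (not in the original source): a minimal sketch of how
# the coverage tuples returned above might be dumped to a tab-separated file
# for manual review, assuming SCRATCH_PATH, os, and codecs are available as
# they are elsewhere in this module.
def write_citation_coverage_report(outcome_data):
    report_file = codecs.open(
        os.path.join(SCRATCH_PATH, "results", "citation_coverage.csv"),
        'w', 'utf8')
    for file_name, raw_value, parsed_value in outcome_data:
        report_file.write(u"\t".join([file_name,
                                      raw_value or u"",
                                      parsed_value or u""]) + u"\n")
    report_file.close()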
def test_sentence_parser(scotus):
    """
    Test handling of sentence parsing across a random subset of SCOTUS
    documents.
    """
    # Choose the document sample size
    num_sample = min(20, len(scotus.scotus_document_list))

    # Shuffle the document list in place with a hard-coded seed; note that
    # document_list aliases scotus.scotus_document_list, so the shuffle
    # reorders the underlying list as well.
    document_list = scotus.scotus_document_list
    random.seed(0)
    random.shuffle(document_list)

    # Store parse trees
    tree_list = []

    # Iterate over the sampled documents
    for file_name in scotus.scotus_document_list[0:num_sample]:
        # Get the document and tokenize
        doc = read_xml_document(scotus.read_document(file_name))
        file_sentences = [sent.strip().replace("\n", "").replace("\r", "")
                          for sent in sentence_tokenizer
                          .sentences_from_text(read_xml_opinion(doc))]

        # TODO: Decide if we want to batch parse with more RAM.
        num_sentences = min(20, len(file_sentences))
        random.seed(0)
        random.shuffle(file_sentences)

        file_trees = sentence_parser.raw_parse_sents(
            file_sentences[0:num_sentences])
        tree_list.extend([(file_name, str(t)) for t in file_trees])

    return tree_list
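# Hypothetical follow-up (not in the original source): the stringified trees
# returned above can be rehydrated for inspection with nltk.Tree; this
# assumes NLTK 3 is installed, which provides Tree.fromstring and
# Tree.pretty_print.
import nltk

def print_sample_trees(tree_list, num_trees=5):
    # Pretty-print the first few (file, tree) pairs for spot-checking
    for file_name, tree_string in tree_list[0:num_trees]:
        print(os.path.basename(file_name))
        nltk.Tree.fromstring(tree_string).pretty_print()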
def test_sentence_tokenizer(scotus):
    """
    Test handling of sentence tokenization across a random subset of SCOTUS
    documents.
    """
    # Choose the document sample size
    num_sample = min(100, len(scotus.scotus_document_list))

    # Shuffle the document list in place with a hard-coded seed; document_list
    # aliases scotus.scotus_document_list, so the shuffle reorders it as well.
    document_list = scotus.scotus_document_list
    random.seed(0)
    random.shuffle(document_list)

    # Store sentences
    sentence_list = []

    # Iterate over the sampled documents
    for file_name in scotus.scotus_document_list[0:num_sample]:
        # Get the document and tokenize
        doc = read_xml_document(scotus.read_document(file_name))
        file_sentences = [sent.strip().replace("\n", "").replace("\r", "")
                          for sent in sentence_tokenizer
                          .sentences_from_text(read_xml_opinion(doc))]
        sentence_list.extend([(file_name, sent) for sent in file_sentences])

    return sentence_list
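# Hypothetical follow-up (not in the original source): a quick sanity check
# on tokenizer output is to tally sentences per document from the returned
# (file, sentence) pairs, e.g. with collections.Counter.
import collections

def summarize_sentence_counts(sentence_list):
    counts = collections.Counter(file_name for file_name, _ in sentence_list)
    for file_name, count in counts.most_common():
        print("%s\t%d" % (os.path.basename(file_name), count))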
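# Hypothetical helper (not defined in this excerpt): the phrase-matching loop
# below calls process_sentence, which is assumed to lowercase, tokenize, and
# stem a sentence so phrases can be compared as stem tuples; a minimal sketch
# with NLTK's PorterStemmer might look like this.
import nltk

stemmer = nltk.stem.PorterStemmer()

def process_sentence(sentence):
    # Lowercase, tokenize, and stem each token
    return [stemmer.stem(token)
            for token in nltk.word_tokenize(sentence.lower())]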
phrase_mapping = dict(zip(phrase_list, phrase_id))

# Write the phrase mapping to disk
phrase_mapping_file = codecs.open(
    os.path.join(SCRATCH_PATH, "results", "ws_phrase_mapping.csv"),
    'w', 'utf8')
for phrase in phrase_mapping:
    phrase_mapping_file.write(
        u"\t".join([str(phrase_mapping[phrase]), ' '.join(phrase)]) + "\n")
phrase_mapping_file.close()

# Iterate over documents
for file_name in scotus.scotus_document_list:
    # Get the XML document and sentence list
    try:
        doc = read_xml_document(scotus.read_document(file_name))
        sentence_list = [s.strip().replace("\n", "").replace("\r", "")
                         for s in sentence_tokenizer
                         .sentences_from_text(read_xml_opinion(doc))]
    except Exception as e:
        print(e)
        continue

    for sentence in sentence_list:
        # Process the sentence into a tuple of stems
        sentence_stems = tuple(process_sentence(sentence))

        # Inner loop over candidate phrases
        for phrase in phrase_list:
            # Skip phrases that are too long to fit in the sentence
            if len(phrase) > len(sentence_stems):
                continue
# Load the dataset
scotus = SCOTUS()

# Store the sentence list
ws_list = []

# Output to file
sentence_file = codecs.open(
    os.path.join(SCRATCH_PATH, "results", "sentence_list.csv"),
    'w', 'utf8')

# Iterate over documents
for file_name in scotus.scotus_document_list:
    # Get the XML document and sentence list
    try:
        doc = read_xml_document(scotus.read_document(file_name))
        sentence_list = [s.strip().replace("\n", "").replace("\r", "")
                         for s in sentence_tokenizer
                         .sentences_from_text(read_xml_opinion(doc))]
    except Exception as e:
        print(e)
        continue

    # Iterate over sentences and find matches
    for sentence in sentence_list:
        if "-established" in sentence.lower():
            ws_list.append((os.path.basename(file_name),
                            True, sentence))
        elif "-settled" in sentence.lower():
            # False is assumed here to flag "-settled" matches, mirroring
            # the True flag on the "-established" branch above.
            ws_list.append((os.path.basename(file_name),
                            False, sentence))
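# Hypothetical continuation (not in the original source): a sketch of how the
# collected matches might be flushed to the sentence_file opened above, in
# the same tab-separated style used elsewhere in this module.
for file_name, is_established, sentence in ws_list:
    sentence_file.write(u"\t".join([file_name,
                                    str(is_established),
                                    sentence]) + u"\n")
sentence_file.close()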