Example #1
def test_citation_coverage(scotus):
    """
    Test the coverage of the citation parsing across all SCOTUS documents.
    """
    # Store outcomes
    outcome_data = []

    # Iterate over documents
    for file_name in scotus.scotus_document_list:
        # Get the document
        doc = read_xml_document(scotus.read_document(file_name))
        if doc:
            try:
                raw_value = get_text_lxml(doc.xpath(".//citation_line")[0])
                raw_value = raw_value.strip().replace("\r", "").replace("\n", "")
            except IndexError:
                raw_value = ""

            parsed_value = "; ".join(read_xml_citation_list(doc))
        else:
            raw_value = None
            parsed_value = None

        # Append to the outcome data
        outcome_data.append(
            (os.path.basename(file_name), raw_value, parsed_value))

    return outcome_data
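
A possible driver for the coverage test, sketched here as an assumption rather than taken from the original code: it writes the (file name, raw citation, parsed citation) tuples to a tab-separated file under SCRATCH_PATH/results/, mirroring the codecs-based output pattern used later in this module. The helper name and output file name are illustrative.

def write_citation_coverage(scotus, output_name="citation_coverage.csv"):
    """
    Run test_citation_coverage and dump the results for manual review.
    (Illustrative helper; output path/name are assumptions.)
    """
    # Collect (file name, raw citation line, parsed citation list) tuples
    outcome_data = test_citation_coverage(scotus)

    # Write one tab-separated row per document; None becomes an empty field
    output_file = codecs.open(
        os.path.join(SCRATCH_PATH, "results", output_name), 'w', 'utf8')
    for file_name, raw_value, parsed_value in outcome_data:
        output_file.write(u"\t".join([file_name,
                                      raw_value or "",
                                      parsed_value or ""]) + "\n")
    output_file.close()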
Example #2
def test_sentence_parser(scotus):
    """
    Test handling of sentence parsing across a random subset of 
    SCOTUS documents.
    """
    # Choose the document sample size
    num_sample = min(20, len(scotus.scotus_document_list))

    # Get the shuffled document list with hard-coded seed
    document_list = scotus.scotus_document_list
    random.seed(0)
    random.shuffle(document_list)

    # Store sentences
    tree_list = []

    # Iterate over the sampled documents
    for file_name in document_list[0:num_sample]:
        # Get the document and tokenize
        doc = read_xml_document(scotus.read_document(file_name))
        file_sentences = [sent.strip().replace("\n", "").replace("\r", "")
                          for sent in sentence_tokenizer
                          .sentences_from_text(read_xml_opinion(doc))]

        # TODO: Decide if we want to batch parse with more RAM.
        num_sentences = min(20, len(file_sentences))
        random.seed(0)
        random.shuffle(file_sentences)
        file_trees = sentence_parser.raw_parse_sents(
            file_sentences[0:num_sentences])
        tree_list.extend([(file_name, str(t)) for t in file_trees])

    return tree_list
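
The parser and tokenizer tests rely on module-level sentence_tokenizer and sentence_parser objects that are not defined in these snippets. One plausible setup, given the sentences_from_text and raw_parse_sents calls, is NLTK's Punkt tokenizer and Stanford parser wrapper; this is an assumption about the environment, not the project's actual configuration.

import nltk
from nltk.parse.stanford import StanfordParser

# Assumed setup: the Punkt sentence tokenizer exposes sentences_from_text()
sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# Assumed setup: the Stanford parser wrapper exposes raw_parse_sents();
# it expects the parser jars to be discoverable via the STANFORD_PARSER /
# STANFORD_MODELS environment variables or explicit path arguments
sentence_parser = StanfordParser()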
Example #3
def test_sentence_tokenizer(scotus):
    """
    Test handling of sentence tokenization across a random subset of 
    SCOTUS documents.
    """
    # Choose the document sample size
    num_sample = min(100, len(scotus.scotus_document_list))

    # Get the shuffled document list with hard-coded seed
    document_list = scotus.scotus_document_list
    random.seed(0)
    random.shuffle(document_list)

    # Store sentences
    sentence_list = []

    # Iterate over the sampled documents
    for file_name in document_list[0:num_sample]:
        # Get the document and tokenize
        doc = read_xml_document(scotus.read_document(file_name))
        file_sentences = [sent.strip().replace("\n", "").replace("\r", "")
                          for sent in sentence_tokenizer
                          .sentences_from_text(read_xml_opinion(doc))]
        sentence_list.extend([(file_name, sent) for sent in file_sentences])

    return sentence_list
    phrase_mapping = dict(zip(phrase_list, phrase_id))

    # Write the phrase mapping to disk
    phrase_mapping_file = codecs.open(
        os.path.join(SCRATCH_PATH, "results", "ws_phrase_mapping.csv"), 'w',
        'utf8')
    for phrase in phrase_mapping:
        phrase_mapping_file.write(
            u"\t".join([str(phrase_mapping[phrase]), ' '.join(phrase)]) + "\n")
    phrase_mapping_file.close()

    # Iterate over documents
    for file_name in scotus.scotus_document_list:
        # Get the XML document and sentence list
        try:
            doc = read_xml_document(scotus.read_document(file_name))
            sentence_list = [s.strip().replace("\n", "").replace("\r", "")
                             for s in sentence_tokenizer
                             .sentences_from_text(read_xml_opinion(doc))]
        except Exception as E:
            print(E)
            continue

        for sentence in sentence_list:
            # Process the sentences
            sentence_stems = tuple(process_sentence(sentence))

            # Inner loop
            for phrase in phrase_list:
                # Skip phrases that are too long
                if len(phrase) > len(sentence_stems):
    # Load the dataset
    scotus = SCOTUS()

    # Store the sentence list
    ws_list = []

    # Output to file
    sentence_file = codecs.open(os.path.join(SCRATCH_PATH, "results",
                                             "sentence_list.csv"),
                                'w', 'utf8')

    # Iterate over documents
    for file_name in scotus.scotus_document_list:
        # Get the XML document and sentence list
        try:
            doc = read_xml_document(scotus.read_document(file_name))
            sentence_list = [s.strip().replace("\n", "").replace("\r", "")
                             for s in sentence_tokenizer
                             .sentences_from_text(read_xml_opinion(doc))]
        except Exception as E:
            print(E)
            continue

        # Iterate over sentences and find matches
        for sentence in sentence_list:
            if "-established" in sentence.lower():
                ws_list.append((os.path.basename(file_name),
                                True,
                                sentence))
            elif "-settled" in sentence.lower():
                ws_list.append((os.path.basename(file_name),