Пример #1
0
# from ..formulated_constants import include_toggled_phrases, include_toggled_pairs
import formulated_constants
import operator
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report
import re
import os
# http://www.albertauyeung.com/post/python-sequence-labelling-with-crf/

# Relative path; presumably the directory of annotated training documents --
# TODO confirm against the code that reads it (not visible in this excerpt).
TRAIN_FOLDER = '../annotated'

# Module-level aliases: these names refer to the very same objects held by
# formulated_constants (plain assignment, no copy is made).
cue_phrases = formulated_constants.categorical_phrases
cue_pairs = formulated_constants.categorical_pairs
# Extend the cue tables with letter-case variations of their entries.
# The return values are discarded, so these calls only matter through their
# side effects -- presumably in-place mutation of the objects passed in, which
# would also change formulated_constants' own attributes (verify in that module).
formulated_constants.include_toggled_phrases(cue_phrases)
formulated_constants.include_toggled_pairs(cue_pairs)

category_abbr = {
    "Argument":
    "A",
    "Fact":
    "F",
    "Issue":
    "I",
    "Ruling by lower court":
    "LR",
    "Ruling by the present court":
    "R",
    "Statute":
    "SS",
Пример #2
0
# selection is for summary, based on:
# 1. position of para in doc
# 2. position of para. in thematic segment
# 3. position of sentence in paragraph
# 4. tf-idf (position of word in document and corpus)
####

# Carry out thematic segmentation first
# filtering to eliminate unimportant quotations and noises
# selection of the candidate units
# production of table style summary <= Not relevant, only restrict limit, proper grammar

# Module-level aliases: these names refer to the very same objects held by
# formulated_constants (plain assignment, no copy is made).
cue_phrases = formulated_constants.categorical_phrases
cue_pairs = formulated_constants.categorical_pairs
# TODO: check these against Saravanan's constants.
# NOTE(review): include_toggled_phrases / include_toggled_pairs are called as
# bare names, so they must be imported by name somewhere outside this excerpt.
# Their return values are discarded, so the calls only matter through their
# side effects -- presumably in-place mutation of the objects passed in; confirm.
include_toggled_phrases(cue_phrases)
include_toggled_pairs(cue_pairs)

def LetSum(file):
	'''
		receives a html file, produce categories for each of them and returns a summary
	'''
	label = OrderedDict()
	scores = OrderedDict()
	lines = {}

	t, _ = crf_test.parse_html(file)
	
	# break sentences to phrases
	# for line in t:
	# 	print(line)