def get_issues():
    """Collect the legal issues attached to every case in the dataset.

    Walks every case XML file (grouped by year) under the module-level
    ``data_dir``, reads its comma-separated ``<issues>`` element, and
    interns each distinct issue string into the module-level ``issue_id``
    and ``reverse_issue_id`` dicts (side effect: both dicts are mutated).

    Returns:
        (issues_by_case, n_issues) where ``issues_by_case`` maps a case id
        to the list of integer issue ids found in that case's XML, and
        ``n_issues`` is the count of distinct issues assigned here.

    NOTE(review): ``n_issues`` starts at 0 even though ``issue_id`` is a
    module-level dict — if it is ever pre-populated before this call, new
    ids would collide with existing ones. Confirm callers always start
    with an empty ``issue_id``.
    """
    n_issues = 0
    issues_by_case = defaultdict(list)
    meta_file = util_sc.init_metadata(data_dir)
    case_file_by_year = util_sc.get_case_from_metafile(meta_file, data_dir)
    for year, current_cases in case_file_by_year.items():
        for case in current_cases:
            # Case files live at <data_dir>/<year>/<case>.xml
            abs_case_file_path = data_dir + '/' + year + '/' + str(case) + '.xml'
            tree = et.ElementTree(file=abs_case_file_path)
            # NOTE(review): assumes every case XML contains an <issues>
            # element with text; a missing element would raise
            # AttributeError here — confirm against the data format.
            for raw_issue in tree.find("issues").text.split(','):
                issue = raw_issue.strip().lower()
                if issue not in issue_id:
                    # First sighting: assign the next integer id.
                    issue_id[issue] = n_issues
                    reverse_issue_id[n_issues] = issue
                    n_issues += 1
                issues_by_case[case].append(issue_id[issue])

    return issues_by_case, n_issues
def build_dataset(n_tokens, n_topics):
    meta_file = util_sc.init_metadata(data_dir)
    case_file_by_year = util_sc.get_case_from_metafile(meta_file, data_dir)