Пример #1
0
def _character_count_test(CIK, filing_year, new_data, corpus_file):
    """Regression-check the parser output against a stored corpus file.

    The alphanumeric character count of the joined ``new_data`` is compared
    with the count of the text stored in ``corpus_file``. The check passes
    when the absolute relative change is below
    ``Constants.REGRESSION_CHAR_COUNT_CHANGE_THRESHOLD``. A one-line summary
    is printed either way; on failure the two texts are handed to
    ``CorpusAccess.write_comparison_to_file``.

    Args:
        CIK: Identifier of the filer; only echoed in the report and passed
            through to the comparison writer.
        filing_year: Year of the filing; used the same way as ``CIK``.
        new_data: Iterable of strings produced by the parser.
        corpus_file: Path of the reference text file to compare against.
    """
    parser_alpha_numeric_count = Utilities.get_alpha_numeric_count(''.join(new_data))

    with open(corpus_file, 'r') as f:
        text_from_file = f.read()
    file_alpha_numeric_count = Utilities.get_alpha_numeric_count(text_from_file)

    if file_alpha_numeric_count == 0:
        # An empty reference cannot be a divisor: identical emptiness is "no
        # change", anything else is treated as an unbounded change.
        change = 0.0 if parser_alpha_numeric_count == 0 else float('inf')
    else:
        # Force true division; with integer counts Python 2's "/" floors the
        # ratio to 0 or -1, which makes the threshold test meaningless.
        # (assumes get_alpha_numeric_count returns a plain number)
        change = ((parser_alpha_numeric_count - file_alpha_numeric_count)
                  / float(file_alpha_numeric_count))
    result = abs(change) < Constants.REGRESSION_CHAR_COUNT_CHANGE_THRESHOLD

    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call. The double space after
    # "New Count" is intentional and keeps the report line format unchanged.
    print("CIK:%r, Year:%r, New Count:%r,  Corpus Count:%r, Passed:%r"
          % (CIK, filing_year, parser_alpha_numeric_count,
             file_alpha_numeric_count, result))

    if not result:
        CorpusAccess.write_comparison_to_file(new_data, text_from_file, CIK, filing_year)
Пример #2
0
def _transform_list_of_hits_into_result(recorder, record_header):
    """Join the recorded hits into one text and decide whether to keep it.

    The pieces in ``recorder`` are concatenated and passed through
    ``_cut_text_if_needed``. The text is discarded (``None`` is returned)
    when the header matches "SUBSEQUENT" (case-insensitive) but
    ``_does_section_mention_litigation`` rejects the text, or when the text
    has fewer than 200 alphanumeric characters. Otherwise the trimmed text
    is returned.
    """
    text = _cut_text_if_needed(''.join(recorder))

    header_is_subsequent = re.search("SUBSEQUENT", record_header, re.I)
    if header_is_subsequent and not _does_section_mention_litigation(text):
        return None

    if text is None:
        return None

    # Almost all genuine records have at least this many characters;
    # anything shorter is probably not something we want.
    if Utilities.get_alpha_numeric_count(text) < 200:
        return None

    return text
def _get_best_result(results):
    """Return the result with the smallest number of alphanumeric characters.

    Counts come from ``Utilities.get_alpha_numeric_count``. When several
    results share the smallest count, the first one encountered wins.

    Args:
        results: Iterable of candidate results.

    Returns:
        The candidate with the lowest count, or ``None`` when ``results``
        is empty.
    """
    best_result = None
    # None (not 0) marks "nothing seen yet": 0 is a legitimate count, and
    # using it as the sentinel would let a later, longer result overwrite a
    # zero-count minimum.
    best_count = None

    for candidate in results:
        count = Utilities.get_alpha_numeric_count(candidate)
        if best_count is None or count < best_count:
            best_count = count
            best_result = candidate

    return best_result