Example No. 1
def fetch_relevance(hit, path, q, kwic=True, samples=3):
    length = 400
    text_snippet = []
    if len(hit.bytes) >= samples:
        byte_sample = sample(hit.bytes, samples)
    else:
        byte_sample = hit.bytes
    for byte in byte_sample: 
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True, kwic=kwic)
        conc_start = clean_text(conc_start, kwic=kwic)
        conc_end = clean_text(conc_end, kwic=kwic)
        if kwic:
            conc_middle = clean_text(conc_middle, notag=False, kwic=kwic)
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
            conc_text = align_text(conc_text, 1)
        else:
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    if kwic:
        text = '<br>\n'.join(text_snippet)
    else:
        text = '... '.join(text_snippet)
    return text
    
     
Example No. 2
def fetch_relevance(hit, path, q, kwic=True, samples=3):
    length = 400
    text_snippet = []
    if len(hit.bytes) >= samples:
        byte_sample = sample(hit.bytes, samples)
    else:
        byte_sample = hit.bytes
    for byte in byte_sample: 
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True, kwic=kwic)
        conc_start = clean_text(conc_start, kwic=kwic)
        conc_end = clean_text(conc_end, kwic=kwic)
        if kwic:
            conc_middle = clean_text(conc_middle, notag=False, kwic=kwic)
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
            conc_text = align_text(conc_text, 1)
        else:
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    if kwic:
        text = '<br>\n'.join(text_snippet)
    else:
        text = '... '.join(text_snippet)
    return text
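
Examples 1 and 2 are identical: each samples up to `samples` byte offsets per hit (falling back to all offsets when fewer are available), builds a cleaned snippet around each offset, and joins the snippets with `<br>` for KWIC display or with an ellipsis otherwise. The helpers they call (adjust_bytes, f.get_text, chunkifier, clean_text, align_text) are project-internal and not shown on this page. Below is a minimal, self-contained Python 3 sketch of just the sampling-and-joining skeleton; make_snippet is a hypothetical stand-in for that whole helper chain.

from random import sample

def make_snippet(text, offset, length=40):
    # crude stand-in: grab a window of characters around the hit offset
    start = max(0, offset - length // 2)
    return text[start:start + length].strip()

def fetch_relevance_sketch(text, hit_offsets, kwic=True, samples=3):
    # sample at most `samples` offsets, exactly as the examples above do
    offsets = sample(hit_offsets, samples) if len(hit_offsets) >= samples else hit_offsets
    snippets = [make_snippet(text, o) for o in offsets]
    # KWIC output is joined line by line, plain output with ellipses
    return '<br>\n'.join(snippets) if kwic else '... '.join(snippets)

if __name__ == '__main__':
    doc = "the quick brown fox jumps over the lazy dog " * 20
    print(fetch_relevance_sketch(doc, [10, 150, 400, 700], kwic=False))
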
Example No. 3
def fetch_collocation(results, path, q, filter_words=100):
    within_x_words = q['word_num']    
    
    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)

    line_count = 0
    filter_list = set([])

    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    
    ## start going though hits ##
    left_collocates = {}
    right_collocates = {}
    all_collocates = {}
    
    count = 0
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')
    
        for l_word in left_words:
            if l_word == q['q']:
                continue
            if l_word not in left_collocates:
                left_collocates[l_word] = 0
            left_collocates[l_word] += 1
            if l_word not in all_collocates:
                all_collocates[l_word] = 0
            all_collocates[l_word] += 1 

        for r_word in right_words:
            if r_word == q['q']:
                continue
            if r_word not in right_collocates:
                right_collocates[r_word] = 0
            if r_word not in all_collocates:
                all_collocates[r_word] = 0
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    left_out = sorted(left_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    right_out = sorted(right_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    all_out = sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)[:100]

    tuple_out = zip(all_out, left_out, right_out)
    return tuple_out
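
The tally in Example 3 initializes plain dicts by hand before incrementing. The following is a sketch (Python 3) of the same left/right/all counting with collections.Counter; window_words is a hypothetical stand-in for the tokenize() helper and only approximates it (keep the nearest within_x_words unfiltered words on each side of the hit).

from collections import Counter

def window_words(text, filter_list, within_x_words, direction):
    # hypothetical stand-in for tokenize(): keep the words closest to the hit
    words = [w for w in text.lower().split() if w not in filter_list]
    return words[-within_x_words:] if direction == 'left' else words[:within_x_words]

def tally_collocates(contexts, query_word, filter_list, within_x_words=5):
    left, right = Counter(), Counter()
    for conc_left, conc_right in contexts:
        for w in window_words(conc_left, filter_list, within_x_words, 'left'):
            if w != query_word:
                left[w] += 1
        for w in window_words(conc_right, filter_list, within_x_words, 'right'):
            if w != query_word:
                right[w] += 1
    return left + right, left, right   # all, left, right collocates

if __name__ == '__main__':
    contexts = [("the black cat sat on", "and purred at the dog"),
                ("a small black cat ran", "after the black dog")]
    all_c, left_c, right_c = tally_collocates(contexts, "cat", {"the", "a", "on", "at", "and"})
    print(all_c.most_common(3))
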
Example No. 4
def fetch_collocation(results, path, q, word_filter=True, filter_num=200, full_report=True):
    within_x_words = q['word_num']    
    
    ## set up filtering of most frequent 200 terms ##
    filter_list = set([])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        filter_words_file = open(filter_list_path)
        line_count = 0 
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break
    
    ## start going though hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    
    count = 0
    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        
        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')
        
        query_words = set([w.decode('utf-8') for w in q['q'].split('|')])
        
        for l_word in left_words:
            if l_word in query_words:
                continue
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1 

        for r_word in right_words:
            if r_word in query_words:
                continue
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1    

    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    else:
        return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)
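
Example 4 hinges on a small slicing detail: when full_report is false, colloc_start and colloc_end are cleared to None, and a slice with None bounds covers the whole sequence, so every hit is scanned; otherwise only the q-supplied chunk of the hitlist is processed. A quick illustration:

hits = list(range(10))
start, end = None, None
assert hits[start:end] == hits           # None bounds behave like no bounds at all
start, end = 2, 5
assert hits[start:end] == [2, 3, 4]      # an explicit window only covers that chunk
print("slice demo passed")
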
Example No. 5
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    byte_sample = hit.bytes[:samples]
    for byte in byte_sample: 
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True)
        conc_start = clean_text(conc_start)
        conc_end = clean_text(conc_end)
        conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text
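
Example 5 drops the random sampling and simply takes the first `samples` byte offsets with hit.bytes[:samples]. Slicing is forgiving when the list is shorter than requested, which is why no length check is needed here, whereas random.sample (used in Examples 1 and 2) raises ValueError when asked for more items than the population holds:

from random import sample

offsets = [120, 480]
print(offsets[:10])          # -> [120, 480]; a short list is returned as-is
try:
    sample(offsets, 10)
except ValueError as err:    # this is why Examples 1 and 2 check len(hit.bytes) first
    print("random.sample:", err)
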
Example No. 6
def fetch_colloc_concordance(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate'].decode('utf-8', 'ignore')
    collocate_num = q['collocate_num']
    
    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)

    line_count = 0
    filter_list = set([])

    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        if direction =='left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break
            
    
    return collocation_hitlist(new_hitlist, collocate_num)
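
Example 6 stops scanning as soon as it has gathered enough matching hits to fill the requested results page (q["start"] + q["results_per_page"]), which keeps the collocation concordance responsive on large hitlists; Example 7 below is the same function without that early exit. A small sketch of the pattern, with a hypothetical matches() predicate standing in for the collocate test against the tokenized context:

def filter_until_page_full(hits, matches, start, results_per_page):
    kept = []
    for hit in hits:
        if matches(hit):
            kept.append(hit)
        if len(kept) > start + results_per_page:
            break                # enough hits for this page; skip the rest
    return kept

if __name__ == '__main__':
    hits = list(range(1000))
    print(filter_until_page_full(hits, lambda h: h % 2 == 0, start=0, results_per_page=5))
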
Example No. 7
def fetch_colloc_concordance(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate']
    collocate_num = q['collocate_num']
    
    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)

    line_count = 0
    filter_list = set([])

    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break
    
    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        if direction =='left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
    
    return collocation_hitlist(new_hitlist, collocate_num)
Example No. 8
def fetch_collocation(results, path, q, filter_words=100):
    within_x_words = q['word_num']

    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)

    line_count = 0
    filter_list = set([])

    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    ## start going though hits ##
    left_collocates = {}
    right_collocates = {}
    all_collocates = {}

    count = 0
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        ## track hits whose left window came back empty (reported on stderr below) ##
        if not sum([len(i) for i in left_words]):
            count += 1
        right_words = tokenize(conc_right, filter_list, within_x_words,
                               'right')

        for l_word in left_words:
            if l_word == q['q']:
                continue
            if l_word not in left_collocates:
                left_collocates[l_word] = 0
            left_collocates[l_word] += 1
            if l_word not in all_collocates:
                all_collocates[l_word] = 0
            all_collocates[l_word] += 1

        for r_word in right_words:
            if r_word == q['q']:
                continue
            if r_word not in right_collocates:
                right_collocates[r_word] = 0
            if r_word not in all_collocates:
                all_collocates[r_word] = 0
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    left_out = sorted(left_collocates.items(),
                      key=lambda x: x[1],
                      reverse=True)
    right_out = sorted(right_collocates.items(),
                       key=lambda x: x[1],
                       reverse=True)
    all_out = sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)

    tuple_out = zip(all_out, left_out, right_out)
    print >> sys.stderr, "COUNT", count
    return tuple_out
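
Every collocation example above builds its stop-word set the same way: read the first N lines (100 or 200 here) of data/frequencies/word_frequencies and keep the first whitespace-separated field of each. A self-contained Python 3 sketch of that loader, assuming the same one-entry-per-line format (and reproducing the off-by-one of the originals, where the `>` check lets one extra word through):

def load_filter_list(filter_list_path, filter_words=100):
    filter_list = set()
    with open(filter_list_path, encoding='utf-8', errors='ignore') as frequencies:
        for line_count, line in enumerate(frequencies, start=1):
            if not line.strip():
                continue                       # skip blank lines defensively
            filter_list.add(line.split()[0])   # keep the word, drop its frequency
            if line_count > filter_words:      # note: admits filter_words + 1 entries
                break
    return filter_list

if __name__ == '__main__':
    import os, tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as tmp:
        tmp.write("the 15342\nof 9921\nand 8700\nto 8123\n")
    print(load_filter_list(tmp.name, filter_words=2))   # -> {'the', 'of', 'and'}
    os.unlink(tmp.name)
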