## `sample` comes from the standard library; f, adjust_bytes, chunkifier, clean_text,
## align_text, tokenize and collocation_hitlist are assumed to be PhiloLogic helpers
## already in scope at module level.
from random import sample

def fetch_relevance(hit, path, q, kwic=True, samples=3):
    """Build a short relevance snippet for a hit: sample up to `samples` of the
    hit's byte offsets, pull the surrounding text for each, clean it up, and
    join the pieces (KWIC lines joined with <br>, plain snippets with '... ')."""
    length = 400
    text_snippet = []
    if len(hit.bytes) >= samples:
        byte_sample = sample(hit.bytes, samples)
    else:
        byte_sample = hit.bytes
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True, kwic=kwic)
        conc_start = clean_text(conc_start, kwic=kwic)
        conc_end = clean_text(conc_end, kwic=kwic)
        if kwic:
            conc_middle = clean_text(conc_middle, notag=False, kwic=kwic)
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
            conc_text = align_text(conc_text, 1)
        else:
            conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    if kwic:
        text = '<br>\n'.join(text_snippet)
    else:
        text = '... '.join(text_snippet)
    return text

def fetch_collocation(results, path, q, filter_words=100):
    """Tally left, right and combined collocates found within `word_num` words
    of each hit, skipping the most frequent corpus terms, and return the top
    100 of each list zipped together."""
    within_x_words = q['word_num']

    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    ## start going through hits ##
    left_collocates = {}
    right_collocates = {}
    all_collocates = {}

    count = 0
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')

        for l_word in left_words:
            if l_word == q['q']:
                continue
            if l_word not in left_collocates:
                left_collocates[l_word] = 0
            left_collocates[l_word] += 1
            if l_word not in all_collocates:
                all_collocates[l_word] = 0
            all_collocates[l_word] += 1

        for r_word in right_words:
            if r_word == q['q']:
                continue
            if r_word not in right_collocates:
                right_collocates[r_word] = 0
            if r_word not in all_collocates:
                all_collocates[r_word] = 0
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    left_out = sorted(left_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    right_out = sorted(right_collocates.items(), key=lambda x: x[1], reverse=True)[:100]
    all_out = sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)[:100]

    tuple_out = zip(all_out, left_out, right_out)
    return tuple_out

from collections import defaultdict

def fetch_collocation(results, path, q, word_filter=True, filter_num=200, full_report=True):
    """Collocation table built with defaultdicts: tally left, right and combined
    collocates within `word_num` words of each hit. With full_report, the hits are
    sliced by q['colloc_start']:q['colloc_end'] and the raw all/left/right frequency
    dicts are returned; otherwise the whole hitlist is scanned and all collocates
    come back sorted by descending frequency."""
    within_x_words = q['word_num']

    ## set up filtering of most frequent 200 terms ##
    filter_list = set([])
    if word_filter:
        filter_list_path = path + '/data/frequencies/word_frequencies'
        filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break

    ## start going through hits ##
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)

    count = 0
    if not full_report:
        q['colloc_start'] = None
        q['colloc_end'] = None
    for hit in results[q['colloc_start']:q['colloc_end']]:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')

        query_words = set([w.decode('utf-8') for w in q['q'].split('|')])

        for l_word in left_words:
            if l_word in query_words:
                continue
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1

        for r_word in right_words:
            if r_word in query_words:
                continue
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    if full_report:
        return dict(all_collocates), dict(left_collocates), dict(right_collocates)
    else:
        return sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)

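## A minimal, self-contained sketch of the counting pattern used by fetch_collocation
## above: given pre-tokenized left and right context windows for each hit, tally
## collocates into left/right/combined tables while skipping the query words. The
## toy windows below are hypothetical; the real code gets its windows from tokenize()
## over PhiloLogic concordance text.
def count_collocates(windows, query_words):
    from collections import defaultdict
    left_collocates = defaultdict(int)
    right_collocates = defaultdict(int)
    all_collocates = defaultdict(int)
    for left_words, right_words in windows:
        for l_word in left_words:
            if l_word in query_words:
                continue
            left_collocates[l_word] += 1
            all_collocates[l_word] += 1
        for r_word in right_words:
            if r_word in query_words:
                continue
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1
    return dict(all_collocates), dict(left_collocates), dict(right_collocates)

## hypothetical (left context, right context) pair per hit
toy_windows = [(['old', 'sea'], ['dog', 'barked']),
               (['the', 'sea'], ['dog', 'slept'])]
all_counts, left_counts, right_counts = count_collocates(toy_windows, set(['dog']))
## all_counts == {'old': 1, 'sea': 2, 'the': 1, 'barked': 1, 'slept': 1}
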
def fetch_relevance(hit, path, q, samples=10):
    length = 75
    text_snippet = []
    byte_sample = hit.bytes[:samples]
    for byte in byte_sample:
        byte = [int(byte)]
        bytes, byte_start = adjust_bytes(byte, length)
        conc_text = f.get_text(hit, byte_start, length, path)
        conc_start, conc_middle, conc_end = chunkifier(conc_text, bytes, highlight=True)
        conc_start = clean_text(conc_start)
        conc_end = clean_text(conc_end)
        conc_text = (conc_start + conc_middle + conc_end).decode('utf-8', 'ignore')
        text_snippet.append(conc_text)
    text = ' ... '.join(text_snippet)
    return text

def fetch_colloc_concordance(results, path, q, filter_words=100):
    """Keep only the hits whose left/right context (depending on `direction`)
    contains the selected collocate, record how often it occurs on each hit, and
    stop once a full page of results has been collected."""
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate'].decode('utf-8', 'ignore')
    collocate_num = q['collocate_num']

    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)

        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))

        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    return collocation_hitlist(new_hitlist, collocate_num)

def fetch_colloc_concordance(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate']
    collocate_num = q['collocate_num']

    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)

        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))

        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

    return collocation_hitlist(new_hitlist, collocate_num)

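## A minimal sketch of the filtering step shared by both fetch_colloc_concordance
## versions above: keep only the hits whose context window contains the chosen
## collocate, and record how many times it occurs on the hit. The Hit class and
## data here are hypothetical stand-ins for PhiloLogic hit objects and tokenized
## context windows.
class Hit(object):
    def __init__(self, words):
        self.words = words          # tokenized context window for this hit
        self.collocate_num = 0

def filter_by_collocate(hits, collocate):
    new_hitlist = []
    for hit in hits:
        if collocate in set(hit.words):
            hit.collocate_num = hit.words.count(collocate)
            new_hitlist.append(hit)
    return new_hitlist

toy_hits = [Hit(['sea', 'dog', 'dog']), Hit(['sea', 'cat'])]
kept = filter_by_collocate(toy_hits, 'dog')
## len(kept) == 1 and kept[0].collocate_num == 2
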
import sys

def fetch_collocation(results, path, q, filter_words=100):
    """Like the plain collocation table above, but also counts the hits whose
    left window comes back empty (reported to stderr as COUNT) and returns the
    full sorted left/right/combined lists without truncation."""
    within_x_words = q['word_num']

    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    ## start going through hits ##
    left_collocates = {}
    right_collocates = {}
    all_collocates = {}

    count = 0
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)

        left_words = tokenize(conc_left, filter_list, within_x_words, 'left')
        if not sum([len(i) for i in left_words]):
            count += 1
        right_words = tokenize(conc_right, filter_list, within_x_words, 'right')

        for l_word in left_words:
            if l_word == q['q']:
                continue
            if l_word not in left_collocates:
                left_collocates[l_word] = 0
            left_collocates[l_word] += 1
            if l_word not in all_collocates:
                all_collocates[l_word] = 0
            all_collocates[l_word] += 1

        for r_word in right_words:
            if r_word == q['q']:
                continue
            if r_word not in right_collocates:
                right_collocates[r_word] = 0
            if r_word not in all_collocates:
                all_collocates[r_word] = 0
            right_collocates[r_word] += 1
            all_collocates[r_word] += 1

    left_out = sorted(left_collocates.items(), key=lambda x: x[1], reverse=True)
    right_out = sorted(right_collocates.items(), key=lambda x: x[1], reverse=True)
    all_out = sorted(all_collocates.items(), key=lambda x: x[1], reverse=True)

    tuple_out = zip(all_out, left_out, right_out)
    print >> sys.stderr, "COUNT", count
    return tuple_out
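
## The report functions above all read their parameters from a query dict `q`.
## The dict below is a hypothetical illustration assembled only from the keys the
## code actually accesses; real values come from the PhiloLogic query parameters.
example_q = {
    'q': 'dog',                 # query term(s); pipe-separated for multiple terms
    'word_num': 5,              # collocation span: words to the left/right of the hit
    'direction': 'all',         # 'left', 'right', or anything else for both sides
    'collocate': 'sea',         # collocate selected in the collocation report
    'collocate_num': 2,         # its reported frequency
    'start': 0,                 # pagination offset into the hitlist
    'results_per_page': 25,     # page size used by the early break in fetch_colloc_concordance
    'colloc_start': 0,          # optional slice bounds used by the full_report variant
    'colloc_end': 1000,
}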