def fetch_colloc_concordance(results, path, q, db, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']

    ## set up filtering of the most frequent terms (filter_words, default 100) ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_text = format_strip(conc_text, bytes)
        conc_text = convert_entities(conc_text)
        #conc_text = unicodedata.normalize('NFC', conc_text)
        start_highlight = conc_text.find('<span class="highlight"')
        m = end_highlight_match.search(conc_text)
        end_highlight = m.end(len(m.groups()) - 1)
        conc_left = conc_text[:start_highlight]
        conc_right = conc_text[end_highlight:]
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    h = collocation_hitlist(new_hitlist, collocate_num)
    return h


def fetch_colloc_concordance(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate'].decode('utf-8', 'ignore')
    collocate_num = q['collocate_num']

    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)
        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    return collocation_hitlist(new_hitlist, collocate_num)


def fetch_colloc_concordance(results, path, q, filter_words=100):
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = q['collocate']
    collocate_num = q['collocate_num']

    ## set up filtering of most frequent 100 terms ##
    filter_list_path = path + '/data/frequencies/word_frequencies'
    filter_words_file = open(filter_list_path)
    line_count = 0
    filter_list = set([])
    for line in filter_words_file:
        line_count += 1
        word = line.split()[0]
        filter_list.add(word.decode('utf-8', 'ignore'))
        if line_count > filter_words:
            break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, 400)
        conc_text = f.get_text(hit, byte_start, 400, path)
        conc_left, conc_middle, conc_right = chunkifier(conc_text, bytes)
        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left')
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right'))
        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

    return collocation_hitlist(new_hitlist, collocate_num)


def fetch_colloc_concordance(results, path, q, db, config, word_filter=True, filter_num=100, stopwords=True):
    length = config['concordance_length']
    within_x_words = q['word_num']
    direction = q['direction']
    collocate = unicodedata.normalize('NFC', q['collocate'].decode('utf-8', 'ignore'))
    collocate_num = q['collocate_num']

    ## set up filtering with stopwords or 100 most frequent terms ##
    filter_list = set([q['q']])
    if word_filter:
        if stopwords:
            filter_list_path = path + '/data/stopwords.txt'
            if os.path.isfile(filter_list_path):
                filter_words_file = open(filter_list_path)
                filter_num = float("inf")
            else:
                filter_list_path = path + '/data/frequencies/word_frequencies'
                filter_words_file = open(filter_list_path)
        else:
            filter_list_path = path + '/data/frequencies/word_frequencies'
            filter_words_file = open(filter_list_path)
        line_count = 0
        for line in filter_words_file:
            line_count += 1
            word = line.split()[0]
            filter_list.add(word.decode('utf-8', 'ignore'))
            if line_count > filter_num:
                break

    new_hitlist = []
    for hit in results:
        ## get my chunk of text ##
        bytes, byte_start = adjust_bytes(hit.bytes, length)
        conc_text = f.get_text(hit, byte_start, length, path)

        ## Isolate left and right concordances
        conc_left = convert_entities(conc_text[:bytes[0]].decode('utf-8', 'ignore'))
        conc_left = begin_match.sub('', conc_left)
        conc_left = start_cutoff_match.sub('', conc_left)
        conc_right = convert_entities(conc_text[bytes[-1]:].decode('utf-8', 'ignore'))
        conc_right = end_match.sub('', conc_right)
        conc_right = left_truncate.sub('', conc_right)
        conc_left = strip_tags(conc_left)
        conc_right = strip_tags(conc_right)

        if direction == 'left':
            words = tokenize(conc_left, filter_list, within_x_words, direction, db)
        elif direction == 'right':
            words = tokenize(conc_right, filter_list, within_x_words, direction, db)
        else:
            words = tokenize(conc_left, filter_list, within_x_words, 'left', db)
            words.extend(tokenize(conc_right, filter_list, within_x_words, 'right', db))

        if collocate in set(words):
            count = words.count(collocate)
            hit.collocate_num = count
            new_hitlist.append(hit)

        if len(new_hitlist) > (q["start"] + q["results_per_page"]):
            break

    h = collocation_hitlist(new_hitlist, collocate_num)
    return h


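## ----------------------------------------------------------------------
## Hypothetical usage sketch (not part of the original module). It only
## illustrates which keys the functions above read from the query object
## 'q' and how the latest signature is called; the example key values and
## the wrapper name below are assumptions for illustration, and 'results',
## 'db', and 'config' are expected to come from the surrounding PhiloLogic
## runtime rather than being built here.
## ----------------------------------------------------------------------
def example_colloc_to_concordance(results, path, db, config):
    q = {
        'q': 'contrat',              # original search term, seeded into the filter list
        'word_num': 5,               # span: look within 5 words of the hit
        'direction': 'both',         # 'left', 'right', or anything else for both sides
        'collocate': 'social',       # collocate selected in the collocation report
        'collocate_num': 12,         # frequency reported for that collocate
        'start': 0,                  # paging: first hit to display
        'results_per_page': 25,      # paging: hits per page
    }
    ## Returns a collocation_hitlist restricted to hits where the collocate
    ## actually occurs within the requested span of the search term.
    return fetch_colloc_concordance(results, path, q, db, config)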