        # for the last word of a sentence
        else:
            # separate sentences with line breaks and a period,
            # if there is no other symbol to mark the end of sentence
            if bool(re.search(r"\w*[.?!]$", w)) or bool(
                    re.search(r"\.[‘’“”\'\"]$", w)):
                text_out.write('\n')
            else:
                text_out.write('.\n')
    text_out.close()


# abbreviations for sentence tokenisation
extra_abbreviations_en = [
    'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e', 'e.g', 'approx', 'apt',
    'appt', 'dept', 'est', 'min', 'max', 'misc', 'no', 'acc', 'fig', 'a.m',
    'p.m', 'a.d', 'b.c', 'etc', 'ca', 'cf', 'ed', 'est', 'f', 'ff', 'pres'
]

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

punkt_param = PunktParameters()
# add the abbreviations to the tokenizer
punkt_param.abbrev_types = set(extra_abbreviations_en)
my_tokenizer = PunktSentenceTokenizer(punkt_param)

# execute the preprocessing for all files in one directory
for f in os.listdir("Data/English/theses"):
    # look for English texts only
    if f.startswith("en"):
        process("Data/English/theses/" + f,
                "PreprocessedData/English/theses/" + f + ".txt")
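# Usage sketch (illustrative only, not part of the original script): with the
# abbreviation list above, Punkt no longer treats the periods in "Dr." or
# "Prof." as sentence boundaries. The sample text is made up.
sample = "Dr. Smith met Prof. Jones. They discussed the thesis."
print(my_tokenizer.tokenize(sample))
# expected: ['Dr. Smith met Prof. Jones.', 'They discussed the thesis.']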
def marker_surr_patt(in_dir):
    """ Find the most frequent POS tag patterns surrounding citation markers.
    """

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    file_names = os.listdir(in_dir)
    patt_comb_freq_map = {}
    patt_orig_freq_map = {}
    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue
        if re.search(r'[a-z]', aid):
            split = re.search(r'[a-z][0-9]', aid).span()[0] + 1
            aid = aid[:split] + '/' + aid[split:]
        with open(path) as f:
            text = f.read()
        text = re.sub(E_G_PATT, 'e.g.', text)
        marker = ' \u241F '
        doc_len = len(text)
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            if marker in sentence:
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                indices = [i for i, tup in enumerate(words)
                           if tup[0] == marker.strip()]
                for word_idx in indices:
                    word = words[word_idx][0]
                    if word == marker.strip():
                        patt_comb = [None, None, None, '[]', None, None, None]
                        patt_orig = [None, None, None, '[]', None, None, None]
                        for shift in range(-3, 4):
                            x_idx = shift + 3
                            if shift == 0:
                                # marker itself
                                continue
                            if word_idx + shift < 0 or \
                                    word_idx + shift >= len(words):
                                patt_comb[x_idx] = '<EOS>'
                                patt_orig[x_idx] = '<EOS>'
                                continue
                            wrd = words[word_idx + shift][0]
                            pos = words[word_idx + shift][1]
                            patt_orig[x_idx] = pos
                            if 'V' in pos:
                                patt_comb[x_idx] = 'V'
                            elif pos in ['NN', 'NNS']:
                                patt_comb[x_idx] = 'NN'
                            elif pos in ['NNP', 'NNPS']:
                                patt_comb[x_idx] = 'NNP'
                            elif pos == 'IN':
                                patt_comb[x_idx] = 'IN'
                            elif 'JJ' in pos:
                                patt_comb[x_idx] = 'JJ'
                            elif 'W' in pos:
                                patt_comb[x_idx] = 'WH'
                            elif 'RB' in pos:
                                patt_comb[x_idx] = 'ADV'
                            elif 'PR' in pos:
                                patt_comb[x_idx] = 'PR'
                            elif wrd == 'FORMULA':
                                patt_comb[x_idx] = 'FORMULA'
                            elif wrd == 'FIGURE':
                                patt_comb[x_idx] = 'FIGURE'
                            elif wrd == 'TABLE':
                                patt_comb[x_idx] = 'TABLE'
                            else:
                                patt_comb[x_idx] = 'OTHER'
                        comb_id = '¦'.join(patt_comb)
                        orig_id = '¦'.join(patt_orig)
                        # # look at examples
                        # if orig_id == 'VBN¦IN¦NNP¦[]¦<EOS>¦<EOS>¦<EOS>':
                        #     print(sentence)
                        #     input()
                        #     print('.')
                        if comb_id not in patt_comb_freq_map:
                            patt_comb_freq_map[comb_id] = 0
                        patt_comb_freq_map[comb_id] += 1
                        if orig_id not in patt_orig_freq_map:
                            patt_orig_freq_map[orig_id] = 0
                        patt_orig_freq_map[orig_id] += 1
        # if file_idx > 200:
        #     break

    patt_comb_freq = sorted(patt_comb_freq_map.items(),
                            key=operator.itemgetter(1), reverse=True)
    patt_orig_freq = sorted(patt_orig_freq_map.items(),
                            key=operator.itemgetter(1), reverse=True)
    print('- - - C O M B - - -')
    for pid in patt_comb_freq[:25]:
        print(pid)
    print('- - - O R I G - - -')
    for pid in patt_orig_freq[:25]:
        print(pid)

    store_comb = []
    for tup in patt_comb_freq:
        pid = tup[0]
        freq = tup[1]
        if '[]¦<EOS>¦<EOS>¦<EOS>' in pid:
            new_pid = pid.replace('[]¦<EOS>¦<EOS>¦<EOS>', '<EOS>')
            store_comb.append((new_pid, freq))
    with open('marker_comb.json', 'w') as f:
        json.dump(store_comb, f)

    store_orig = []
    for tup in patt_orig_freq:
        pid = tup[0]
        freq = tup[1]
        if '[]¦<EOS>¦<EOS>¦<EOS>' in pid:
            new_pid = pid.replace('[]¦<EOS>¦<EOS>¦<EOS>', '<EOS>')
            store_orig.append((new_pid, freq))
    with open('marker_orig.json', 'w') as f:
        json.dump(store_orig, f)
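# Usage sketch (assumed call pattern, directory name is hypothetical): point
# the function at a directory of plain-text papers; it prints the 25 most
# frequent patterns and writes marker_comb.json / marker_orig.json to the
# current working directory.
# marker_surr_patt('path/to/plaintext_papers')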
def tokenize_latin_words(string):
    """
    Tokenizer divides the string into a list of substrings

    >>> from cltk.corpus.utils.formatter import remove_non_ascii
    >>> text = 'Dices ἐστιν ἐμός pulchrum esse inimicos ulcisci.'
    >>> tokenize_latin_words(text)
    ['Dices', 'ἐστιν', 'ἐμός', 'pulchrum', 'esse', 'inimicos', 'ulcisci', '.']

    :param string: This accepts the string value that needs to be tokenized
    :returns: A list of substrings extracted from the string
    """
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word
        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string,
                        flags=re.IGNORECASE)

    punkt_param = PunktParameters()
    abbreviations = [
        'c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp',
        "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet',
        'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop'
    ]
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    word_tokenizer = PunktLanguageVars()

    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []
    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them;
        # needed to make stream.readlines work in PlaintextCorpusReader
        if temp_tokens:
            if temp_tokens[0].endswith('ne'):
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]

            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']

            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []
    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
def read_xml(path):  # pylint: disable=too-many-locals
    """read nxml, xml, html
    """
    try:
        with open(path, 'rb') as f:
            s = f.read()
        s = s.decode('utf8')
        s = s.replace('<break/>', ', ')
        soup = BeautifulSoup(s, 'lxml')
        title = soup.find('article-title')
        title = title.getText(' ') if title is not None else ''
        title = clean_text(title)
        body = [title]
        tables = []
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['fig'])
        tokenizer = PunktSentenceTokenizer(punkt_param)
        for tb in soup.findAll('table'):
            table = {'cells': []}
            for tr in tb.findAll(['tr']):
                row_elements = []
                for td in tr.findAll(['td', 'th']):
                    row_elements.append({'text': clean_text(td.getText(' '))})
                table['cells'].append(row_elements)
            parent = tb
            while parent is not None and parent.find('label') is None:
                parent = parent.find_parent()
            if parent is not None:
                label = parent.find('label').getText(' ')
                caption_obj = parent.find('caption')
                if caption_obj is not None:
                    caption = caption_obj.getText(' ')
                else:
                    caption = ''
            else:
                label, caption = None, None
            table.update({'caption': {
                'text': caption,
                'label': label,
            }})
            tables.append(table)
        for tag in ['notes', 'ref_list', 'floats-group']:
            for element in soup.findAll(tag):
                element.decompose()
        for paragraph in soup.findAll('p'):
            for t in paragraph.findAll('table'):
                t.extract()
            p = map(clean_text, paragraph.getText(' ').split())
            p = ' '.join(filter(bool, p))
            body += tokenizer.tokenize(p)
        body = '\n'.join(body)
        data = PaperData(body, tables)
    except Exception:
        logger.info('fail: %s', path)
        traceback.print_exc()
        return PaperData()
    return data
def __init__(self):
    self.punkt_param = PunktParameters()
    self.punkt_param.abbrev_types = set(ABBREVIATIONS)
    self.sent_tokenizer = LatinPunktSentenceTokenizer()
    self.word_tokenizer = LatinLanguageVars()
def init(self):
    if self.sent_tokeniser_ is None:
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = self.compile_abbreviations()
        self.sent_tokeniser_ = PunktSentenceTokenizer(punkt_param)
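# Note (assumption, not from the original class): Punkt stores abbreviations
# lowercased and without the trailing period, so a compile_abbreviations()
# helper would typically normalise its input accordingly. Hypothetical sketch:
#
# def compile_abbreviations(self):
#     # self.abbreviations is assumed to hold raw strings such as "Dr." or "fig."
#     return {abbr.lower().rstrip('.') for abbr in self.abbreviations}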
def tokenize_latin_words(string):
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word
        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')
                    ]
    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string,
                        flags=re.IGNORECASE)

    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd',
                     'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam',
                     'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert',
                     'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)
    word_tokenizer = PunktLanguageVars()

    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []
    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        if temp_tokens[0].endswith('ne'):
            if temp_tokens[0].lower() not in exceptions:
                temp = [temp_tokens[0][:-2], '-ne']
                temp_tokens = temp + temp_tokens[1:]

        if temp_tokens[-1].endswith('.'):
            final_word = temp_tokens[-1][:-1]
            del temp_tokens[-1]
            temp_tokens += [final_word, '.']

        for token in temp_tokens:
            tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []
    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
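# Usage sketch (illustrative; the exact result depends on the installed cltk
# latin_exceptions list): enclitics such as -que are split into their own tokens.
# >>> tokenize_latin_words('Arma virumque cano.')
# ['Arma', 'virum', '-que', 'cano', '.']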
def data_to_words(data):
    global words
    punkt_param = PunktParameters()
    # try to treat "U.S" as an abbreviation here; this does not seem to work
    abbreviation = ['U.S']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    pat_letter = re.compile(r'[^a-zA-Z \']+')
    # expand some common English contractions
    # to find the 's following the pronouns. re.I refers to ignore case
    pat_is = re.compile("(it|he|she|that|this|there|here)(\'s)", re.I)
    # to find the 's following the letters
    pat_s = re.compile("(?<=[a-zA-Z])\'s")
    # to find the ' following the words ending by s
    pat_s2 = re.compile("(?<=s)\'s?")
    # to find the abbreviation of not
    pat_not = re.compile("(?<=[a-zA-Z])n\'t")
    # to find the abbreviation of would
    pat_would = re.compile("(?<=[a-zA-Z])\'d")
    # to find the abbreviation of will
    pat_will = re.compile("(?<=[a-zA-Z])\'ll")
    # to find the abbreviation of am
    pat_am = re.compile("(?<=[I|i])\'m")
    # to find the abbreviation of are
    pat_are = re.compile("(?<=[a-zA-Z])\'re")
    # to find the abbreviation of have
    pat_ve = re.compile("(?<=[a-zA-Z])\'ve")

    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, str) and (key == "id" or key == "is_impossible"):
                continue
            elif isinstance(value, int) and key == "answer_start":
                continue
            elif isinstance(value, str):
                # value = "".join(tokenizer.tokenize(value))
                # strip punctuation (everything except ') via the regex and lowercase
                value = pat_letter.sub(' ', value).strip().lower()
                value = pat_is.sub(r"\1 is", value)
                value = pat_s.sub("", value)
                value = pat_s2.sub("", value)
                value = pat_not.sub(" not", value)
                value = pat_would.sub(" would", value)
                value = pat_will.sub(" will", value)
                value = pat_am.sub(" am", value)
                value = pat_are.sub(" are", value)
                value = pat_ve.sub(" have", value)
                value = value.replace('\'', ' ')
                words.extend(WordPunctTokenizer().tokenize(value))
            else:
                data_to_words(value)
    elif isinstance(data, list):
        for i in data:
            if isinstance(i, str):
                i = "".join(tokenizer.tokenize(i))
                i = pat_letter.sub(' ', i).strip().lower()
                i = pat_is.sub(r"\1 is", i)
                i = pat_s.sub("", i)
                i = pat_s2.sub("", i)
                i = pat_not.sub(" not", i)
                i = pat_would.sub(" would", i)
                i = pat_will.sub(" will", i)
                i = pat_am.sub(" am", i)
                i = pat_are.sub(" are", i)
                i = pat_ve.sub(" have", i)
                i = i.replace('\'', ' ')
                words.extend(WordPunctTokenizer().tokenize(i))
            else:
                data_to_words(i)
    else:
        # print("{}".format(data))
        pass
    return words
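# Illustrative example (made-up input) of the contraction handling above:
#   data_to_words(["It's clear they don't think we've finished."])
# is expanded to roughly "it is clear they do not think we have finished"
# before WordPunctTokenizer splits it into individual words.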
import nltk
from nltk.tree import Tree
import os.path

# imports needed for the tokenizer parameters and the Java gateway used below
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from py4j.java_gateway import JavaGateway

from PreProcessing import parsers

# edit this when changing dirs
LangPaths = os.path.realpath(
    "C:/users/rihanna/Documents/Pol/ThesisIt/SumMe/Summarizer/langdetector/profiles/"
)

tltagger = nltk.data.load("taggers/filipino_aubt.pickle")  # Filipino POS tagger
tlChunker = nltk.data.load("chunkers/filipino_ub.pickle")  # Filipino chunker here
enChunker = nltk.data.load("chunkers/conll2000_ub.pickle")  # English chunker here

punkt_param = PunktParameters()  # container for tokenizer parameters
# further accepted abbreviations go here
punkt_param.abbrev_types = set(['gng', 'mr', 'mrs', 'dr', 'rep'])
sentence_splitter = PunktSentenceTokenizer(punkt_param)
tokenized = ""

gateway = JavaGateway()
detector = gateway.entry_point
detector.init(LangPaths)


def LangDetect(str):
    return detector.detect(str)


def tokenizer(str):
def get_pdf_objects(filename, table_detect=True):  # pylint: disable=too-many-locals
    """extract body, table, table images from pdf
    """
    body, tables = [], []
    pages = fitz.open(filename)
    page_images, page_image_data = pdf_to_image(filename)
    prev_caption = None
    for i, page in enumerate(pages):
        ratio = page_images[i].shape[0] / page.rect[3]
        page_dict = get_pdf_page_dict(page, ratio)
        pred_table_boxes = find_tables(
            page_image_data[i]) if table_detect else []
        page_tables = table_post_process(page_dict, pred_table_boxes,
                                         prev_caption)
        prev_caption = page_tables[-1]['caption'] if page_tables else None

        # separate body blocks and table blocks
        table_blocks = [[] for _ in page_tables]
        for block in page_dict['blocks']:
            if block['type'] == 1:
                continue
            for j, table in enumerate(page_tables):
                if (not table['continued'] and overlap_ratio(
                        block['bbox'], table['caption']['bbox']) > 0.5):
                    break
                elif overlap_ratio(block['bbox'], table['bbox']) > 0.5:
                    table_blocks[j].append(block)
                    break
            else:
                body += get_lines(block)

        # construct table
        for j, (blocks, table) in enumerate(zip(table_blocks, page_tables)):
            table['cells'] = construct_table(blocks)

        # crop table images
        for table in page_tables:
            x1, y1, x2, y2 = table['bbox']
            image = page_images[i][y1:y2, x1:x2, :]
            if image.size == 0:
                continue
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            img_data = cv2.imencode(
                '.jpg', image,
                [int(cv2.IMWRITE_JPEG_QUALITY), 75])[1].tostring()
            table['image'] = img_data
        tables += page_tables

    # sentence tokenize body text
    body = ' '.join(map(clean_text, body))
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['fig'])
    body = list(PunktSentenceTokenizer(punkt_param).tokenize(body))
    body = split_sents(body)
    return body, tables
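# Usage sketch (file name is hypothetical): returns the tokenized body
# sentences and a list of table dicts carrying 'cells', 'caption', 'bbox'
# and JPEG-encoded 'image' data.
# body, tables = get_pdf_objects('paper.pdf', table_detect=True)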
def __init__(self):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['e.g', 'ie', 'i.e', 'eg'])
    super().__init__(punkt_param)
def sent_pos(in_dir):
    """ Positions of citation markers in sentences, relative to where in the
        document they appear.
    """

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    with open('hedge_words') as f:
        hedge_words = [l.strip() for l in f.readlines()]

    x = []
    y = []
    file_names = os.listdir(in_dir)
    buckets = []
    for foo in range(10):
        buckets.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue
        with open(path) as f:
            text = f.read()
        text = re.sub(E_G_PATT, 'e.g.', text)
        # annot_fn = '{}_annot.json'.format(aid)
        # annot_path = os.path.join(in_dir, annot_fn)
        # if not os.path.isfile(annot_path):
        #     continue
        # with open(annot_path) as f:
        #     annots = json.load(f)
        marker = ' \u241F '
        doc_len = len(text)
        # ↓ word wise
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            # determine contained annotations
            # annotated_words = []
            # for annot in annots:
            #     start = annot[0]
            #     end = annot[1]
            #     dbp_id = annot[2]
            #     annot_len = end - start
            #     in_sent_idx = start - sent_idx
            #     if start >= sent_idx and end <= sent_edx:
            #         disp = sentence_orig[in_sent_idx:in_sent_idx+annot_len]
            #         annotated_words.append(disp)
            if marker in sentence:
                doc_pos = 1 - (sent_idx / doc_len)
                buck_y_idx = math.floor(doc_pos * 10)
                if buck_y_idx == 10:
                    buck_y_idx = 9
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                sent_tags_str = ' '.join([tup[1] for tup in words])
                indices = [
                    i for i, tup in enumerate(words)
                    if tup[0] == marker.strip()
                ]
                # if 'JJS' not in sent_tags_str:
                #     continue
                for word_idx in indices:
                    word = words[word_idx][0]
                    # alternative context filters, kept for reference:
                    # if word == marker.strip() and \
                    #         words[word_idx-1][1] == 'IN':
                    # if word == marker.strip() and \
                    #         ((word_idx > 0 and \
                    #           'FORMULA' not in words[word_idx-1][0] and \
                    #           words[word_idx-1][1] in ['NNP', 'NNPS']) or \
                    #          (word_idx > 1 and \
                    #           words[word_idx-1][1] in ['NN', 'NNS'] and \
                    #           'FORMULA' not in words[word_idx-2][0] and \
                    #           words[word_idx-2][1] in ['NNP', 'NNPS'])):
                    # if word == marker.strip() and \
                    #         (word_idx > 0 and \
                    #          words[word_idx-1][0] in annotated_words and \
                    #          words[word_idx-1][1] in ['NNP', 'NNPS']):
                    # if word == marker.strip() and \
                    #         word_idx+1 < len(words) and \
                    #         'VB' in words[word_idx+1][1]:
                    if word == marker.strip():
                        # print(words)
                        # print('doc pos: {}'.format((sent_idx/doc_len)))
                        # print('sent pos: {}/{}'.format((word_idx+1), sent_len))
                        # input()
                        sent_pos = (word_idx + 1) / sent_len
                        y.append(doc_pos)
                        x.append(sent_pos)
                        buck_x_idx = math.floor(sent_pos * 10)
                        if buck_x_idx == 10:
                            buck_x_idx = 9
                        buckets[buck_y_idx][buck_x_idx] += 1
        # if file_idx > 1000:
        #     break

        # # ↓ character wise
        # for sent_idx, sentence in enumerate(sentences):
        #     # has_hw = False
        #     # for hw in hedge_words:
        #     #     if hw in sentence:
        #     #         has_hw = True
        #     #         break
        #     # if not has_hw:
        #     #     continue
        #     sent_len = len(sentence)
        #     doc_pos = 1 - (sent_idx/doc_len)
        #     buck_y_idx = math.floor(doc_pos*10)
        #     if buck_y_idx == 10:
        #         buck_y_idx = 9
        #     for cit_mark in re.finditer(marker, sentence):
        #         cm_idx = cit_mark.end()
        #         sent_pos = cm_idx/sent_len
        #         y.append(doc_pos)
        #         x.append(sent_pos)
        #         buck_x_idx = math.floor(sent_pos*10)
        #         if buck_x_idx == 10:
        #             buck_x_idx = 9
        #         buckets[buck_y_idx][buck_x_idx] += 1

    print('normalized row distributions:')
    for line in buckets:
        print(' '.join(['{:.2f}'.format(x / sum(line)) for x in line]))

    plt.xlabel('citation marker position in sentence')
    plt.ylabel('sentence position in document')
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=(50))
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
    # plt.imshow(heatmap.T, extent=extent, origin='lower')
    plt.colorbar()
    plt.show()
    plt.clf()

    plt.xlabel('citation marker position in sentence')
    plt.ylabel('sentence position in document')
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=(50))
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    # plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
    plt.imshow(heatmap.T, extent=extent, origin='lower')
    plt.colorbar()
    plt.show()