def detect(self, start_position, end_position):
    """Locate the context end at the next upper-case formatted heading.

    The detector only applies when the line at ``start_position`` is itself
    an upper-case heading. It then scans the text between the end of that
    line and ``end_position`` for the first position whose line is also an
    upper-case heading.

    A line counts as a heading when the first run of at least four capital
    letters/spaces found in it covers more than half of the line's length.

    Args:
        start_position (int): raw start position of the context
        end_position (int): raw end position of the context

    Returns:
        int: position of the identified subject context end, or None when
        the start line is not a heading or no following heading is found
    """
    # Capital letters (including Czech diacritics) and spaces, 4+ in a row.
    heading_regex = '[A-ZĚŠČŘŽÝÁÍÉÚŮŇŤÓĎ ]{4,}'

    def looks_like_heading(line):
        # A successful match implies the line has at least four characters,
        # so the division below cannot hit an empty line.
        found = re.search(heading_regex, line)
        return bool(found) and len(found.group(0)) / len(line) > 0.5

    # NOTE(review): get_current_line presumably returns the full line that
    # contains the given position - confirm against the helper.
    first_line = get_current_line(self._text, start_position)
    if not looks_like_heading(first_line):
        return None

    # Continue the search after the starting heading.
    search_from = start_position + len(first_line)
    candidates = find_all_occurrences_in_string(
        heading_regex, self._text[search_from:end_position])

    for offset in candidates:
        # Offsets are relative to the searched slice; make them absolute.
        position = search_from + offset
        if looks_like_heading(get_current_line(self._text, position)):
            self._end_position = position
            return self._end_position
    return None
def detect(self, start_position, end_position):
    """Locate the context end for subjects introduced by a 'název' heading.

    The detector applies only when the keyword 'název' (Czech for "name" /
    "title") occurs within 50 characters on either side of
    ``start_position``. The end is then the first newline between
    ``start_position`` and ``end_position`` whose line is longer than
    30 characters.

    Args:
        start_position (int): raw start position of the context
        end_position (int): raw end position of the context

    Returns:
        int: position of the identified subject context end, or None when
        the keyword is absent or no suitable newline is found
    """
    keyword = 'název'

    # Window of +-50 characters around the start, clamped to the text.
    window_from = max(start_position - 50, 0)
    window_to = min(start_position + 50, len(self._text))
    keyword_hits = find_all_occurrences_in_string(
        keyword, self._text[window_from:window_to])
    if len(keyword_hits) == 0:
        return None

    newline_offsets = find_all_occurrences_in_string(
        '\n', self._text[start_position:end_position])
    for offset in newline_offsets:
        # Offsets are relative to the searched slice; make them absolute.
        position = start_position + offset
        # NOTE(review): which line get_current_line reports for a position
        # sitting on a newline depends on the helper - confirm.
        if len(get_current_line(self._text, position)) > 30:
            self._end_position = position
            return self._end_position
    return None
def detect(self, start_position, end_position):
    """Locate the context end at the next 'článek' section heading.

    The detector applies only when the keyword 'článek' (Czech for
    "article") occurs in the 50 characters preceding ``start_position``.
    The end is then the first occurrence of the same keyword between
    ``start_position`` and ``end_position`` that sits on a line shorter
    than 50 characters.

    Args:
        start_position (int): raw start position of the context
        end_position (int): raw end position of the context

    Returns:
        int: position of the identified subject context end, or None when
        the context is not introduced by the keyword or no following
        heading is found
    """
    keyword = 'článek'

    # Up to 50 characters immediately before the context start.
    preceding_text = self._text[max(start_position - 50, 0):start_position]
    if len(find_all_occurrences_in_string(keyword, preceding_text)) == 0:
        return None

    following_hits = find_all_occurrences_in_string(
        keyword, self._text[start_position:end_position])
    for offset in following_hits:
        # Offsets are relative to the searched slice; make them absolute.
        position = start_position + offset
        # Only short lines are accepted as headings.
        if len(get_current_line(self._text, position)) < 50:
            self._end_position = position
            return self._end_position
    return None
def detect(self, start_position, end_position):
    """Locate the context end at the next Roman-numbered section heading.

    The detector applies only when a Roman numeral is found in the
    50 characters preceding ``start_position``. The end is then the first
    Roman numeral between ``start_position`` and ``end_position`` that sits
    on a line shorter than 50 characters.

    Args:
        start_position (int): raw start position of the context
        end_position (int): raw end position of the context

    Returns:
        int: position of the identified subject context end, or None when
        the context is not introduced by a Roman numeral or no following
        numbered heading is found
    """
    subj_prefix = self._text[max(start_position - 50, 0):start_position]

    # A whitespace character, a non-empty Roman numeral in the range I-XXXIX
    # (the lookahead rejects the empty numeral), then at least one
    # whitespace/non-word character. Written as a raw string so that `\s`
    # and `\W` are not treated as (invalid) string escape sequences; the
    # resulting pattern text is unchanged.
    roman_numeral_pattern = r'\s(?=[XVI])(X{0,3})(I[XV]|V?I{0,3})[\s\W]+'

    # Case must be preserved, otherwise the capital numerals cannot match.
    start_occurrences = find_all_occurrences_in_string(
        roman_numeral_pattern, subj_prefix, lower=False)
    if len(start_occurrences) == 0:
        return None

    end_occurrences = find_all_occurrences_in_string(
        roman_numeral_pattern,
        self._text[start_position:end_position],
        lower=False)
    for occ in end_occurrences:
        # Offsets are relative to the searched slice; make them absolute.
        occ = start_position + occ
        # The match starts on the whitespace before the numeral, so the line
        # is looked up one character further, at the numeral itself.
        if len(get_current_line(self._text, occ + 1)) < 50:
            self._end_position = occ
            return self._end_position
    return None
def get_all_occurrences(self, text):
    """Find every keyword occurrence in the text and rate it.

    Each keyword in ``self._keywords`` carries a default rating. For every
    occurrence a coefficient is accumulated from the local characteristics
    of the surrounding text, and the resulting rating is the default rating
    multiplied by that coefficient.

    Args:
        text (str): text to find the keywords in

    Returns:
        list: one dictionary per occurrence with the keys ``'keyword'``
        (the matched spelling), ``'rat'`` (the rating) and ``'occ'``
        (the occurrence position)
    """
    results = []
    for keyword in self._keywords:
        base_rating = self._keywords[keyword]
        kw_len = len(keyword)
        kw_lower = keyword.lower()
        kw_upper = keyword.upper()

        for pos in find_all_occurrences_in_string(keyword, text):
            line = get_current_line(text, pos)
            # Slices clamp to the text bounds on their own.
            at_keyword = text[pos:pos + kw_len]
            before = text[max(pos - 20, 0):pos]

            coefficient = 1
            spelling = kw_lower

            # The keyword makes up the whole line.
            if line.lower() == kw_lower:
                coefficient += 2
            # The text matches the keyword exactly as configured.
            if at_keyword == keyword:
                coefficient += 1.5
                spelling = keyword
            # The text matches the keyword written in upper case.
            if at_keyword == kw_upper:
                coefficient += 1.5
                spelling = kw_upper
            # A line break shortly after the keyword (chapter title).
            if '\n' in text[pos:pos + kw_len * 3]:
                coefficient += 2
            # A newline followed by a number shortly before the keyword
            # (chapter numbering).
            if re.search(r"\n[ ]*[0-9]", before):
                coefficient += 2
            # The verb ' je ' ("is") shortly after the keyword
            # (subject sentence).
            if ' je ' in text[pos:pos + kw_len * 2]:
                coefficient += 2
            # The word 'článek' ("article") shortly before the keyword
            # (chapter header).
            if 'článek' in before.lower():
                coefficient += 2
            # Several 'I' characters shortly before the keyword
            # (Roman chapter numbering).
            if before.count('I') > 1:
                coefficient += 2

            # Bonus computed from the text 50-100 characters further on.
            coefficient += chars_occurrence_ratio(text[pos + 50:pos + 100])

            # The noun 'Zboží' ("goods") nearby halves the coefficient.
            if 'Zboží' in text[pos:pos + 150]:
                coefficient *= 0.5

            results.append({
                'keyword': spelling,
                'rat': base_rating * coefficient,
                'occ': pos,
            })
    return results
def detect(self, start_position, end_position):
    """Locate the context end at the next decimal-numbered section heading.

    The detector applies only when a line starting with a number is found in
    the 50 characters preceding ``start_position``. That number is taken as
    the current section number. The end is then the first numbered line
    between ``start_position`` and ``end_position`` that:

    * carries a number greater than the current one by 1 or 2,
    * is shorter than 50 characters, and
    * contains at most 5 digits.

    Args:
        start_position (int): raw start position of the context
        end_position (int): raw end position of the context

    Returns:
        int: position of the identified subject context end, or None when
        the context is not introduced by a numbered line or no suitable
        following heading is found
    """
    subj_prefix = self._text[max(start_position - 50, 0):start_position]

    # Classic numbering: newline, optional indentation, a number and one
    # more character that is not a slash. Raw strings are used so that `\d`
    # is not treated as an (invalid) string escape sequence; `\n` and `\t`
    # are then interpreted by the regex engine with the same meaning.
    numeral_pattern = r'\n[ \t]*[\d]+[^/]'

    start_occurrences = find_all_occurrences_in_string(
        numeral_pattern, subj_prefix, lower=False)
    if len(start_occurrences) == 0:
        return None

    # Search the reversed prefix with the mirrored pattern so that the
    # numbered line closest to the context start is found first.
    reversed_match = re.search(r'[^/][\d]+[ \t]*\n', subj_prefix[::-1])
    if reversed_match is None:
        # Not expected after a successful forward search, but avoids calling
        # .group() on None.
        return None
    # The digits were read back to front; restore their order.
    article_num = int(
        re.search(r'[\d]+', reversed_match.group(0)).group(0)[::-1])

    end_occurrences = find_all_occurrences_in_string(
        numeral_pattern,
        self._text[start_position:end_position],
        lower=False)
    for occ in end_occurrences:
        # Offsets are relative to the searched slice; make them absolute.
        occ = start_position + occ

        m = re.search(r'[\d]+', self._text[occ:occ + 10])
        if m is None:
            continue
        num = int(m.group(0))
        # Accept only the next section number or the one after it.
        if not article_num < num < article_num + 3:
            continue

        # The match starts on the newline, so the numbered line itself is
        # looked up one character further.
        current_line = get_current_line(self._text, occ + 1)
        if len(current_line) >= 50:
            continue

        num_numbers_in_line = len(
            find_all_occurrences_in_string(r'\d', current_line))
        if num_numbers_in_line <= 5:
            self._end_position = occ
            return self._end_position
    return None
def detect(self, start_position, end_position):
    """Locate the context end at the first following special keyword.

    The keywords 'Cena' ("price"), 'Doba' ("period") and 'Místo' ("place")
    are tried in that order. For each one, the text between
    ``start_position`` and ``end_position`` is searched, and the first
    occurrence that sits on a line shorter than 50 characters is taken as
    the end. An earlier keyword in the list therefore wins over a later
    one, even when the later one appears sooner in the text.

    Args:
        start_position (int): raw start position of the context
        end_position (int): raw end position of the context

    Returns:
        int: position of the identified subject context end, or None when
        no keyword is found on a short line
    """
    searched_text = self._text[start_position:end_position]

    # NOTE(review): the helper is called with its default case handling;
    # confirm that this matches the capitalised keywords as intended.
    for end_word in ('Cena', 'Doba', 'Místo'):
        for offset in find_all_occurrences_in_string(end_word, searched_text):
            # Offsets are relative to the searched slice; make them absolute.
            position = start_position + offset
            # Only short lines are accepted as headings.
            if len(get_current_line(self._text, position)) < 50:
                self._end_position = position
                return self._end_position
    return None