def __init__(self):
    """Set up pattern fragments, compiled regexes and location dictionaries.

    All regular-expression literals are raw strings, so backslash escapes
    reach the ``re`` module unchanged instead of relying on Python keeping
    unrecognised string escapes (deprecated, and a SyntaxWarning on current
    interpreters).  Newline escapes are therefore resolved by the regex
    engine rather than by the string literal; none of the patterns that
    contain them is compiled with ``re.VERBOSE`` here, so they match the
    same text as before.

    Raises:
        DictionaryError: if building the country or city dictionary raises
            ``RRSDictionaryError``.
    """
    # --- pattern fragments (plain strings, combined into regexes below) ---
    self._pat_abstract_1 = '(Abstract|ABSTRACT)'
    self._pat_abstract_2 = (
        '(In this|This (paper|study|article|report)'
        '|IN THIS|THIS (PAPER|STUDY|ARTICLE|REPORT))'
    )
    # An abstract ends at the "!A_E!" marker, at end of input, or at a
    # blank line.
    self._pat_abstract_end = r'(!A_E!|$|\n\s*\n)'
    # "Index terms", "Keywords [and ...]", "General term(s)" in any case mix.
    self._pat_keywords = (
        'I[nN][dD][eE][xX] ?[tT][eE][rR][mM][sS]|'
        'K[eE][yY] ?[wW][oO][rR][dD][sS](?: and [A-Za-z]*?)?|'
        'G[eE][nN][eE][rR][aA][lL] ?[tT][eE][rR][mM][sS]?'
    )
    # Month names; every letter after the leading ones is optional, as is
    # a trailing dot.
    self.pat_months = (
        r'janu?a?r?y?\.?|febr?u?a?r?y?\.?|marc?h?\.?|apri?l?\.?|'
        r'may|june?\.?|july?\.?|augu?s?t?\.?|sept?e?m?b?e?r?\.?|'
        r'octo?b?e?r?\.?|nove?m?b?e?r?\.?|dece?m?b?e?r?\.?'
    )
    self.pat_junctions = \
        'the|be|a|an|anthe|of|in|at|for|from|to|into|and|or|with'
    self.pat_middle_name = (
        '(ter|Ter|van|den|der|de|di|la|van der|von|chen|'
        'van de|van den|Van|Den|Der|De|Di|La|Van der|Von|'
        'Chen|Van de|Van den|el|El)'
    )
    self.alone_name = \
        r'([eE]t[. ]*?al\.?|[Ss]ons?|[Jj]r[ .,]|[Jj]unior|[eE]tc\.?)'
    self.roman_num = (
        '(I[ .]|II[ .]|III[ .]|IV[ .]|V[ .]|VI[ .]|VII[ .]|'
        'VIII[ .]|IX[ .]|X[ .])'
    )

    # --- compiled regular expressions ---
    self._re_abstract_1 = re.compile(
        self._pat_abstract_1 + r'\W+(.+?)' + self._pat_abstract_end,
        re.DOTALL)
    self._re_abstract_2 = re.compile(
        self._pat_abstract_2 + r'\W+(.+?)' + self._pat_abstract_end,
        re.DOTALL)
    # Keywords heading followed by the terms on the next line ...
    self._re_keywords_1 = re.compile(
        '(' + self._pat_keywords + r')[-:,;. ]*?\n+(.+?)\n')
    # ... or by the terms on the same line.
    self._re_keywords_2 = re.compile(
        '(' + self._pat_keywords + r')[-:,;. ]+(.+?)(\n|[^0-9A-Z]\.|$)')
    self._re_meta = re.compile(
        '(^.*)(authors?:|emails?:|editors?:)', re.I)
    self.re_published = re.compile(r'(^|\s)published (in|with)', re.I)
    self.re_issn = re.compile('ISSN|[0-9]{4}-[0-9]{4}')
    self.re_rep = re.compile(
        r'(^\s*[a-z]{2,} report\s*$|tech report)', re.I)
    self.re_lower_start = re.compile(r"^[a-z][^A-Z-'`]")
    self.re_vol = re.compile(r'vol(ume)?\.? *[0-9]+', re.I)
    self.re_rev = re.compile('[Rr]evision *[0-9]+')
    self.re_inproc = re.compile(
        r'in proc\.|in proceedings|proceedings of', re.I)
    self.re_no = re.compile(r'n(o|umber)\.? *[0-9]+', re.I)
    self.re_pages = re.compile(r'(pages?|p\.) [0-9]', re.I)
    self.re_etal = re.compile(r'et al\.?(\s|$)', re.I)
    self.re_copyright = re.compile(r'(^|\s)copyright(\s|$)', re.I)
    self.re_date = re.compile(
        '(' + self.pat_months + ') .*?[0-9]{4}', re.I)
    self.re_year = re.compile(r'\([0-9]{4}\)')
    self.re_num = re.compile(r'^[0-9\s%]+$', re.DOTALL)
    self.re_noletter = re.compile('^[^a-z]+$', re.IGNORECASE)
    self.re_upper_word = re.compile('[A-Z][A-Z]+')
    self.re_lower_word = re.compile('[a-z]')
    self.re_telfax = re.compile(
        '(tel|fax) +[+]?[0-9][-0-9 ()]', re.I)
    self.re_empty = re.compile(r'^\s*$', re.DOTALL)
    self.re_notitle = re.compile(
        r'\s(conference|symposium)\s', re.M | re.I)
    self.re_mark = re.compile(
        r'(^[^\s]+ ?(/|-[0-9]) ?[^\s]+$|[0-9]+[-:][0-9]+)')
    # NOTE(review): "|" binds looser than the anchors, so this matches
    # "(^|\s)thesis", a bare "article" anywhere, or "journal(\s|$)".
    # Kept as is because callers may rely on it; confirm the intent.
    self.re_type = re.compile(
        r'(^|\s)thesis|article|journal(\s|$)', re.I | re.DOTALL)
    self.re_organization = re.compile(
        'department of|university|school of', re.IGNORECASE)
    self.re_press = re.compile(
        '(^|[^0-9a-z])in press([^0-9a-z]|$)', re.IGNORECASE)
    self.re_domain = re.compile(r'.+\.[a-z]{2,3}$', re.IGNORECASE)
    # Whole string wrapped in [] or (); the brackets inside the sets are
    # escaped so the opening "[" is not reported as a possible nested set.
    self.re_zav = re.compile(r'^[\[(].*[\])]$')
    # NOTE(review): re_title used to be assigned twice; an earlier value
    # ('[A-Z][A-Z]+( [A-Z][A-Z])* [0-9]{4}') was overwritten by this one
    # before any use, so only the effective assignment is kept.
    self.re_title = re.compile(r'^.*?T[Ii][Tt][Ll][Ee]:\s*(.+?)(\.|$)')

    # Author name matcher.
    # NOTE(review): re.VERBOSE makes the engine ignore whitespace that is
    # outside a character class, so the space after "(?<=\.)" and the
    # spaces inside multi-word pat_middle_name alternatives ("van der")
    # are dropped from the pattern.  Left unchanged to preserve matching.
    mid = self.pat_middle_name
    self.re_authors = re.compile(
        r"^[ ,.&]*("
        r"(([A-Z][-A-Za-z'´`]+[.,]?|" + mid +
        r"[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?"
        r"("
        # FirstName von Surname
        r"(([A-Z][-A-Za-z'´`]+)[.,]?[ ]+)"
        r"(([A-Z][-A-Za-z'´`]+[.,]?|" + mid +
        r"[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?"
        r"(([A-Z][-A-Za-z'´`]+)[ ]*)"
        r"|"
        # Surname, J.
        r"(([A-Z][-A-Za-z'´`]+[.,]?)[ ]+)"
        r"(([A-Z][-A-Za-z'´`]+[.,]?|" + mid +
        r"[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?"
        r"((" + mid + r"[.,]?|([A-Z]\.?-)?[A-Z][ .]?)[ ]*)+?"
        r"|"
        # J. von Surname
        r"((" + mid + r"[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)+?"
        r"(([A-Z][-A-Za-z'´`]+[.,]?|" + mid +
        r"[.,]?|([A-Z]\.?-)?[A-Z]\.)[ ]+)*?"
        r"(([A-Z][-A-Za-z'´`]+)[ ]*)"
        r")"
        r"(([A-Z]v+|" + mid + r"|([A-Z]\.?-)?[A-Z]\.)[ ]*)*?"
        r"(([., ]*(" + self.alone_name + r"|" + self.roman_num + r")?)*"
        r"((?<=\.) |&|[,. ]?and[,. ]|,|\.|;|$|%SEP%|\())+?"
        r"([., ]*" + self.alone_name + r")*)",
        re.VERBOSE)

    self.re_inc = re.compile(
        r'[^a-zA-Z]([A-Z]\.|' + self.pat_middle_name + '|' +
        self.alone_name + '|&)[^a-zA-Z]')
    # Line ending in punctuation or in one of the junction words.
    self.re_end = re.compile(
        '([-:,]| (' + self.pat_junctions + r'))\s*\n', re.IGNORECASE)
    self.re_split = re.compile(
        r'^[A-Z]+( [^ a-z]+)*\n([^ a-z]+( [^ a-z]+)*\n)+')
    self.re_upper = re.compile('([A-Z][A-Z]+ ){3}')
    self.re_lower = re.compile('(^.+?) (.[^ ]?[a-z].*$)')
    self.re_by = re.compile('(^.+) by (.+?)$', re.DOTALL)
    self.re_autinline = re.compile(
        '^([A-Z][^A-Z]+( [^A-Z]+){3}.*?)([A-Z][.a-z].*)')
    self.re_word_end = re.compile(r' [^A-Z]+\s*$')
    self.re_term = re.compile(
        r'(^|\s*[:,;]+)\s*(.+?)\s*([:,;]+\s*|$)')

    # --- dictionaries ---
    try:
        self.rrsdict_locations = RRSDictionary(COUNTRIES, FIRST_UPPER)
        self.rrsdict_locations.extend(RRSDictionary(CITIES, FIRST_UPPER))
    except RRSDictionaryError:
        raise DictionaryError("Failed to load dictionaries.")
def __init__(self):
    """Set up keyword tables, page/proceedings patterns and the event dictionary.

    Each ``kws_*`` table maps a regex string (or, in ``kws_article``, a
    bound method of this object) to an integer; ``types`` maps the
    document-type constants to those tables.  The integers are presumably
    score weights - confirm against the code that consumes ``types``.

    Regex literals are raw strings so backslash escapes reach the ``re``
    module unchanged.  Newline escapes in the ``pat_*`` fragments are
    therefore resolved by the regex engine rather than by the string
    literal; every regex compiled from them here is non-verbose, so the
    matched text is the same.
    """
    self.pat_month = (
        r"january|february|march|april|may|june|july|august|"
        r"september|october|november|december|jan\.?|feb\.?|"
        r"mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
        r"nov\.?|dec\.?"
    )
    # The month alternation must be grouped: without the (?:...) the
    # trailing " (day )*" part attached only to the last alternative
    # ("dec\.?"), so e.g. "january 2005" could never match.  The group is
    # non-capturing to keep group numbers unchanged
    # (1 = month and day part, 2 = last day token, 3 = year).
    self.pat_date = (
        "((?:" + self.pat_month + r") ([_0-9]+\.? )*)"
        "(19[0-9]{2}|2[0123][0-9]{2})"
    )

    self.kws_unpublished = {
        "introduction": 50,
        "abstract": 50,
        "related work": 50,
    }
    self.kws_article = {
        r"(vol\.?|volume)\s*[0-9]+": 20,
        self.pat_date: 20,
        r"(pages?|pp?\.?)\s*[-0-9]+(?!\))": 20,
        r"(number|no\.?)\s*[_0-9]+": 20,
        "copyright": 20,
        "all rights reserved": 20,
        "journal": 20,
        "is published": 20,
        "published in": 20,
        "first published": 20,
        "in press": 20,
        "introduction": 50,
        "abstract": 50,
        "related work": 50,
        # Callables defined elsewhere on this class serve as keys as well.
        self._find_events: 50,
        self._find_events_2: 20,
    }
    self.kws_techreport = {
        "this report": 200,
        "tech[a-z]+ report": 200,
        "summary report": 100,
        "is (a )?report": 80,
    }
    self.kws_phdthesis = {
        "supervisor": 100,
        "this thesis": 200,
        "dissertation": 200,
        r"Ph\.?D thesis": 200,
    }
    self.kws_masterthesis = {
        "supervisor": 100,
        "this thesis": 200,
        "master thesis": 200,
        r"master\W?s thesis": 200,
    }
    self.types = {
        UNPUBLISHED: self.kws_unpublished,
        ARTICLE: self.kws_article,
        PHDTHESIS: self.kws_phdthesis,
        MASTERTHESIS: self.kws_masterthesis,
        TECHREPORT: self.kws_techreport,
    }

    self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
    self.re_oo_impress = re.compile('impress', re.IGNORECASE)
    self.re_pages = re.compile(r'Pages:\s*([0-9]+)')
    self.pat_chapters = "R(eferences|EFERENCES)"
    self.re_chapters = re.compile(
        r'(^.*\W)(' + self.pat_chapters + r')\W', re.DOTALL)

    # --- proceedings articles: pattern fragments and regexes ---
    self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
    # Dot leaders followed by a number at end of line.
    self.pat_toc_page = r"(\.\s*){2,}[0-9]+\n"
    # Blank line, optionally with a lone "-" before or after it.
    self.pat_pagesep = r"(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
    self.pat_roman_nums = (
        self.pat_pagesep +
        "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|IV"
        "|III|II|I)" +
        self.pat_pagesep
    )
    self.pat_page_end_strict = (
        "(" + self.pat_pagesep + "[0-9]+" + self.pat_pagesep +
        r"|\npage [0-9]+\n|\.[0-9]+" + self.pat_pagesep + ")"
    )
    # Looser variant: a bare blank line also counts as a page end.
    self.pat_page_end = (
        "(" + self.pat_pagesep + "[0-9]+" + self.pat_pagesep +
        r"|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)"
    )

    prefix_head = \
        "(^.+((" + self.pat_time + "|" + self.pat_toc_page + ").*"
    prefix_flags = re.DOTALL | re.IGNORECASE
    self.re_proceedings_prefix_strict = re.compile(
        prefix_head + self.pat_page_end_strict + "|" +
        self.pat_roman_nums + "))",
        prefix_flags)
    self.re_proceedings_prefix = re.compile(
        prefix_head + self.pat_page_end + "|" +
        self.pat_roman_nums + "))",
        prefix_flags)
    self.re_abstract = re.compile(
        r"(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
        re.DOTALL | re.IGNORECASE)
    # NOTE(review): re_proceedings used to be assigned twice; an earlier
    # value ('\W(proceedings|conference)\W', re.DOTALL) was overwritten by
    # this one before any use, so only the effective assignment is kept.
    # If a caller expects the keyword detector, it needs its own name.
    self.re_proceedings = re.compile(
        r"(^.+?\Wproceedings\W.*?\n\n)", re.DOTALL | re.IGNORECASE)
    self.re_multi_strict = re.compile(
        r'(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?' +
        self.pat_page_end_strict + ')',
        re.DOTALL)
    self.re_previous_1 = re.compile(
        r'(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|'
        r'[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|'
        r'[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?' +
        self.pat_page_end + ')',
        re.DOTALL)
    self.re_previous_2 = re.compile(
        r'^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|'
        r'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|'
        r'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])',
        re.DOTALL)

    self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)