def __init__(self):
        """Build the pattern fragments, compiled regexes and location dictionary.

        Every regex literal is a raw string, so backslash escapes are handed
        to the ``re`` module untouched instead of relying on Python passing
        unknown string escapes through (newer interpreters warn about that).

        Raises:
            DictionaryError: if constructing or extending the location
                dictionary raises RRSDictionaryError.
        """
        # ---- pattern fragments (plain strings reused further down) ----
        self._pat_abstract_1 = '(Abstract|ABSTRACT)'
        self._pat_abstract_2 = (
            '(In this|This (paper|study|article|report)'
            '|IN THIS|THIS (PAPER|STUDY|ARTICLE|REPORT))'
        )
        # An abstract stops at the literal marker "!A_E!", at "$", or at a
        # blank line (newline, optional whitespace, newline).
        self._pat_abstract_end = r'(!A_E!|$|\n\s*\n)'
        # "Index terms" / "Keywords [and ...]" / "General term(s)" headings;
        # only the first letter of each heading is case-fixed.
        self._pat_keywords = (
            'I[nN][dD][eE][xX] ?[tT][eE][rR][mM][sS]|'
            'K[eE][yY] ?[wW][oO][rR][dD][sS](?: and [A-Za-z]*?)?|'
            'G[eE][nN][eE][rR][aA][lL] ?[tT][eE][rR][mM][sS]?'
        )
        # Month names where every letter after the stem is optional, plus an
        # optional trailing dot (lower case; used with re.I below).
        self.pat_months = (
            r'janu?a?r?y?\.?|febr?u?a?r?y?\.?|marc?h?\.?|apri?l?\.?|'
            r'may|june?\.?|july?\.?|augu?s?t?\.?|sept?e?m?b?e?r?\.?|'
            r'octo?b?e?r?\.?|nove?m?b?e?r?\.?|dece?m?b?e?r?\.?'
        )
        self.pat_junctions = \
            'the|be|a|an|anthe|of|in|at|for|from|to|into|and|or|with'
        self.pat_middle_name = (
            '(ter|Ter|van|den|der|de|di|la|van der|von|chen|'
            'van de|van den|Van|Den|Der|De|Di|La|Van der|Von|'
            'Chen|Van de|Van den|el|El)'
        )
        self.alone_name = \
            r'([eE]t[. ]*?al\.?|[Ss]ons?|[Jj]r[ .,]|[Jj]unior|[eE]tc\.?)'
        # Roman numerals I..X, each followed by a space or a dot.
        self.roman_num = (
            '(I[ .]|II[ .]|III[ .]|IV[ .]|V[ .]|VI[ .]|VII[ .]|'
            'VIII[ .]|IX[ .]|X[ .])'
        )

        # ---- compiled regular expressions ----
        self._re_abstract_1 = re.compile(
            self._pat_abstract_1 + r'\W+(.+?)' + self._pat_abstract_end,
            re.DOTALL)
        self._re_abstract_2 = re.compile(
            self._pat_abstract_2 + r'\W+(.+?)' + self._pat_abstract_end,
            re.DOTALL)
        # Keyword heading followed by its content on the next line ...
        self._re_keywords_1 = re.compile(
            '(' + self._pat_keywords + r')[-:,;. ]*?\n+(.+?)\n')
        # ... or by its content on the same line.
        self._re_keywords_2 = re.compile(
            '(' + self._pat_keywords
            + r')[-:,;. ]+(.+?)(\n|[^0-9A-Z]\.|$)')
        self._re_meta = re.compile(
            '(^.*)(authors?:|emails?:|editors?:)', re.I)
        self.re_published = re.compile(r'(^|\s)published (in|with)', re.I)
        self.re_issn = re.compile('ISSN|[0-9]{4}-[0-9]{4}')
        self.re_rep = re.compile(
            r'(^\s*[a-z]{2,} report\s*$|tech report)', re.I)
        self.re_lower_start = re.compile(r"^[a-z][^A-Z-'`]")
        self.re_vol = re.compile(r'vol(ume)?\.? *[0-9]+', re.I)
        self.re_rev = re.compile('[Rr]evision *[0-9]+')
        self.re_inproc = re.compile(
            r'in proc\.|in proceedings|proceedings of', re.I)
        self.re_no = re.compile(r'n(o|umber)\.? *[0-9]+', re.I)
        self.re_pages = re.compile(r'(pages?|p\.) [0-9]', re.I)
        self.re_etal = re.compile(r'et al\.?(\s|$)', re.I)
        self.re_copyright = re.compile(r'(^|\s)copyright(\s|$)', re.I)
        self.re_date = re.compile(
            '(' + self.pat_months + ') .*?[0-9]{4}', re.I)
        self.re_year = re.compile(r'\([0-9]{4}\)')
        self.re_num = re.compile(r'^[0-9\s%]+$', re.DOTALL)
        self.re_noletter = re.compile('^[^a-z]+$', re.IGNORECASE)
        self.re_upper_word = re.compile('[A-Z][A-Z]+')
        self.re_lower_word = re.compile('[a-z]')
        self.re_telfax = re.compile(
            '(tel|fax) +[+]?[0-9][-0-9 ()]', re.I)
        self.re_empty = re.compile(r'^\s*$', re.DOTALL)
        self.re_notitle = re.compile(
            r'\s(conference|symposium)\s', re.M | re.I)
        # NOTE(review): a pattern '[A-Z][A-Z]+( [A-Z][A-Z])* [0-9]{4}' used to
        # be compiled into self.re_title at this point, but it was always
        # overwritten by the "TITLE:" pattern assigned to the same attribute
        # below, so the dead compile was dropped. If it was meant to live
        # under a different attribute name, restore it under that name.
        self.re_mark = re.compile(
            r'(^[^\s]+ ?(/|-[0-9]) ?[^\s]+$|[0-9]+[-:][0-9]+)')
        # NOTE(review): alternation binds looser than the anchors, so this
        # matches '(^|\s)thesis', a bare 'article' anywhere, or
        # 'journal(\s|$)'. Left unchanged; confirm whether all three words
        # were meant to be delimited on both sides.
        self.re_type = re.compile(
            r'(^|\s)thesis|article|journal(\s|$)', re.I | re.DOTALL)
        self.re_organization = re.compile(
            'department of|university|school of', re.IGNORECASE)
        self.re_press = re.compile(
            '(^|[^0-9a-z])in press([^0-9a-z]|$)', re.IGNORECASE)
        self.re_domain = re.compile(r'.+\.[a-z]{2,3}$', re.IGNORECASE)
        # Text wrapped in brackets or parentheses. The "[" inside the class
        # is escaped so re does not emit its "possible nested set"
        # FutureWarning; the characters matched are the same.
        self.re_zav = re.compile(r'^[\[(].*[])]$')
        self.re_title = re.compile(
            r'^.*?T[Ii][Tt][Ll][Ee]:\s*(.+?)(\.|$)')

        # Building blocks of the author-list pattern.
        word = r"[A-Z][-A-Za-z'´`]+"          # capitalised name word
        initial = r'([A-Z]\.?-)?[A-Z]\.'       # "J." or "J.-P."
        particle = self.pat_middle_name
        # Lazily repeated run of name words / particles / initials, each
        # followed by at least one space.
        tokens = ('((' + word + '[.,]?|' + particle + '[.,]?|' + initial
                  + ')[ ]+)*?')
        last_word = '((' + word + ')[ ]*)'
        # NOTE(review): re.VERBOSE discards unescaped whitespace outside
        # character classes, so in this pattern the space after "(?<=\.)" and
        # the spaces inside multi-word pat_middle_name entries ("van der",
        # "Van de", ...) are ignored. Left unchanged because fixing it would
        # alter what is matched; confirm the intent.
        self.re_authors = re.compile(
            '^[ ,.&]*('
            + tokens
            + '('
            # Firstname von Surname
            + '((' + word + ')[.,]?[ ]+)' + tokens + last_word
            + '|'
            # Surname, F.
            + '((' + word + '[.,]?)[ ]+)' + tokens
            + '((' + particle + r'[.,]?|([A-Z]\.?-)?[A-Z][ .]?)[ ]*)+?'
            + '|'
            # F. von Surname
            + '((' + particle + '[.,]?|' + initial + ')[ ]+)+?'
            + tokens + last_word
            + ')'
            + '(([A-Z]v+|' + particle + '|' + initial + ')[ ]*)*?'
            + '(([., ]*(' + self.alone_name + '|' + self.roman_num + ')?)*'
            + r'((?<=\.) |&|[,. ]?and[,. ]|,|\.|;|$|%SEP%|\())+?'
            + '([., ]*' + self.alone_name + ')*)',
            re.VERBOSE)
        self.re_inc = re.compile(
            r'[^a-zA-Z]([A-Z]\.|' + self.pat_middle_name
            + '|' + self.alone_name + '|&)[^a-zA-Z]')
        # Line ending in "-", ":" or "," or in one of the junction words.
        self.re_end = re.compile(
            '([-:,]| (' + self.pat_junctions + r'))\s*\n', re.IGNORECASE)
        self.re_split = re.compile(
            r'^[A-Z]+( [^ a-z]+)*\n([^ a-z]+( [^ a-z]+)*\n)+')
        self.re_upper = re.compile('([A-Z][A-Z]+ ){3}')
        self.re_lower = re.compile('(^.+?) (.[^ ]?[a-z].*$)')
        self.re_by = re.compile('(^.+) by (.+?)$', re.DOTALL)
        self.re_autinline = re.compile(
            '^([A-Z][^A-Z]+( [^A-Z]+){3}.*?)([A-Z][.a-z].*)')
        self.re_word_end = re.compile(r' [^A-Z]+\s*$')
        self.re_term = re.compile(
            r'(^|\s*[:,;]+)\s*(.+?)\s*([:,;]+\s*|$)')

        # ---- dictionaries ----
        # Countries first, then cities merged into the same dictionary.
        try:
            self.rrsdict_locations = RRSDictionary(COUNTRIES, FIRST_UPPER)
            self.rrsdict_locations.extend(RRSDictionary(CITIES, FIRST_UPPER))
        except RRSDictionaryError:
            raise DictionaryError("Failed to load dictionaries.")
# Example #2
# 0
    def __init__(self):
        """Set up keyword tables, page/section patterns and the event dictionary.

        Every regex literal is a raw string, so backslash escapes are handed
        to the ``re`` module untouched instead of relying on Python passing
        unknown string escapes through (newer interpreters warn about that).
        """
        # Full month names first, then abbreviations with an optional dot.
        self.pat_month = (
            r"january|february|march|april|may|june|july|august|"
            r"september|october|november|december|jan\.?|feb\.?|"
            r"mar\.?|apr\.?|jun\.?|jul\.?|aug\.?|sept?\.?|oct\.?|"
            r"nov\.?|dec\.?"
        )
        # Month, optional day-like numbers, then a year 19xx or 20xx-23xx.
        self.pat_date = (
            "(" + self.pat_month
            + r" ([_0-9]+\.? )*)(19[0-9]{2}|2[0123][0-9]{2})"
        )

        # Each table maps a regex string to an integer weight. How the
        # weights are consumed is not visible in this method.
        self.kws_unpublished = {
            "introduction": 50,
            "abstract": 50,
            "related work": 50,
        }
        self.kws_article = {
            r"(vol\.?|volume)\s*[0-9]+": 20,
            self.pat_date: 20,
            r"(pages?|pp?\.?)\s*[-0-9]+(?!\))": 20,
            r"(number|no\.?)\s*[_0-9]+": 20,
            "copyright": 20,
            "all rights reserved": 20,
            "journal": 20,
            "is published": 20,
            "published in": 20,
            "first published": 20,
            "in press": 20,
            "introduction": 50,
            "abstract": 50,
            "related work": 50,
            # Two attributes of self are used as keys here instead of
            # strings (presumably callables - defined outside this method).
            self._find_events: 50,
            self._find_events_2: 20,
        }
        self.kws_techreport = {
            "this report": 200,
            "tech[a-z]+ report": 200,
            "summary report": 100,
            "is (a )?report": 80,
        }
        self.kws_phdthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "dissertation": 200,
            r"Ph\.?D thesis": 200,
        }
        self.kws_masterthesis = {
            "supervisor": 100,
            "this thesis": 200,
            "master thesis": 200,
            r"master\W?s thesis": 200,
        }
        # Publication-type constant -> its keyword table.
        self.types = {
            UNPUBLISHED: self.kws_unpublished,
            ARTICLE: self.kws_article,
            PHDTHESIS: self.kws_phdthesis,
            MASTERTHESIS: self.kws_masterthesis,
            TECHREPORT: self.kws_techreport,
        }

        # NOTE(review): a pattern '\W(proceedings|conference)\W' (re.DOTALL)
        # used to be compiled into self.re_proceedings at this point, but it
        # was always overwritten by the later assignment to the same
        # attribute, so the dead compile was dropped. If it was meant to live
        # under a different attribute name, restore it under that name.
        self.re_ms_powerpoint = re.compile('powerpoint', re.IGNORECASE)
        self.re_oo_impress = re.compile('impress', re.IGNORECASE)
        self.re_pages = re.compile(r'Pages:\s*([0-9]+)')

        self.pat_chapters = "R(eferences|EFERENCES)"
        self.re_chapters = re.compile(
            r'(^.*\W)(' + self.pat_chapters + r')\W', re.DOTALL)

        # ---- proceedings-article patterns ----
        self.pat_time = "[0-2]?[0-9]:[0-9][0-9]"
        # Dot leaders followed by a number and a newline.
        self.pat_toc_page = r"(\.\s*){2,}[0-9]+\n"
        # A blank line, optionally with a "-" before or after it.
        self.pat_pagesep = r"(\s*\n\n\s*|\s*-\s*\n\n|\n\n\s*-\s*)"
        # Roman numeral I..XX standing between two page separators.
        self.pat_roman_nums = (
            self.pat_pagesep
            + "(XX|XIX|XVIII|XVII|XVI|XV|XIV|XIII|XII|XI|X|IX|VIII|VII|VI|V|"
              "IV|III|II|I)"
            + self.pat_pagesep
        )
        # A number standing between two page separators.
        numbered_page = self.pat_pagesep + "[0-9]+" + self.pat_pagesep
        self.pat_page_end_strict = (
            "(" + numbered_page + r"|\npage [0-9]+\n|\.[0-9]+"
            + self.pat_pagesep + ")"
        )
        # Looser variant: a bare blank line also counts as a page end.
        self.pat_page_end = (
            "(" + numbered_page + r"|\npage [0-9]+\n|\.[0-9]+\n\n|\n\n)"
        )

        # Both prefix regexes share everything except the page-end fragment.
        prefix_head = ("(^.+((" + self.pat_time + "|" + self.pat_toc_page
                       + ").*")
        prefix_tail = "|" + self.pat_roman_nums + "))"
        prefix_flags = re.DOTALL | re.IGNORECASE
        self.re_proceedings_prefix_strict = re.compile(
            prefix_head + self.pat_page_end_strict + prefix_tail,
            prefix_flags)
        self.re_proceedings_prefix = re.compile(
            prefix_head + self.pat_page_end + prefix_tail,
            prefix_flags)
        # Everything before a line that starts with "abstract"/"references".
        self.re_abstract = re.compile(
            r"(^.+?)\n(abstract|references)(\W[^.\n]*)?\n",
            re.DOTALL | re.IGNORECASE)
        self.re_proceedings = re.compile(
            r"(^.+?\Wproceedings\W.*?\n\n)",
            re.DOTALL | re.IGNORECASE)

        # At least 5000 characters, a "References" heading line, then text
        # up to a strict page end.
        self.re_multi_strict = re.compile(
            r'(^.{5000,}?\nR[Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]\W*\n.+?'
            + self.pat_page_end_strict + ')',
            re.DOTALL)
        self.re_previous_1 = re.compile(
            r'(^\s*([A-Z]\.|[A-Z]\w+[,. ]+[A-Z]\.|'
            r'[^\n]+?[(.,]\s*[0-9]{4}\s*[.),][^\n]+?\n|'
            r'[^a-zA-Z]|[a-z]|[A-Z]\w*?\.?\n|[^\n]*\Weds\W).*?'
            + self.pat_page_end + ')',
            re.DOTALL)
        self.re_previous_2 = re.compile(
            r'^\s*(Introduction|INTRODUCTION|Appendix|APPENDIX|'
            r'Table\W+[0-9]|TABLE\W+[0-9]|Figure\W+[0-9]|'
            r'FIGURE\W+[0-9]|Section\W+[0-9]|SECTION\W+[0-9])',
            re.DOTALL)

        self.rrsdict_events = RRSDictionary(EVENT_ACRONYMS, CASE_SENSITIVE)