Пример #1
0
    def from_string(cls, string):
        """Build an instance from a personal name given as a single string.

        Four layouts are tried in order and the first one that matches wins:

        * ``Last, First Middle``  (Auden, Wystan Hugh)
        * ``Last, First``         (Auden, Wystan)
        * ``First Middle Last``   (Wystan Hugh Auden)
        * ``First Last``          (Wystan Auden)

        NOTE(review): the method takes ``cls``, so it is presumably decorated
        with ``@classmethod`` outside this view -- confirm.

        :param string: the name to parse.
        :returns: ``cls(first, middle, last)``; ``middle`` is ``None`` when
            the matching layout has no ``middle`` group.
        :raises ValueError: if none of the layouts matches ``string``.
        """
        # `named`, `some` and `end` come from outside this method; they are
        # presumably regex-fragment helpers (named group / "one or more" /
        # end-of-string anchor) -- confirm against their definitions.
        first = named('first', some)
        middle = named('middle', some)
        last = named('last', some)

        # Raw strings: '\s' in a normal literal is an invalid escape sequence.
        gap = r'\s+'
        comma = r',\s+'

        # The longer layout of each family comes before its shorter variant.
        patterns = [
            last + comma + first + gap + middle + end,  # Auden, Wystan Hugh
            last + comma + first + end,  # Auden, Wystan
            first + gap + middle + gap + last + end,  # Wystan Hugh Auden
            first + gap + last + end,  # Wystan Auden
        ]
        for pattern in patterns:
            match = re.match(pattern, string)
            if match:
                groups = match.groupdict()
                # Layouts without a middle name have no 'middle' group at all,
                # hence .get() rather than indexing.
                return cls(groups['first'], groups.get('middle'), groups['last'])

        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working. The 1-tuple keeps %-formatting safe even if
        # a tuple is passed in by mistake.
        raise ValueError('Cannot parse name: "%s"' % (string,))
Пример #2
0
import re
import requests
from xdoc.dom import Author, Reference
from xdoc.lib.regex import named, maybe, anything, some, s, sep, end
from xdoc.formats.tex.bibliography import parse_bibtex
from unidecode import unidecode

from xdoc.lib.log import logging
logger = logging.getLogger(__name__)


# Regex fragments used to assemble the bibliography-entry patterns below.
# `named` is imported from xdoc.lib.regex; presumably it wraps a pattern in a
# (?P<name>...) group -- confirm against its definition.
#
# Earlier drafts, kept for reference:
# \((\d{4}\w?,?)+\)/
# re_authors = r'(?P<authors>.+?)\s*'
# re_authors_editors = r'(?P<authors>.+?)\s*(?P<editor>\(ed(itor)?s?\.?\)\s+)?\s*'

# Editor name(s) followed by a marker such as "(ed)", "(ed.)", "(eds.)",
# "(editor)" or "(editors)".
re_editors = r'(?P<editor>.+?)\s*\(ed(itor)?s?\.?\)\s*'
# Four-digit year plus an optional single word character (e.g. "1972a").
re_year = named('year', r'\d{4}') + named('subyear', r'\w?')
# Title running up to the first period.
re_title = r'(?P<title>[^.]+)\.\s*'
# Lazily matched title, optionally terminated by a period or a comma.
re_title_i = r'(?P<title>.+?)[.,]?\s*'
# Page range separated by "-", "--" or \u2013.
# \u2013 is the en dash (U+2013); the em dash is U+2014 and is not matched.
# Written as a plain unicode literal with doubled backslashes instead of the
# Python-2-only ur'' prefix: the resulting string is identical on Python 2
# and the line also parses on Python 3.
re_page = u'(?P<page_begin>\\d+)(-|--|\u2013)(?P<page_end>\\d+)'
# :?\s*' + re_page + '
# Volume number with an optional "Volume " prefix and optional decimal part.
re_vol = r'(Volume\s+)?(?P<volume>\d+(\.\d+)?)'
# Edition number in parentheses, e.g. "(2)".
re_edition = r'\((?P<edition>\d+)\)'
# Publisher (text up to the first comma), then either a closing "." / ","
# or ", <address>" with an optional trailing "." / ",".
re_pub_address = r'(?P<publisher>[^,]+)([.,]|, (?P<address>.*[^.])[.,]?)\s*'
# Optional dx.doi.org URL; the DOI is everything up to the next whitespace.
# NOTE(review): the dots in the host name are unescaped, so they match any
# character -- harmless in practice, but consider escaping them.
re_doi = r'(http://dx.doi.org/(?P<doi>\S+))?'


media_regex = [{
    # Horn, Larry. 1972. On the semantic properties of logical operators in English: UCLA dissertation.
    'medium': 'phdthesis',
    'pattern':