示例#1
0
def get_fields(strng, strict=False):
    """
    Return the list of (field, value) pairs found in the body of a BibTeX entry.

    Parameters:
      strng:  text of the form ``name = value, name = value, ...``. A value may
              be wrapped in braces, wrapped in double quotes, or be bare (a
              number, a string macro, or ``#``-joined pieces).
      strict: when True, only names present in ``helper.bibtexfields`` are kept.

    Returns:
      A list of ``(name, data)`` tuples in input order. Names are lower-cased.
      Bare non-numeric values are wrapped as ``'definitionofstring(<macro>) '``.

    Note: ``helper.match_pair(ss)`` is assumed to return ``(start, end)`` with
    ``end`` one past the closing brace; this is inferred from how the result is
    sliced below and should be confirmed against the helper.
    """
    ss = strng.strip()

    # Guarantee that every value, including the last one, is followed by a comma.
    if not ss.endswith(','):
        ss += ','

    fields = []

    while True:
        name, sep, ss = ss.partition('=')
        if sep == '':  # No more '=': we reached the end of the string
            break

        name = name.strip().lower()
        words = name.split()
        if len(words) > 1:  # Help recover from errors: a name is a single word
            name = words[-1]
        ss = ss.strip()

        if ss.startswith('{'):  # The value is surrounded by '{}'
            s, e = helper.match_pair(ss)
            data = ss[s + 1:e - 1].strip()
            end = e
        elif ss.startswith('"'):  # The value is surrounded by '"'
            e = ss.find('"', 1)
            if e == -1:
                # Unterminated quote: the value runs up to the final comma.
                e = len(ss) - 1
            data = ss[1:e].strip()
            end = e + 1
        else:  # A number, a string macro, or '#'-joined pieces
            e = ss.find(',')
            if e == -1:
                e = len(ss)
            data = ss[:e].strip()
            if not data.isdigit():
                pieces = [
                    piece.strip().replace('{', '"').replace('}', '"')
                    for piece in data.split('#')
                ]
                if len(pieces) > 1:
                    # Quoted pieces are kept; anything else is a macro name.
                    # startswith() also copes with an empty piece.
                    data = '#'.join(
                        piece if piece.startswith('"') else
                        'definitionofstring(%s) ' % piece for piece in pieces)
                else:
                    data = 'definitionofstring(%s) ' % data
            end = e

        # Drop the value and the comma that terminates it. Skipping explicitly
        # (instead of leaving the closing quote in place) keeps the next field
        # name clean even when no space follows the comma, e.g. 'a="x",b="y"'.
        rest = ss[end:].lstrip()
        ss = rest[1:] if rest.startswith(',') else rest

        # NOTE(JF): post-processing of the value through the helper module
        # (title capitalisation / brace removal) is intentionally disabled.
        if not strict or name in helper.bibtexfields:
            fields.append((name, data))
    return fields
示例#2
0
def parsedata(data):
    """
    Parse a string holding a BibTeX database.

    Parameters:
      data: the full text of the database.

    Returns:
      A tuple ``(strings, entries)``. ``strings`` is a dict accumulated from
      the first value returned by ``parseentry``; ``entries`` maps each
      entry's ``'_code'`` value to the entry dict returned by ``parseentry``.

    Note: ``helper.match_pair`` is assumed to return None for an unbalanced
    delimiter and otherwise a pair whose second item marks the end of the
    delimited text; this is inferred from how the result is used below.
    """
    # A '@' followed by any word and an opening brace or parenthesis.
    pub_rex = re.compile(r'\s?@(\w*)\s*[{\(]')

    # Collapse every run of whitespace so the text is one long line.
    ss = re.sub(r'\s+', ' ', data).strip()

    strings = {}
    entries = {}

    while True:
        m = pub_rex.search(ss)
        if m is None:
            break

        opener = m.end() - 1  # Position of the opening '{' or '('
        if m.group(0)[-1] == '(':
            d = helper.match_pair(ss, pair=('[(]', '[)]'), start=opener)
        else:
            d = helper.match_pair(ss, start=opener)

        if d is None:
            # Unbalanced delimiter: step over this '@word{' so the search
            # advances instead of finding the same match forever.
            ss = ss[m.end():]
            continue

        current = ss[m.start():d[1] - 1]  # Currently analyzed entry
        st, entry = parseentry(current)
        if st is not None:
            strings.update(st)
        if entry is not None and entry != {}:
            entries[entry['_code']] = entry
        ss = ss[d[1] + 1:].strip()

    return strings, entries
示例#3
0
def get_fields(strng, strict=False):
  """
  Return the list of (field, value) pairs found in the body of a BibTeX entry.

  Parameters:
    strng:  text of the form ``name = value, name = value, ...``. A value may
            be wrapped in braces, wrapped in double quotes, or be bare (a
            number, a string macro, or ``#``-joined pieces).
    strict: when True, only names present in ``helper.bibtexfields`` are kept.

  Returns:
    A list of ``(name, data)`` tuples in input order. Names are lower-cased.
    Bare non-numeric values are wrapped as ``'definitionofstring(<macro>) '``.

  Note: ``helper.match_pair(ss)`` is assumed to return ``(start, end)`` with
  ``end`` one past the closing brace; this is inferred from how the result is
  sliced below and should be confirmed against the helper.
  """
  ss = strng.strip()

  # Guarantee that every value, including the last one, is followed by a comma.
  if not ss.endswith(','):
    ss += ','

  fields = []

  while True:
    name, sep, ss = ss.partition('=')
    if sep == '':  # No more '=': we reached the end of the string
      break

    name = name.strip().lower()
    words = name.split()
    if len(words) > 1:  # Help recover from errors: a name is a single word
      name = words[-1]
    ss = ss.strip()

    if ss.startswith('{'):  # The value is surrounded by '{}'
      s, e = helper.match_pair(ss)
      data = ss[s + 1:e - 1].strip()
      end = e
    elif ss.startswith('"'):  # The value is surrounded by '"'
      e = ss.find('"', 1)
      if e == -1:
        # Unterminated quote: the value runs up to the final comma.
        e = len(ss) - 1
      data = ss[1:e].strip()
      end = e + 1
    else:  # A number, a string macro, or '#'-joined pieces
      e = ss.find(',')
      if e == -1:
        e = len(ss)
      data = ss[:e].strip()
      if not data.isdigit():
        pieces = [
            piece.strip().replace('{', '"').replace('}', '"')
            for piece in data.split('#')
        ]
        if len(pieces) > 1:
          # Quoted pieces are kept; anything else is a macro name.
          # startswith() also copes with an empty piece.
          data = '#'.join(
              piece if piece.startswith('"') else
              'definitionofstring(%s) ' % piece for piece in pieces)
        else:
          data = 'definitionofstring(%s) ' % data
      end = e

    # Drop the value and the comma that terminates it. Skipping explicitly
    # (instead of leaving the closing quote in place) keeps the next field
    # name clean even when no space follows the comma, e.g. 'a="x",b="y"'.
    rest = ss[end:].lstrip()
    ss = rest[1:] if rest.startswith(',') else rest

    # NOTE(JF): post-processing of the value through the helper module
    # (title capitalisation / brace removal) is intentionally disabled.
    if not strict or name in helper.bibtexfields:
      fields.append((name, data))
  return fields
示例#4
0
def parsedata(data):
  """
  Parse a string holding a BibTeX database.

  Parameters:
    data: the full text of the database.

  Returns:
    A tuple ``(strings, entries)``. ``strings`` is a dict accumulated from
    the first value returned by ``parseentry``; ``entries`` maps each
    entry's ``'_code'`` value to the entry dict returned by ``parseentry``.

  Note: ``helper.match_pair`` is assumed to return None for an unbalanced
  delimiter and otherwise a pair whose second item marks the end of the
  delimited text; this is inferred from how the result is used below.
  """
  # A '@' followed by any word and an opening brace or parenthesis.
  pub_rex = re.compile(r'\s?@(\w*)\s*[{\(]')

  # Collapse every run of whitespace so the text is one long line.
  ss = re.sub(r'\s+', ' ', data).strip()

  strings = {}
  entries = {}

  while True:
    m = pub_rex.search(ss)
    if m is None:
      break

    opener = m.end() - 1  # Position of the opening '{' or '('
    if m.group(0)[-1] == '(':
      d = helper.match_pair(ss, pair=('[(]', '[)]'), start=opener)
    else:
      d = helper.match_pair(ss, start=opener)

    if d is None:
      # Unbalanced delimiter: step over this '@word{' so the search
      # advances instead of finding the same match forever.
      ss = ss[m.end():]
      continue

    current = ss[m.start():d[1] - 1]  # Currently analyzed entry
    st, entry = parseentry(current)
    if st is not None:
      strings.update(st)
    if entry is not None and entry != {}:
      entries[entry['_code']] = entry
    ss = ss[d[1] + 1:].strip()

  return strings, entries
示例#5
0
def get_fields(strng):
    """
    Return the list of (name, value) pairs found in the body of a BibTeX entry.

    Parameters:
      strng: text of the form ``name = value, name = value, ...``. A value may
             be wrapped in braces, wrapped in double quotes, or be bare.

    Returns:
      A list of ``(name, data)`` tuples in input order. Names are lower-cased
      and both parts are stripped of surrounding whitespace. A value that
      follows an empty name is skipped.

    Note: ``match_pair(ss)`` is assumed to return ``(start, end)`` with ``end``
    one past the closing brace; this is inferred from how the result is sliced
    below and should be confirmed against that function.
    """
    braces_rex = re.compile(r'\s*[{]')
    comilla_rex = re.compile(r'\s*["]')
    fields = []

    # start : position in strng where the current field name begins
    # f     : position of the current equal sign
    # s     : position (in ss) of the opening '{' or '"'
    # e     : position (in ss) where the value ends
    start = 0
    f = strng.find('=')
    while f != -1:
        name = strng[start:f].strip().lower()
        ss = strng[f + 1:]

        if braces_rex.match(ss):
            s, e = match_pair(ss)
            data = ss[s + 1:e - 1].strip()
        elif comilla_rex.match(ss):
            s = ss.find('"')
            e = ss.find('"', s + 1)
            if e == -1:  # Unterminated quote: take the rest of the text
                e = len(ss)
            data = ss[s + 1:e].strip()
        else:
            # Bare value: everything up to the next comma. When this is the
            # last field there may be no comma, so take the rest of the text.
            e = ss.find(',')
            if e == -1:
                e = len(ss)
            data = ss[:e].strip()

        if name != '':
            fields.append((name, data))

        # The next field name starts right after the comma that follows the
        # value. Without such a comma this was the last field; stopping here
        # avoids mistaking an '=' inside the value for a new field.
        comma = ss.find(',', e)
        if comma == -1:
            break
        start = f + 1 + comma + 1
        f = strng.find('=', start)
    return fields
示例#6
0
def get_fields(strng):
    """
    Return the list of (name, value) pairs found in the body of a BibTeX entry.

    Parameters:
      strng: text of the form ``name = value, name = value, ...``. A value may
             be wrapped in braces, wrapped in double quotes, or be bare.

    Returns:
      A list of ``(name, data)`` tuples in input order. Names are lower-cased
      and both parts are stripped of surrounding whitespace. A value that
      follows an empty name is skipped.

    Note: ``match_pair(ss)`` is assumed to return ``(start, end)`` with ``end``
    one past the closing brace; this is inferred from how the result is sliced
    below and should be confirmed against that function.
    """
    braces_rex = re.compile(r'\s*[{]')
    comilla_rex = re.compile(r'\s*["]')
    fields = []

    # start : position in strng where the current field name begins
    # f     : position of the current equal sign
    # s     : position (in ss) of the opening '{' or '"'
    # e     : position (in ss) where the value ends
    start = 0
    f = strng.find('=')
    while f != -1:
        name = strng[start:f].strip().lower()
        ss = strng[f + 1:]

        if braces_rex.match(ss):
            s, e = match_pair(ss)
            data = ss[s + 1:e - 1].strip()
        elif comilla_rex.match(ss):
            s = ss.find('"')
            e = ss.find('"', s + 1)
            if e == -1:  # Unterminated quote: take the rest of the text
                e = len(ss)
            data = ss[s + 1:e].strip()
        else:
            # Bare value: everything up to the next comma. When this is the
            # last field there may be no comma, so take the rest of the text.
            e = ss.find(',')
            if e == -1:
                e = len(ss)
            data = ss[:e].strip()

        if name != '':
            fields.append((name, data))

        # The next field name starts right after the comma that follows the
        # value. Without such a comma this was the last field; stopping here
        # avoids mistaking an '=' inside the value for a new field.
        comma = ss.find(',', e)
        if comma == -1:
            break
        start = f + 1 + comma + 1
        f = strng.find('=', start)
    return fields
示例#7
0
def bibtexload(filecontents_source):
    """
    Parse the lines of a BibTeX file into string definitions and entries.

    Parameters:
      filecontents_source: an iterable of text lines.

    Returns:
      A tuple ``(strings, entries)``. ``strings`` is a list of
      ``(name, definition)`` tuples taken from ``@string`` blocks; ``entries``
      maps each entry id to a dict holding ``'type'``, ``'id'`` and every
      field returned by ``get_fields``. ``@comment`` and ``@preamble`` blocks
      are ignored.

    Note: ``match_pair(text)`` is assumed to return a falsy value when no
    balanced pair exists and otherwise ``(start, end)`` with ``end`` one past
    the closing brace; this is inferred from the offsets applied below.
    """
    space_rex = re.compile(r'\s+')
    pub_rex = re.compile(r'\W?@(\w*)\s*{')

    # Strip each line, collapse inner whitespace, and join everything into
    # one long string with a single space in front of each line.
    filecontents = ''.join(
        ' ' + space_rex.sub(' ', line.strip())
        for line in filecontents_source)

    filecontents = no_outer_parens(filecontents)
    # Character encoding: replace the LaTeX forms '{\&}' and '\&' with '&'.
    filecontents = re.sub(r'{\\&}', '&', filecontents)
    filecontents = re.sub(r'\\&', '&', filecontents)
    filecontents = filecontents.strip()

    #
    # Find entries
    #
    strings = []
    entries = {}
    start = 0
    final = len(filecontents) - 1

    while start < final:
        m = pub_rex.search(filecontents[start:])
        if not m:
            break

        entry_start = start + m.start()
        arttype = m.group(1).lower()

        d = match_pair(filecontents[entry_start:])
        if not d:
            # Unbalanced braces: step over this '@type{' rather than reusing
            # offsets left over from a previous entry.
            start += m.end()
            continue

        s = entry_start + d[0] + 1  # First character after the opening brace
        e = entry_start + d[1] - 1  # Position of the closing brace
        # current has the currently analyzed entry
        current = filecontents[s:e]

        if arttype == 'string':
            # Split on the first '=' only, so a definition may contain '='.
            name, sep, defin = current.partition('=')
            if sep:
                defin = defin.replace('"', '').replace('  ', ' ')
                strings.append((name.strip(), defin.strip()))
        elif arttype == 'comment' or arttype == 'preamble':
            pass
        else:
            p = re.match('([^,]+),', current)
            # An entry without a comma has no recognisable id; skip it.
            if p is not None:
                artid = p.group(1)
                entry = {'type': arttype, 'id': artid}
                for n, value in get_fields(current[p.end():]):
                    entry[n] = value
                entries[artid] = entry

        # Always move forward, even if the offsets turn out degenerate.
        start = max(e, entry_start + 1)

    return strings, entries
示例#8
0
def bibtexload(filecontents_source):
    """
    Parse the lines of a BibTeX file into string definitions and entries.

    Parameters:
      filecontents_source: an iterable of text lines.

    Returns:
      A tuple ``(strings, entries)``. ``strings`` is a list of
      ``(name, definition)`` tuples taken from ``@string`` blocks; ``entries``
      maps each entry id to a dict holding ``'type'``, ``'id'`` and every
      field returned by ``get_fields``. ``@comment`` and ``@preamble`` blocks
      are ignored.

    Note: ``match_pair(text)`` is assumed to return a falsy value when no
    balanced pair exists and otherwise ``(start, end)`` with ``end`` one past
    the closing brace; this is inferred from the offsets applied below.
    """
    space_rex = re.compile(r'\s+')
    pub_rex = re.compile(r'\W?@(\w*)\s*{')

    # Strip each line, collapse inner whitespace, and join everything into
    # one long string with a single space in front of each line.
    filecontents = ''.join(
        ' ' + space_rex.sub(' ', line.strip())
        for line in filecontents_source)

    filecontents = no_outer_parens(filecontents)
    # Character encoding: replace the LaTeX forms '{\&}' and '\&' with '&'.
    filecontents = re.sub(r'{\\&}', '&', filecontents)
    filecontents = re.sub(r'\\&', '&', filecontents)
    filecontents = filecontents.strip()

    #
    # Find entries
    #
    strings = []
    entries = {}
    start = 0
    final = len(filecontents) - 1

    while start < final:
        m = pub_rex.search(filecontents[start:])
        if not m:
            break

        entry_start = start + m.start()
        arttype = m.group(1).lower()

        d = match_pair(filecontents[entry_start:])
        if not d:
            # Unbalanced braces: step over this '@type{' rather than reusing
            # offsets left over from a previous entry.
            start += m.end()
            continue

        s = entry_start + d[0] + 1  # First character after the opening brace
        e = entry_start + d[1] - 1  # Position of the closing brace
        # current has the currently analyzed entry
        current = filecontents[s:e]

        if arttype == 'string':
            # Split on the first '=' only, so a definition may contain '='.
            name, sep, defin = current.partition('=')
            if sep:
                defin = defin.replace('"', '').replace('  ', ' ')
                strings.append((name.strip(), defin.strip()))
        elif arttype == 'comment' or arttype == 'preamble':
            pass
        else:
            p = re.match('([^,]+),', current)
            # An entry without a comma has no recognisable id; skip it.
            if p is not None:
                artid = p.group(1)
                entry = {'type': arttype, 'id': artid}
                for n, value in get_fields(current[p.end():]):
                    entry[n] = value
                entries[artid] = entry

        # Always move forward, even if the offsets turn out degenerate.
        start = max(e, entry_start + 1)

    return strings, entries
示例#9
0
def authors(data):
    """
    Split a BibTeX author string into a list of name-part dictionaries.

    Parameters:
      data: names separated by the word ``and``; each name is written either
            as ``First von Last`` or as ``von Last, [Jr,] First``.

    Returns:
      A list with one dict per name, in input order. Each dict holds only the
      non-empty parts among ``"first"``, ``"von"``, ``"last"`` and ``"jr"``.
      An empty name yields an empty dict.

    Words are classified by the case of their first letter: lower-case words
    (and words starting with a digit) are candidates for the "von" part.
    Tokens starting with ``{`` or a backslash are classified after skipping
    their leading brace groups via ``match_pair``.
    """
    token_rex = re.compile(
        r"(?P<caseless>[{\\][^,\s]*)|(?P<separator>,)"
        r"|(?P<word>[^\s,]+)|(?P<space>\s)")

    tokenized = []  # One token list per author
    a = []  # Tokens of the author currently being read
    sticky = (None, "")  # (kind, text) of a token whose braces are still open

    # Determine the case of each word
    for i in token_rex.finditer(data):
        text = i.group(0)

        if not sticky[0] and re.search("{", text) \
                and not match_pair(text):  # brace not closed?
            if i.group("caseless"):
                sticky = ("caseless", text)
            elif i.group("word"):
                sticky = ("word", text)
            continue
        elif sticky[0] and not match_pair(sticky[1] + text):
            # Still inside the open brace group: keep accumulating.
            sticky = (sticky[0], sticky[1] + text)
            continue

        if sticky[0]:
            match = sticky[1] + text
            token = sticky[0]
            sticky = (None, "")
        else:
            match = text
            if i.group("caseless"):
                token = "caseless"
            if i.group("word"):
                token = "word"
            if i.group("separator"):
                a.append("separator")
                token = "separator"
            if i.group("space"):
                token = "space"

        if token == "caseless":
            # Skip every leading brace group, then look at the first word
            # character that remains to decide the case.
            caseless = match
            while True:
                m = match_pair(caseless)
                if m and m[0] == 0:
                    caseless = caseless[m[1]:]
                else:
                    break
            w = re.search(r"\w", caseless)
            if len(caseless) > 0 and w:
                if w.group(0).islower() or w.group(0).isdigit():
                    a.append(("lowercase", match))
                else:
                    a.append(("uppercase", match))
            else:
                a.append(("caseless", match))

        elif token == "word":
            if match == "and":  # Author separator
                tokenized.append(a)
                a = []
            elif match[0].islower() or match[0].isdigit():
                a.append(("lowercase", match))
            else:
                a.append(("uppercase", match))

    # A brace group still open at the end of the input is dropped on purpose
    # (best effort) instead of raising.

    tokenized.append(a)

    # Determine the structure of each name
    ret = []
    for author in tokenized:
        count = author.count("separator")
        a = {"first": "", "von": "", "last": "", "jr": ""}

        if count == 0:
            # First von Last

            # first: every leading word that is not lower-case, but never
            # the final word (that one belongs to the last name)
            for index, word in enumerate(author):
                if index + 1 < len(author) and word[0] != "lowercase":
                    a["first"] += " " + word[1]
                else:
                    author = author[index:]
                    break

            # von: lower-case words up to the first upper-case (or final) word
            caseless = []
            for index, word in enumerate(author):
                if index + 1 < len(author) and word[0] != "uppercase":
                    if word[0] == "caseless":
                        caseless.append(word[1])
                    elif word[0] == "lowercase":
                        for w in caseless:
                            a["von"] += " " + w
                        caseless = []
                        a["von"] += " " + word[1]
                else:
                    author = author[index:]
                    # Stop here: continuing would slice the remaining words
                    # again and lose a multi-word last name.
                    break

            # last: pending caseless words plus everything that is left
            for word in caseless:
                a["last"] += " " + word
            for word in author:
                a["last"] += " " + word[1]

        else:
            # von Last, [jr ,] First

            # von / last: read up to the first separator
            upper = []
            for index, word in enumerate(author):
                if word == "separator":
                    author = author[index + 1:]
                    break
                # A separator is still ahead (count > 0), so index + 1 exists.
                if author[index + 1] == "separator":
                    upper.append(word[1])
                    author = author[index + 2:]
                    break
                if word[0] == "uppercase":
                    # Part of the last name unless a later word flushes it
                    # into "von".
                    upper.append(word[1])
                else:
                    for w in upper:
                        a["von"] += " " + w
                    upper = []
                    a["von"] += " " + word[1]

            # last
            for word in upper:
                a["last"] += " " + word

            # jr: only present when there are two or more separators
            if count > 1:
                for index, word in enumerate(author):
                    if word != "separator":
                        a["jr"] += " " + word[1]
                    else:
                        author = author[index + 1:]
                        break

            # first
            for word in author:
                if word != "separator":
                    a["first"] += " " + word[1]
                else:
                    a["first"] += ","

        # Keep only the non-empty parts, without the leading space.
        b = {}
        for k in a:
            if len(a[k]) > 0:
                b[k] = a[k].lstrip()

        ret.append(b)

    return ret
示例#10
0
def authors(data):
    """
    Split a BibTeX author string into a list of name-part dictionaries.

    Parameters:
      data: names separated by the word ``and``; each name is written either
            as ``First von Last`` or as ``von Last, [Jr,] First``.

    Returns:
      A list with one dict per name, in input order. Each dict holds only the
      non-empty parts among ``"first"``, ``"von"``, ``"last"`` and ``"jr"``.
      An empty name yields an empty dict.

    Words are classified by the case of their first letter: lower-case words
    (and words starting with a digit) are candidates for the "von" part.
    Tokens starting with ``{`` or a backslash are classified after skipping
    their leading brace groups via ``match_pair``.
    """
    token_rex = re.compile(
        r"(?P<caseless>[{\\][^,\s]*)|(?P<separator>,)"
        r"|(?P<word>[^\s,]+)|(?P<space>\s)")

    tokenized = []  # One token list per author
    a = []  # Tokens of the author currently being read
    sticky = (None, "")  # (kind, text) of a token whose braces are still open

    # Determine the case of each word
    for i in token_rex.finditer(data):
        text = i.group(0)

        if not sticky[0] and re.search("{", text) \
                and not match_pair(text):  # brace not closed?
            if i.group("caseless"):
                sticky = ("caseless", text)
            elif i.group("word"):
                sticky = ("word", text)
            continue
        elif sticky[0] and not match_pair(sticky[1] + text):
            # Still inside the open brace group: keep accumulating.
            sticky = (sticky[0], sticky[1] + text)
            continue

        if sticky[0]:
            match = sticky[1] + text
            token = sticky[0]
            sticky = (None, "")
        else:
            match = text
            if i.group("caseless"):
                token = "caseless"
            if i.group("word"):
                token = "word"
            if i.group("separator"):
                a.append("separator")
                token = "separator"
            if i.group("space"):
                token = "space"

        if token == "caseless":
            # Skip every leading brace group, then look at the first word
            # character that remains to decide the case.
            caseless = match
            while True:
                m = match_pair(caseless)
                if m and m[0] == 0:
                    caseless = caseless[m[1]:]
                else:
                    break
            w = re.search(r"\w", caseless)
            if len(caseless) > 0 and w:
                if w.group(0).islower() or w.group(0).isdigit():
                    a.append(("lowercase", match))
                else:
                    a.append(("uppercase", match))
            else:
                a.append(("caseless", match))

        elif token == "word":
            if match == "and":  # Author separator
                tokenized.append(a)
                a = []
            elif match[0].islower() or match[0].isdigit():
                a.append(("lowercase", match))
            else:
                a.append(("uppercase", match))

    # A brace group still open at the end of the input is dropped on purpose
    # (best effort) instead of raising.

    tokenized.append(a)

    # Determine the structure of each name
    ret = []
    for author in tokenized:
        count = author.count("separator")
        a = {"first": "", "von": "", "last": "", "jr": ""}

        if count == 0:
            # First von Last

            # first: every leading word that is not lower-case, but never
            # the final word (that one belongs to the last name)
            for index, word in enumerate(author):
                if index + 1 < len(author) and word[0] != "lowercase":
                    a["first"] += " " + word[1]
                else:
                    author = author[index:]
                    break

            # von: lower-case words up to the first upper-case (or final) word
            caseless = []
            for index, word in enumerate(author):
                if index + 1 < len(author) and word[0] != "uppercase":
                    if word[0] == "caseless":
                        caseless.append(word[1])
                    elif word[0] == "lowercase":
                        for w in caseless:
                            a["von"] += " " + w
                        caseless = []
                        a["von"] += " " + word[1]
                else:
                    author = author[index:]
                    # Stop here: continuing would slice the remaining words
                    # again and lose a multi-word last name.
                    break

            # last: pending caseless words plus everything that is left
            for word in caseless:
                a["last"] += " " + word
            for word in author:
                a["last"] += " " + word[1]

        else:
            # von Last, [jr ,] First

            # von / last: read up to the first separator
            upper = []
            for index, word in enumerate(author):
                if word == "separator":
                    author = author[index + 1:]
                    break
                # A separator is still ahead (count > 0), so index + 1 exists.
                if author[index + 1] == "separator":
                    upper.append(word[1])
                    author = author[index + 2:]
                    break
                if word[0] == "uppercase":
                    # Part of the last name unless a later word flushes it
                    # into "von".
                    upper.append(word[1])
                else:
                    for w in upper:
                        a["von"] += " " + w
                    upper = []
                    a["von"] += " " + word[1]

            # last
            for word in upper:
                a["last"] += " " + word

            # jr: only present when there are two or more separators
            if count > 1:
                for index, word in enumerate(author):
                    if word != "separator":
                        a["jr"] += " " + word[1]
                    else:
                        author = author[index + 1:]
                        break

            # first
            for word in author:
                if word != "separator":
                    a["first"] += " " + word[1]
                else:
                    a["first"] += ","

        # Keep only the non-empty parts, without the leading space.
        b = {}
        for k in a:
            if len(a[k]) > 0:
                b[k] = a[k].lstrip()

        ret.append(b)

    return ret