def is_valid_bibtex(reference):
    """
    Check with pybtex whether a reference parses as BibTeX.

    Args:
        reference: A String reference in BibTeX format.

    Returns:
        Boolean, True when parsing the reference yields at least one entry.
    """
    # pybtex seems to have an issue with unicode, so every non-ASCII
    # character is dropped before the text is handed to the parser.
    ascii_only = remove_non_ascii(reference)
    stream = io.StringIO(ascii_only)
    parsed = bibtex.Parser().parse_stream(stream)
    return len(parsed.entries) > 0
def is_valid_bibtex(reference):
    """
    Use pybtex to validate that a reference is in proper BibTeX format

    Args:
        reference: A String reference in BibTeX format.

    Returns:
        Boolean indicating if reference is valid bibtex (True when the
        parsed data contains at least one entry).
    """
    # cStringIO only exists on Python 2. Import it locally and fall back to
    # io.StringIO so the function also works on Python 3, while Python 2
    # keeps using exactly the same buffer type as before.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO

    # pybtex seems to have an issue with unicode, so all non-ASCII
    # characters are removed before parsing.
    sio = StringIO(remove_non_ascii(reference))
    parser = bibtex.Parser()
    bib_data = parser.parse_stream(sio)
    return len(bib_data.entries) > 0
def _clean_cif(s): """ Removes non-ASCII and some unsupported _cgraph fields from the cif string """ clean = [] lines = s.split("\n") skip = False while len(lines) > 0: l = lines.pop(0) if skip: if l.strip().startswith("_") or l.strip() == "loop_": skip = False else: continue if l.strip().startswith("_cgraph"): skip = True elif not l.strip().startswith("_eof"): clean.append(remove_non_ascii(l)) return "\n".join(clean)
def _process_string(cls, string):
    """
    Break a .cif string into a stream of tokens.

    Comments, empty lines and non-ASCII characters are removed first.

    Args:
        string: The text to tokenize.

    Returns:
        A deque of tuples. Tokens found on ordinary lines are the 3-tuples
        produced by the regex (unquoted, single-quoted, double-quoted
        group; exactly one is non-empty unless the token is an empty
        quoted string). Multiline strings (between semicolons) are
        appended as 4-tuples ('', '', '', text) with their lines joined by
        single spaces.
    """
    # Raw strings keep "\s" a regex escape instead of an invalid string
    # escape, which newer Python versions warn about.
    # remove comments
    string = re.sub(r"(\s|^)#.*$", "", string, flags=re.MULTILINE)
    # remove empty lines
    string = re.sub(r"^\s*\n", "", string, flags=re.MULTILINE)
    # remove non_ascii
    string = remove_non_ascii(string)

    # since line breaks in .cif files are mostly meaningless,
    # break up into a stream of tokens to parse, rejoining multiline
    # strings (between semicolons)
    q = deque()
    multiline = False
    ml = []
    # this regex splits on spaces, except when in quotes.
    # starting quotes must not be preceded by non-whitespace
    # (these get eaten by the first expression)
    # ending quotes must not be followed by non-whitespace
    p = re.compile(r'''([^'"\s][\S]*)|'(.*?)'(?!\S)|"(.*?)"(?!\S)''')
    for line in string.splitlines():
        if multiline:
            if line.startswith(";"):
                # closing semicolon: emit the collected text, then fall
                # through so the rest of this line is tokenized as usual
                multiline = False
                q.append(('', '', '', ' '.join(ml)))
                ml = []
                line = line[1:].strip()
            else:
                ml.append(line)
                continue
        if line.startswith(";"):
            multiline = True
            ml.append(line[1:].strip())
        else:
            for s in p.findall(line):
                # s is tuple. location of the data in the tuple
                # depends on whether it was quoted in the input
                q.append(s)
    return q
def _process_string(cls, string):
    """
    Tokenize the contents of a .cif file.

    The text is stripped of comments, empty lines and non-ASCII
    characters, then split into tokens line by line.

    Args:
        string: The text to tokenize.

    Returns:
        A deque of tuples: 3-tuples from the tokenizing regex for ordinary
        tokens (the filled position depends on whether the token was
        unquoted, single-quoted or double-quoted) and 4-tuples
        ('', '', '', text) for multiline strings delimited by semicolons.
    """
    # Patterns are raw strings so that "\s" is not parsed as an (invalid)
    # string escape sequence; the regex text itself is unchanged.
    # remove comments
    string = re.sub(r"(\s|^)#.*$", "", string, flags=re.MULTILINE)
    # remove empty lines
    string = re.sub(r"^\s*\n", "", string, flags=re.MULTILINE)
    # remove non_ascii
    string = remove_non_ascii(string)

    # since line breaks in .cif files are mostly meaningless,
    # break up into a stream of tokens to parse, rejoining multiline
    # strings (between semicolons)
    q = deque()
    multiline = False
    ml = []
    # this regex splits on spaces, except when in quotes.
    # starting quotes must not be preceded by non-whitespace
    # (these get eaten by the first expression)
    # ending quotes must not be followed by non-whitespace
    p = re.compile(r'''([^'"\s][\S]*)|'(.*?)'(?!\S)|"(.*?)"(?!\S)''')
    for line in string.splitlines():
        if multiline:
            if not line.startswith(";"):
                # still inside the semicolon-delimited string
                ml.append(line)
                continue
            # closing semicolon: flush the collected lines as one token and
            # keep processing whatever follows the semicolon on this line
            multiline = False
            q.append(('', '', '', ' '.join(ml)))
            ml = []
            line = line[1:].strip()
        if line.startswith(";"):
            multiline = True
            ml.append(line[1:].strip())
        else:
            for s in p.findall(line):
                # s is tuple. location of the data in the tuple
                # depends on whether it was quoted in the input
                q.append(s)
    return q
def _process_string(cls, string):
    """
    Break a .cif string into a stream of string tokens.

    Everything from a "#" to the end of its line, empty lines, leading
    whitespace and non-ASCII characters are removed before tokenizing.

    Args:
        string: The text to tokenize.

    Returns:
        A deque of strings. Quoted tokens are returned without their
        surrounding quotes; multiline strings (between semicolons) are
        returned as one token with their lines joined by single spaces.
    """
    # Patterns are raw strings so that "\s" is not parsed as an (invalid)
    # string escape sequence; the regex text itself is unchanged.
    # remove comments
    string = re.sub(r"#.*", "", string)
    # remove empty lines
    string = re.sub(r"^\s*\n", "", string, flags=re.MULTILINE)
    # remove whitespaces at beginning of lines
    string = re.sub(r"^\s*", "", string, flags=re.MULTILINE)
    # remove non_ascii
    string = remove_non_ascii(string)

    # since line breaks in .cif files are mostly meaningless,
    # break up into a stream of tokens to parse, rejoining multiline
    # strings (between semicolons)
    q = deque()
    multiline = False
    ml = []
    # this regex splits on spaces, except when in quotes.
    # it also ignores single quotes when surrounded by non-whitespace
    # since they are sometimes used in author names
    p = re.compile(r'''([^'"\s]+)|'((?:\S'\S|[^'])*)'|"([^"]*)"''')
    for line in string.splitlines():
        if multiline:
            if line.startswith(";"):
                # closing semicolon: emit the collected text, then fall
                # through so the rest of this line is tokenized as usual
                multiline = False
                q.append(" ".join(ml))
                ml = []
                line = line[1:].strip()
            else:
                ml.append(line)
                continue
        if line.startswith(";"):
            multiline = True
            ml.append(line[1:].strip())
        else:
            for s in p.findall(line):
                # only one group of the match tuple is filled; joining
                # collapses the tuple to that single token string
                q.append(''.join(s))
    return q
def test_remove_non_ascii(self):
    """
    Build a string of 10 random characters with code points 0-127 followed
    by 10 with code points 128-150, and check that remove_non_ascii leaves
    exactly 10 characters.
    """
    ascii_chars = [chr(random.randint(0, 127)) for _ in range(10)]
    non_ascii_chars = [chr(random.randint(128, 150)) for _ in range(10)]
    mixed = "".join(ascii_chars) + "".join(non_ascii_chars)
    self.assertEqual(len(remove_non_ascii(mixed)), 10)