def test_latex_to_unicode(self):
    """textutils - latex_to_unicode"""
    # Accented letters are compared on their UTF-8 encoded form.
    encoded = translate_latex2unicode("\\'a \\'i \\'U").encode('utf-8')
    self.assertEqual(encoded, "á í Ú")

    # The remaining cases compare the unicode result directly:
    # (LaTeX input, expected unicode output).
    expectations = (
        ("\\'N \\k{i}", u'\u0143 \u012f'),
        ("\\AAkeson", u'\u212bkeson'),
        ("$\\mathsl{\\Zeta}$", u'\U0001d6e7'),
    )
    for latex, expected in expectations:
        self.assertEqual(translate_latex2unicode(latex), expected)
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Translate, in place, the subfield values of every field instance of
    ``tag`` found in ``record`` from LaTeX to the chosen encoding.

    Only subfields whose code equals ``code`` are translated; when ``code``
    is the empty string every subfield of the field is translated.

    @param record: record to modify, in BibRec style structure
    @type record: dict
    @param tag: tag of fields to modify
    @type tag: string
    @param code: restrict the translation to a given subfield code
    @type code: string
    @param encoding: character encoding for the new value. Defaults to UTF-8.
    @type encoding: string
    """
    for field in record_get_field_instances(record, tag):
        # field[0] is the list of (code, value) pairs; the running position
        # is what record_modify_subfield uses to address the subfield.
        for position, (subfield_code, subfield_value) in enumerate(field[0]):
            if code != '' and subfield_code != code:
                continue
            translated = translate_latex2unicode(subfield_value)
            record_modify_subfield(record, tag, subfield_code,
                                   translated.encode(encoding), position,
                                   field_position_global=field[4])
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Translate, in place, the subfield values of every field instance of
    ``tag`` found in ``record`` from LaTeX to the chosen encoding.

    Only subfields whose code equals ``code`` are translated; when ``code``
    is the empty string every subfield of the field is translated.

    @param record: record to modify, in BibRec style structure
    @type record: dict
    @param tag: tag of fields to modify
    @type tag: string
    @param code: restrict the translation to a given subfield code
    @type code: string
    @param encoding: character encoding for the new value. Defaults to UTF-8.
    @type encoding: string
    """
    for field in record_get_field_instances(record, tag):
        # field[0] is the list of (code, value) pairs; the running position
        # is what record_modify_subfield uses to address the subfield.
        for position, (subfield_code, subfield_value) in enumerate(field[0]):
            if code != '' and subfield_code != code:
                continue
            translated = translate_latex2unicode(subfield_value)
            record_modify_subfield(record, tag, subfield_code,
                                   translated.encode(encoding), position,
                                   field_position_global=field[4])
def process_author_name(author): """Convert author to INSPIRE form.""" #test for ALLCAPS author = author.replace('YYYY', '') if re.search(r'[A-Z][A-Z]', author): author_uplow = '' for part in author.split(' '): if part.upper() == part and not re.match(r'I[IV]+', part): part = part.title() author_uplow += ' ' + part author = author_uplow author = author.replace('Inspire', 'INSPIRE') #print 'INPUT = ', author author = author.replace(r'\.', r'xxxx') author = author.replace(r'.', '. ') author = author.replace(r'xxxx', r'\.') author = re.sub('[ ]+', ' ', author) author = re.sub(r'\\(cor|corauth|fn)ref\{\w+\}', r'', author) author = re.sub(r'\}?\\thanks\{\\?.*\}?', r'', author) author = author.replace(r'\,', r'~') author = author.replace(r'\~', r'xxxx') author = author.replace(r'~', r' ') author = author.replace(r'xxxx', r'\~') #print 'MIDWAY1 =', author author = translate_latex2unicode(author) if '\\' in author: print 'Problem with', author author = author.replace(',', ', ') author = author.replace('.', '. ') author = re.sub(r'\s+', ' ', author) author = re.sub(r'\s+$', '', author) author = re.sub(r'^\s+', '', author) #print 'MIDWAY2 =', author match_object_1 = re.match(r'^(.*\w) ([IVJr\.]{2,}$)', author) match_object_2 = re.match(ur'(.*) (\(.*\))', author) if match_object_1 or match_object_2: if match_object_1: match_object = match_object_1 elif match_object_2: match_object = match_object_2 author = author_first_last(match_object.group(1)) + ', ' + \ match_object.group(2) else: author = author_first_last(author) author = author.replace(',', ', ') author = re.sub(r'\.\s+', '.', author) author = re.sub(r'\s+', ' ', author) author = re.sub(r'\s+$', '', author) author = re.sub(r'^\s+', '', author) #author = translate_latex2unicode(author) #print 'OUTPUT =', author return author
def get_captions(list_latex, list_pdf): """ Function that takes all the caption from latex and pdf and creates a list for each case. Also it transforms all the latex simbols in unicode, removes any spaces or endline and removes Fig.<number>. Those transforms are used in the next step of matching captions. The original caption is obviously saved and put into MARCXML later. @param list_latex @param list_pdf @return caption_list_latex, caption_list_pdf, update_list_latex, update_list_pdf """ caption_list_latex = [figure.caption for figure in list_latex] caption_list_pdf = [figure.caption for figure in list_pdf] update_list_latex = [figure for figure in list_latex] update_list_pdf = [figure for figure in list_pdf] for index, caption in enumerate(caption_list_latex): # transform all latex simbols in unicode caption = translate_latex2unicode(caption) # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space caption = caption.lstrip().replace('\n', ' ') update_list_latex[index].caption = caption # skip the figure number #ur UNICODE RAW caption = re.sub(ur'^[\d]+ ', '', caption) # ignore unicode chars caption = caption.encode('ascii', 'ignore') # ignore some \\ fields caption = re.sub(r'\\[a-z]+', '', caption) # delete special chars for char in caption: if char in ' _^${}[]': caption = caption.replace(char, '') # convert again to unicode unicode(caption) caption_list_latex[index] = caption # Transformations (special characters and newlines) for index, caption in enumerate(caption_list_pdf): # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space caption = caption.lstrip().replace('\n', ' ') caption = strip_control_characters(caption) update_list_pdf[index].caption = caption # Skip Fig. number. at the beginning #ur UNICODE RAW caption = re.sub(ur'^Fig\. [\d]+\. 
', '', caption) caption = re.sub(ur'^Figure [\d]+\: ', '', caption) #encode to ascii for eliminating unicode chars caption = caption.encode('ascii', 'ignore') caption = caption.replace(' ', '') #reconvert to unicode unicode(caption) caption_list_pdf[index] = caption return (caption_list_latex, caption_list_pdf, update_list_latex, update_list_pdf)
def process_author_name(author): """Convert author to INSPIRE form.""" #test for ALLCAPS author = author.replace('YYYY', '') if re.search(r'[A-Z][A-Z]', author): author_uplow = '' for part in author.split(' '): if part.upper() == part and not re.match(r'I[IV]+', part): part = part.title() author_uplow += ' ' + part author = author_uplow author = author.replace('Inspire', 'INSPIRE') #print 'INPUT = ', author author = author.replace(r'\.', r'xxxx') author = author.replace(r'.', '. ') author = author.replace(r'xxxx', r'\.') author = re.sub('[ ]+', ' ', author) author = re.sub(r'\\(cor|corauth|fn)ref\{\w+\}', r'', author) author = re.sub(r'\}?\\thanks\{\\?.*\}?', r'', author) author = author.replace(r'\,', r'~') author = author.replace(r'\~', r'xxxx') author = author.replace(r'~', r' ') author = author.replace(r'xxxx', r'\~') #print 'MIDWAY1 =', author author = translate_latex2unicode(author) author = author.replace('\\"i', 'ï') if '\\' in author and not 'UTF8' in author: print 'Problem with', author sys.exit() author = author.replace(',', ', ') author = author.replace('.', '. ') author = re.sub(r'\s+', ' ', author) author = re.sub(r'\s+$', '', author) author = re.sub(r'^\s+', '', author) #print 'MIDWAY2 =', author match_object_1 = re.match(r'^(.*\w) ([IVJr\.]{2,}$)', author) match_object_2 = re.match(ur'(.*) (\(.*\))', author) if match_object_1 or match_object_2: if match_object_1: match_object = match_object_1 elif match_object_2: match_object = match_object_2 author = author_first_last(match_object.group(1)) + ', ' + \ match_object.group(2) else: author = author_first_last(author) author = author.replace(',', ', ') author = re.sub(r'\.\s+', '.', author) author = re.sub(r'\s+', ' ', author) author = re.sub(r'\s+$', '', author) author = re.sub(r'^\s+', '', author) #author = translate_latex2unicode(author) #print 'OUTPUT =', author return author
def test_latex_to_unicode(self):
    """textutils - latex_to_unicode"""
    # Accented letters are compared on their UTF-8 encoded form.
    encoded = translate_latex2unicode("\\'a \\'i \\'U").encode('utf-8')
    self.assertEqual(encoded, "á í Ú")

    # The remaining cases compare the unicode result directly:
    # (LaTeX input, expected unicode output).
    expectations = (
        ("\\'N \\k{i}", u'\u0143 \u012f'),
        ("\\AAkeson", u'\u212bkeson'),
        ("$\\mathsl{\\Zeta}$", u'\U0001d6e7'),
    )
    for latex, expected in expectations:
        self.assertEqual(translate_latex2unicode(latex), expected)
def create_xml(eprint=None, doi=None, author_dict=None): """Take in the author dictionary and write it out as xml.""" if eprint: search = 'find eprint ' + eprint + ' or recid ' + eprint if '/' in eprint or '.' in eprint: search = 'find eprint ' + eprint recid = perform_request_search(p=search, cc='HEP') or \ perform_request_search(p=search, cc='Fermilab') try: recid = recid[0] except IndexError: print 'Do not have eprint or recid', search return None elif doi: try: search = 'find doi ' + doi recid = perform_request_search(p=search, cc='HEP')[0] except IndexError: print 'Do not have doi', search return None record = {} record_add_field(record, '001', controlfield_value=str(recid)) tag = '100__' #EMAIL_REGEX = re.compile(r"^[\w\-\.\'\+]+@[\w\-\.]+\.\w{2,4}$") #ORCID_REGEX = re.compile(r'^0000-\d{4}-\d{4}-\d{3}[\dX]$') #INSPIRE_REGEX = re.compile(r'^INSPIRE-\d{8}$') for key in author_dict: subfields = [] author = author_dict[key][0] #print author_dict match_obj = re.search(ORCID_REGEX, author) if match_obj: orcid = match_obj.group(1) if not re.match(ORCID_REGEX, orcid): print '1 Problem with', orcid if ('j', 'ORCID:' + orcid) not in subfields: subfields.append(('j', 'ORCID:' + orcid)) author = author.replace(orcid, '') author = author.replace('[]', '') match_obj = re.search(r'(INSPIRE-\d{8})', author) if match_obj: inspire = match_obj.group(1) if not re.match(INSPIRE_REGEX, inspire): print 'Problem with', inspire subfields.append(('i', inspire)) author = author.replace(inspire, '') author = author.replace('[]', '') if u':' in author: match_obj = re.match(u'(.*):(.*)', author) author = match_obj.group(1) subfields.append(('q', match_obj.group(2))) subfields.append(('a', author)) for affiliation in author_dict[key][1]: affiliation = re.sub(r'\\affinfn{(.*)}{(.*)}', r'INFN \1 \2', affiliation) affiliation = re.sub(r'\\affuni{(.*)}{(.*)}', r'\1 University \2', affiliation) affiliation = translate_latex2unicode(affiliation) #affiliation = re.sub(r'(\w)\W*$', r'\1', 
affiliation) affiliation = re.sub(r'([\.\,]+)', r'\1 ', affiliation) affiliation = re.sub(r'\s+', ' ', affiliation) affiliation = re.sub(r'\s$', r'', affiliation) affiliation = re.sub(r'\s*also at[\:\s]*', r'', affiliation, re.IGNORECASE) affiliation = re.sub(r'\s*\\and$', r'', affiliation) if r"@" in affiliation and r"0000-" in affiliation: affiliation = affiliation.replace(';', ' ') affiliation = affiliation.replace(r'. ', r'.') email = re.search(r"(\S+\@\S+)", affiliation).group(1) orcid = re.search(r"(0000-\S+)", affiliation).group(1) if re.match(EMAIL_REGEX, email): subfields.append(('m', 'email:' + email)) else: print "Email problem:", email if re.match(ORCID_REGEX, orcid) and \ ('j', 'ORCID:' + orcid) not in subfields: subfields.append(('j', 'ORCID:' + orcid)) else: print "ORCID problem:", orcid continue elif r"@" in affiliation: affiliation = affiliation.replace(r'. ', r'.') subfields.append(('m', affiliation)) continue #elif re.match(r"^0000-0", affiliation): elif re.search(r"0000-0", affiliation): #print 'XXX', affiliation for aff in affiliation.split(): aff = re.sub(r'[^\d^\-^X]', '', aff) orcid = re.search(ORCID_REGEX, aff) if orcid: orcid = orcid.group(0) if ('j', 'ORCID:' + orcid) not in subfields: subfields.append(('j', 'ORCID:' + orcid)) affiliation = re.sub(orcid, '', affiliation) affiliation = re.sub(r'\s+$', '', affiliation) #print 'YYY', affiliation #subfields.append(('v', affiliation)) #break if not orcid: print "ORCID problem:", affiliation #Removed this to process aff that contained ORCID #continue elif re.match(r"^INSPIRE-", affiliation): subfields.append(('i', affiliation)) #Removed this to process aff that contained ORCID #continue affiliation = affiliation.replace('[]', '') if not affiliation: continue affiliation_key = re.sub(r'\W+', ' ', affiliation).upper() affiliation_key = re.sub(r'\s*(.+\S)\s*', r'\1', affiliation_key) try: for inst in AFFILIATIONS_DONE[affiliation_key]: inst = re.sub(r'^\s+', '', inst) if inst: 
subfields.append(('u', inst)) except KeyError: if False: print "AFF in: ", affiliation, "*" time1 = time.time() inspire_affiliation = get_aff(unidecode(affiliation)) if False: time2 = time.time() time_taken = time2 - time1 print "AFF out:", inspire_affiliation, \ "Time taken", time_taken for inst in inspire_affiliation: inst = re.sub(r'^\s+', '', inst) if inst: subfields.append(('u', inst)) if not TEST: AFFILIATIONS_DONE[affiliation_key] = inspire_affiliation if affiliation: subfields.append(('v', affiliation)) record_add_field(record, tag[0:3], tag[3], tag[4], \ subfields=subfields) tag = '700__' return print_rec(record)
def create_xml(eprint=None, doi=None, author_dict=None): """Take in the author dictionary and write it out as xml.""" if eprint: search = 'find eprint ' + eprint + ' or recid ' + eprint if '/' in eprint or '.' in eprint: search = 'find eprint ' + eprint recid = perform_request_search(p=search, cc='HEP') or \ perform_request_search(p=search, cc='Fermilab') try: recid = recid[0] except IndexError: print 'Do not have eprint or recid', search return None elif doi: try: search = 'find doi ' + doi recid = perform_request_search(p=search, cc='HEP')[0] except IndexError: print 'Do not have doi', search return None record = {} record_add_field(record, '001', controlfield_value=str(recid)) tag = '100__' email_regex = re.compile(r"^[\w\-\.\'\+]+@[\w\-\.]+\.\w{2,4}$") orcid_regex = re.compile(r'^0000-\d{4}-\d{4}-\d{3}[\dX]$') inspire_regex = re.compile(r'^INSPIRE-\d{8}$') for key in author_dict: subfields = [] author = author_dict[key][0] match_obj = re.search(r'(0000-\d{4}-\d{4}-\d{3}[\dX])', author) if match_obj: orcid = match_obj.group(1) if not re.match(orcid_regex, orcid): print 'Problem with', orcid subfields.append(('j', 'ORCID:' + orcid)) author = author.replace(orcid, '') author = author.replace('[]', '') match_obj = re.search(r'(INSPIRE-\d{8})', author) if match_obj: inspire = match_obj.group(1) if not re.match(inspire_regex, inspire): print 'Problem with', inspire subfields.append(('i', inspire)) author = author.replace(inspire, '') author = author.replace('[]', '') subfields.append(('a', author)) for affiliation in author_dict[key][1]: affiliation = re.sub(r'\\affinfn{(.*)}{(.*)}', r'INFN \1 \2', affiliation) affiliation = re.sub(r'\\affuni{(.*)}{(.*)}', r'\1 University \2', affiliation) affiliation = translate_latex2unicode(affiliation) #affiliation = re.sub(r'(\w)\W*$', r'\1', affiliation) affiliation = re.sub(r'([\.\,]+)', r'\1 ', affiliation) affiliation = re.sub(r'\s+', ' ', affiliation) affiliation = re.sub(r'\s$', r'', affiliation) affiliation = 
re.sub(r'\s*also at[\:\s]*', r'', affiliation, re.IGNORECASE) affiliation = re.sub(r'\s*\\and$', r'', affiliation) if r"@" in affiliation and r"0000-" in affiliation: affiliation = affiliation.replace(';', ' ') affiliation = affiliation.replace(r'. ', r'.') email = re.search(r"(\S+\@\S+)", affiliation).group(1) orcid = re.search(r"(0000-\S+)", affiliation).group(1) if re.match(email_regex, email): subfields.append(('m', 'email:' + email)) else: print "Email problem:", email if re.match(orcid_regex, orcid): subfields.append(('j', 'ORCID:' + orcid)) else: print "ORCID problem:", orcid continue elif r"@" in affiliation: affiliation = affiliation.replace(r'. ', r'.') subfields.append(('m', affiliation)) continue elif re.match(r"^0000-0", affiliation): try: orcid = re.search(r'(0000-\d{4}-\d{4}-\d{3}[\dX])', affiliation).group(1) subfields.append(('j', 'ORCID:' + orcid)) except AttributeError: print "ORCID problem:", affiliation continue elif re.match(r"^INSPIRE-", affiliation): subfields.append(('i', affiliation)) continue affiliation_key = re.sub(r'\W+', ' ', affiliation).upper() try: for inst in AFFILIATIONS_DONE[affiliation_key]: inst = re.sub(r'^\s+', '', inst) subfields.append(('u', inst)) except KeyError: if False: print "AFF in: ", affiliation, "*" time1 = time.time() inspire_affiliation = get_aff(unidecode(affiliation)) if False: time2 = time.time() time_taken = time2 - time1 print "AFF out:", inspire_affiliation, \ "Time taken", time_taken for inst in inspire_affiliation: inst = re.sub(r'^\s+', '', inst) subfields.append(('u', inst)) if not TEST: AFFILIATIONS_DONE[affiliation_key] = inspire_affiliation subfields.append(('v', affiliation)) record_add_field(record, tag[0:3], tag[3], tag[4], \ subfields=subfields) tag = '700__' return print_rec(record)
def similarity_between_caption1_and_caption2(caption1, caption2, list1, list2): """ Function that takes two lists and two captions and returns if the caption 1 is matching caption 2 @param caption1: caption from pdf source @param caption2: caption from latex source @param list1: list of pdf figures @param list2: list of latex figures @return: 0 if caption1 matches caption2, -1 else """ caption_list1 = [element.caption for element in list1] caption_list2 = [element.caption for element in list2] # Transformations (special characters and newlines) for index, caption in enumerate(caption_list1): # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space caption = caption.lstrip().replace('\n', ' ') # when encounter at the beginning the regular expresion: Fig. number. => skip #ur UNICODE RAW caption = re.sub(ur'^Fig\. [\d]+\. ', '', caption) caption_list1[index] = caption print caption_list1 for index, caption in enumerate(caption_list2): # transform all latex simbols in unicode caption = translate_latex2unicode(caption) # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space caption = caption.lstrip().replace('\n', ' ') # when encounter at the beginning the regular expresion: number => skip #ur UNICODE RAW caption = re.sub(ur'^[\d]+ ', '', caption) caption_list2[index] = caption print caption_list2 # Long common subsequence # Levenshtein distance dictionary = {} dictionary2 = {} dictionary3 = {} for i in range(len(caption_list1)): distances = [] lcss = [] for j in range(len(caption_list2)): distance = levenshtein(caption_list1[i], caption_list2[j]) distances.append(distance) X = caption_list1[i] Y = caption_list2[j] m = len(X) n = len(Y) C = LCS(X, Y) lcs = backTrack(C, X, Y, m, n) lcss.append(lcs) print distances max_distance = 0 index_max_elem_list = [] for k in range(len(lcss)): if len(lcss[k]) > max_distance: max_distance = len(lcss[k]) index_max = k 
index_max_elem_list.append(index_max) # if there are equal distances n = 0 for k in range(len(lcss)): if len(lcss[k]) == max_distance: if (n != 0): index_max_elem_list.append(k) n = 1 min_distance = 100000 index_min_elem_list = [] index_min_direct_comparison = [] for k in range(len(distances)): if distances[k] == 0: index_min_direct_comparison.append(k) if distances[k] < min_distance: min_distance = distances[k] index_min = k index_min_elem_list.append(index_min) # if there are equal distances n = 0 for k in range(len(distances)): if distances[k] == min_distance: if (n != 0): index_min_elem_list.append(k) n = 1 dictionary[i] = index_min_elem_list print dictionary dictionary2[i] = index_max_elem_list print dictionary2 dictionary3[i] = index_min_direct_comparison print dictionary3 for caption in caption_list1: if caption == caption1: index_caption1 = caption_list1.index(caption) for caption in caption_list2: if caption == caption2: index_caption2 = caption_list2.index(caption) if index_caption2 in dictionary[index_caption1]: if index_caption2 in dictionary2[index_caption1]: # if index_caption2 in dictionary3[index_caption1] return 0 return -1
def similarity(list_latex, list_pdf): """ The function that takes two lists of figures and detects the matches between them @param list_latex: the list of latex figures @param list_pdf: the list of pdf figures @return: the matching tuples """ caption_list_latex = [figure.caption for figure in list_latex] caption_list_pdf = [figure.caption for figure in list_pdf] for index, caption in enumerate(caption_list_latex): # transform all latex simbols in unicode caption = translate_latex2unicode(caption) # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space caption = caption.lstrip().replace('\n', ' ') list_latex[index].caption = caption # when encounter at the beginning the regular expresion: number => skip #ur UNICODE RAW caption = re.sub(ur'^[\d]+ ', '', caption) caption_list_latex[index] = caption # Transformations (special characters and newlines) for index, caption in enumerate(caption_list_pdf): # remove all spaces and new lines and tabs from the beginning, then if a line contains \n, replace with space caption = caption.lstrip().replace('\n', ' ') caption = strip_control_characters(caption) list_pdf[index].caption = caption # when encounter at the beginning the regular expresion: Fig. number. => skip #ur UNICODE RAW caption = re.sub(ur'^Fig\. [\d]+\. 
', '', caption) caption = re.sub(ur'^Figure [\d]+\: ', '', caption) # used for levenshtein distance dictionary = {} # used for longest common subsequence dictionary2 = {} # used for direct comparison dictionary3 = {} for i in range(len(caption_list_latex)): distances = [] lcss = [] for j in range(len(caption_list_pdf)): distance = levenshtein(caption_list_latex[i], caption_list_pdf[j]) distances.append(distance) X = caption_list_latex[i] Y = caption_list_pdf[j] m = len(X) n = len(Y) C = LCS(X, Y) lcs = iterative(C, X, Y, m, n) lcss.append(lcs) max_distance = 0 index_max = 0 # the list we use in representing the longest common subsequence index_max_elem_list = [] for k in range(len(lcss)): if len(lcss[k]) > max_distance: max_distance = len(lcss[k]) index_max = k index_max_elem_list.append(index_max) # if there are equal distances n = 0 for k in range(len(lcss)): if len(lcss[k]) == max_distance: if (n != 0): index_max_elem_list.append(k) n = 1 min_distance = 100000 # the list we use in representing the levenshtein distance index_min_elem_list = [] # the list for direct comparison index_min_direct_comparison = [] for k in range(len(distances)): if distances[k] == 0: index_min_direct_comparison.append(k) if distances[k] < min_distance: min_distance = distances[k] index_min = k if len(distances) != 0: index_min_elem_list.append(index_min) # if there are equal distances n = 0 for k in range(len(distances)): if distances[k] == min_distance: if (n != 0): index_min_elem_list.append(k) n = 1 dictionary[i] = index_min_elem_list print dictionary dictionary2[i] = index_max_elem_list print dictionary2 dictionary3[i] = index_min_direct_comparison print dictionary3 tuples = [] for i in range(len(dictionary)): for j in range(len(dictionary.values()[i])): if (dictionary.values()[i] == dictionary2.values()[i]): # if dictionary.values()[i] == dictionary3.values()[i] a_tuple = i, dictionary.values()[i][j] tuples.append(a_tuple) return tuples