def get_reference_number(tarball, refno_url):
    """
    Attempts to determine the reference number of the file by searching.

    The file name of the tarball is turned into a report number and looked
    up in field 037__a of the repository at refno_url. The first hit of
    the search is returned.

    @param: tarball (string): the name of the tarball as downloaded from
        arXiv
    @param: refno_url (string): url of repository to check for a
        reference number for this record. If not set; returns None

    @return: refno (string): the first search result converted to a
        string, or None when nothing was found
    """
    if not refno_url:
        return None

    server = InvenioConnector(refno_url)
    # we just need the name of the file
    tarball = os.path.split(tarball)[1]
    prefix = '037__a:'

    # the name right now looks like arXiv:hep-ph_9703009
    # or arXiv:0910.0476
    if tarball.startswith(ARXIV_HEADER):
        if '_' in tarball:
            # old-style identifier: drop the header and restore the slash
            # NOTE(review): assumes ARXIV_HEADER contains the ':' that is
            # split on here -- confirm against its definition
            tarball = tarball.split(':')[1]
            arxiv_record = tarball.replace('_', '/')
        else:
            arxiv_record = tarball

        result = server.search(p=prefix + arxiv_record, of='id')
        if not result:
            return None
        return str(result[0])

    # No arXiv header: look for something shaped like an identifier, first
    # in the raw name and then with underscores turned back into slashes.
    # group(1) is the whole identifier; the inner groups only mark which
    # of the two alternatives matched.
    identifier_pattern = r'(([a-zA-Z\-]+/\d+)|(\d+\.\d+))'
    for candidate in (tarball, tarball.replace('_', '/')):
        found = re.search(identifier_pattern, candidate)
        if found is None:
            continue
        result = server.search(p=prefix + found.group(1), of='id')
        if result:
            return str(result[0])

    return None
def main():
    """
    Command line entry point: fix one MARCXML record and print it.

    The record comes either from the single file given as argument or,
    with --recid/-r, from a search for that record id on the site given
    with --site/-s (default http://inspireheptest.cern.ch/). The record
    is passed through fix_authors and fix_title and its XML is printed.
    Exits with status 2 on bad arguments or when the record is not found.
    """
    usage = """
    save to file:
        python fix_marc_record.py marc_file.xml >> result_file.xml
    print to terminal:
        python fix_marc_record.py marc_file.xml

    options:
        --recid -r
        fix the record with the given record id from https://inspireheptest.cern.ch
            e.g. python fix_marc_record.py --recid=1291107

        --site -s
        specify a different site useful only when option --recid or -r enabled
            e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # A real list, not map(): a map iterator would be exhausted by the
        # first membership test on Python 3 and hide a later --recid.
        options = [opt[0] for opt in opts]
        use_recid = '-r' in options or '--recid' in options
        if len(args) > 1:
            raise getopt.GetoptError("Too many arguments given!!!")
        elif not args and not use_recid:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if use_recid:
        # imported lazily so the file mode works without Invenio
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString

        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ['-s', '--site']:
                site = a
            if o in ['-r', '--recid']:
                recid = a

        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p='001:%s' % recid, of='xm')
        marcxml = parseString(record)
        try:
            marcxml = marcxml.getElementsByTagName('record')[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)
    else:
        filename = args[0]
        marcxml = parse(filename)

    marcxml = fix_authors(marcxml)
    marcxml = fix_title(marcxml)
    print(marcxml.toxml())
def test_search_local_restricted_collections(self):
    """InvenioConnector - local restricted collection search

    An unauthenticated search in the "Theses" collection must raise
    InvenioConnectorAuthError; the same search through the secure URL
    with credentials must return at least one result.
    """
    # without credentials the restricted collection is refused
    anonymous = InvenioConnector(CFG_SITE_URL)
    restricted_query = {"p": "LBL-28106", "c": ["Theses"], "of": "id"}
    self.assertRaises(InvenioConnectorAuthError,
                      anonymous.search, **restricted_query)

    # with credentials the search goes through
    authenticated = InvenioConnector(CFG_SITE_SECURE_URL,
                                     user="******", password="")
    hits = authenticated.search(p="LBL-28106", c=["Theses"], of="id")
    failure_note = "did not get restricted collection search results."
    self.assertTrue(len(hits) > 0, failure_note)
def test_search_remote_restricted_collections(self):
    """InvenioConnector - remote restricted collection search

    An unauthenticated search in the "Theses" collection of the demo site
    must raise InvenioConnectorAuthError; the same search over https with
    credentials must return at least one result.
    """
    # without credentials the restricted collection is refused
    anonymous = InvenioConnector("http://invenio-demo.cern.ch")
    restricted_query = {"p": "LBL-28106", "c": ["Theses"], "of": "id"}
    self.assertRaises(InvenioConnectorAuthError,
                      anonymous.search, **restricted_query)

    # with credentials the search goes through
    authenticated = InvenioConnector("https://invenio-demo.cern.ch",
                                     user="******", password="******")
    hits = authenticated.search(p="LBL-28106", c=["Theses"], of="id")
    failure_note = "did not get restricted collection search results."
    self.assertTrue(len(hits) > 0, failure_note)
def test_search_local_restricted_collections(self):
    """InvenioConnector - local restricted collection search

    An unauthenticated search in the 'Theses' collection must raise
    InvenioConnectorAuthError; the same search through the secure URL
    with credentials must return at least one result.
    """
    # without credentials the restricted collection is refused
    anonymous = InvenioConnector(CFG_SITE_URL)
    restricted_query = {'p': 'LBL-28106', 'c': ['Theses'], 'of': 'id'}
    self.assertRaises(InvenioConnectorAuthError,
                      anonymous.search, **restricted_query)

    # with credentials the search goes through
    authenticated = InvenioConnector(CFG_SITE_SECURE_URL,
                                     user='******', password='')
    hits = authenticated.search(p='LBL-28106', c=['Theses'], of='id')
    failure_note = 'did not get restricted collection search results.'
    self.assertTrue(len(hits) > 0, failure_note)
def test_search_remote_restricted_collections(self):
    """InvenioConnector - remote restricted collection search

    An unauthenticated search in the 'Theses' collection of the demo site
    must raise InvenioConnectorAuthError; the same search over https with
    credentials must return at least one result.
    """
    # without credentials the restricted collection is refused
    anonymous = InvenioConnector("http://invenio-demo.cern.ch")
    restricted_query = {'p': 'LBL-28106', 'c': ['Theses'], 'of': 'id'}
    self.assertRaises(InvenioConnectorAuthError,
                      anonymous.search, **restricted_query)

    # with credentials the search goes through
    authenticated = InvenioConnector("https://invenio-demo.cern.ch",
                                     user='******', password='******')
    hits = authenticated.search(p='LBL-28106', c=['Theses'], of='id')
    failure_note = 'did not get restricted collection search results.'
    self.assertTrue(len(hits) > 0, failure_note)
def get_remote_ids(search_terms, collection=''):
    """
    Search the remote instance at REMOTE_URL and return the record IDs
    found there.

    Parameters:
     (string) search_terms - what to search for remotely
     (string) collection - passed as the ``cc`` argument of the remote
                           search (defaults to '')

    Returns:
     The result of the remote search run with of='id' (a list of RecIDs)
    """
    connector = InvenioConnector(REMOTE_URL)
    _print("Getting records from: %s" % REMOTE_URL)

    found_ids = connector.search(p=search_terms, cc=collection, of='id')

    # report how many records the remote search produced
    summary_values = (len(found_ids), REMOTE_INSTANCE, search_terms, collection)
    _print("Found %d records on %s for search terms '%s' in collection '%s'"
           % summary_values)
    return found_ids
def get_remote_ids(search_terms, collection=''):
    """
    Search the remote instance at REMOTE_URL and return the record IDs
    found there.

    Parameters:
     (string) search_terms - what to search for remotely
     (string) collection - passed as the ``cc`` argument of the remote
                           search (defaults to '')

    Returns:
     The result of the remote search run with of='id' (a list of RecIDs)
    """
    connector = InvenioConnector(REMOTE_URL)
    _print("Getting records from: %s" % REMOTE_URL)

    found_ids = connector.search(p=search_terms, cc=collection, of='id')

    # report how many records the remote search produced
    summary_values = (len(found_ids), REMOTE_INSTANCE, search_terms, collection)
    _print("Found %d records on %s for search terms '%s' in collection '%s'"
           % summary_values)
    return found_ids
def main():
    """
    Command line entry point: fix MARCXML records and write them to stdout.

    With --recid/-r a single record is fetched from the site given with
    --site/-s (default http://inspireheptest.cern.ch/). Otherwise every
    file given as argument is processed and the output is wrapped in
    <collection> ... </collection>; a file that fails is reported on
    stderr and skipped. Each record goes through fix_authors, fix_title
    and fix_fft. Exits with status 2 on bad arguments or when the
    requested record is not found.
    """
    usage = """
    save to file:
        python fix_marc_record.py marc_file*.xml >> result_file.xml
    print to terminal:
        python fix_marc_record.py marc_file*.xml

    options:
        --recid -r
        fix the record with the given record id from https://inspireheptest.cern.ch
            e.g. python fix_marc_record.py --recid=1291107

        --site -s
        specify a different site useful only when option --recid or -r enabled
            e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # A real list, not map(): a map iterator would be exhausted by the
        # first membership test on Python 3 and hide a later --recid.
        options = [opt[0] for opt in opts]
        use_recid = "-r" in options or "--recid" in options
        if not args and not use_recid:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if use_recid:
        # imported lazily so the file mode works without Invenio
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString

        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ["-s", "--site"]:
                site = a
            if o in ["-r", "--recid"]:
                recid = a

        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p="001:%s" % recid, of="xm")
        marcxml = parseString(record)
        try:
            marcxml = marcxml.getElementsByTagName("record")[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)

        marcxml = fix_authors(marcxml)
        marcxml = fix_title(marcxml)
        marcxml = fix_fft(marcxml)
        # NOTE(review): writes an encoded byte string to sys.stdout, which
        # is Python 2 behaviour -- revisit if this script moves to Python 3
        sys.stdout.write(marcxml.toxml().encode("utf8"))
    else:
        print("<collection>")
        for filename in args:
            # best effort: one broken file must not stop the whole batch
            try:
                strip_bom(filename)
                marcxml = parse(filename)
                marcxml = fix_authors(marcxml)
                marcxml = fix_title(marcxml)
                marcxml = fix_fft(marcxml)
                sys.stdout.write(marcxml.toxml().encode("utf8"))
            except Exception as err:
                print("ERROR with file %s: %s. Skipping file...."
                      % (filename, err), file=sys.stderr)
        print("</collection>")
def main():
    """
    Command line entry point: fix MARCXML records and write them to stdout.

    With --recid/-r a single record is fetched from the site given with
    --site/-s (default http://inspireheptest.cern.ch/). Otherwise every
    file given as argument is processed and the output is wrapped in
    <collection> ... </collection>; a file that fails is reported on
    stderr and skipped. Each record goes through fix_authors, fix_title
    and fix_fft. Exits with status 2 on bad arguments or when the
    requested record is not found.
    """
    usage = """
    save to file:
        python fix_marc_record.py marc_file*.xml >> result_file.xml
    print to terminal:
        python fix_marc_record.py marc_file*.xml

    options:
        --recid -r
        fix the record with the given record id from https://inspireheptest.cern.ch
            e.g. python fix_marc_record.py --recid=1291107

        --site -s
        specify a different site useful only when option --recid or -r enabled
            e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # A real list, not map(): a map iterator would be exhausted by the
        # first membership test on Python 3 and hide a later --recid.
        options = [opt[0] for opt in opts]
        use_recid = '-r' in options or '--recid' in options
        if not args and not use_recid:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if use_recid:
        # imported lazily so the file mode works without Invenio
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString

        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ['-s', '--site']:
                site = a
            if o in ['-r', '--recid']:
                recid = a

        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p='001:%s' % recid, of='xm')
        marcxml = parseString(record)
        try:
            marcxml = marcxml.getElementsByTagName('record')[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)

        marcxml = fix_authors(marcxml)
        marcxml = fix_title(marcxml)
        marcxml = fix_fft(marcxml)
        # NOTE(review): writes an encoded byte string to sys.stdout, which
        # is Python 2 behaviour -- revisit if this script moves to Python 3
        sys.stdout.write(marcxml.toxml().encode('utf8'))
    else:
        print("<collection>")
        for filename in args:
            # best effort: one broken file must not stop the whole batch
            try:
                strip_bom(filename)
                marcxml = parse(filename)
                marcxml = fix_authors(marcxml)
                marcxml = fix_title(marcxml)
                marcxml = fix_fft(marcxml)
                sys.stdout.write(marcxml.toxml().encode('utf8'))
            except Exception as err:
                print("ERROR with file %s: %s. Skipping file...."
                      % (filename, err), file=sys.stderr)
        print("</collection>")
def test_search_collections(self):
    """InvenioConnector - collection search

    An empty pattern restricted to the "Books" collection of the local
    site must return at least one result.
    """
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p="", c=["Books"], of="id")
    failure_note = "did not get collection search results."
    self.assertTrue(len(hits) > 0, failure_note)
def test_remote_search(self):
    """InvenioConnector - remote search

    Searching "ellis" on http://inspirebeta.net must return at least one
    result.
    """
    remote_site = InvenioConnector("http://inspirebeta.net")
    hits = remote_site.search(p="ellis", of="id")
    failure_note = "did not get remote search results from http://inspirebeta.net."
    self.assertTrue(len(hits) > 0, failure_note)
def test_local_search(self):
    """InvenioConnector - local search

    Searching "ellis" on the local site must return at least one result.
    """
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p="ellis", of="id")
    failure_note = "did not get local search results."
    self.assertTrue(len(hits) > 0, failure_note)
def test_search_collections(self):
    """InvenioConnector - collection search

    An empty pattern restricted to the 'Books' collection of the local
    site must return at least one result.
    """
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p='', c=['Books'], of='id')
    failure_note = 'did not get collection search results.'
    self.assertTrue(len(hits) > 0, failure_note)
def test_remote_search(self):
    """InvenioConnector - remote search

    Searching 'ellis' on http://invenio-demo.cern.ch must return at least
    one result.
    """
    remote_site = InvenioConnector("http://invenio-demo.cern.ch")
    hits = remote_site.search(p='ellis', of='id')
    failure_note = 'did not get remote search results from http://invenio-demo.cern.ch'
    self.assertTrue(len(hits) > 0, failure_note)
def test_local_search(self):
    """InvenioConnector - local search

    Searching 'ellis' on the local site must return at least one result.
    """
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p='ellis', of='id')
    failure_note = 'did not get local search results.'
    self.assertTrue(len(hits) > 0, failure_note)
def match_records(records, qrystrs=None, perform_request_search_mode="eee",
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """
    Match passed records with existing records on a local or remote Invenio
    installation.

    Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result
    of each records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    For every record and every querystring the values of the query fields
    are read from the record, formatted, and searched on the server. More
    than one hit is an ambiguous match, exactly one an exact match. With
    no hit, each main word of the three values is searched on its own and
    the hits are intersected: a non-empty intersection is a fuzzy match,
    otherwise the record is new. A record whose first query value is
    empty is not added to any list for that querystring.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings; None or an empty list means one default
        querystring. The list passed in is not modified.
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
    @type perform_request_search_mode: string

    @param operator: "o" "a"
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param modify: output modified records of matches (the matched record
        id is written to field 001 of the input record)
    @type modify: int

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """
    server = InvenioConnector(server_url)

    # Work on a private list: the original appended "" to an empty list
    # supplied by the caller.
    if qrystrs:
        queries = list(qrystrs)
    else:
        queries = [""]

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    record_counter = 0
    for rec in records:
        record_counter += 1
        if verbose > 1:
            sys.stderr.write("\n Processing record: #%d .." % record_counter)

        for qrystr in queries:
            querystring = Querystring()
            querystring.default()
            if qrystr != "":
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                # NOTE(review): default() already ran above; the second
                # call is kept to preserve the original call sequence
                querystring.default()
            querystring.search_engine_encode()

            ### get field values for record instance
            inst = [_extract_query_value(rec[0], field)
                    for field in querystring.field]

            ### format acquired field values
            for position in range(len(inst)):
                for fmt in querystring.format[position]:
                    inst[position] = bibconvert.FormatField(inst[position],
                                                            fmt)

            ### perform the search
            if inst[0] == "":
                continue

            p1 = inst[0]
            f1 = querystring.field[0]
            m1 = querystring.mode[0]
            op1 = querystring.operator[0]

            p2 = inst[1]
            f2 = querystring.field[1]
            m2 = querystring.mode[1]
            op2 = querystring.operator[1]

            p3 = inst[2]
            f3 = querystring.field[2]
            m3 = querystring.mode[2]

            #1st run the basic perform_req_search
            recID_list = server.search(
                p1=p1, f1=f1, m1=m1, op1=op1,
                p2=p2, f2=f2, m2=m2, op2=op2,
                p3=p3, f3=f3, m3=m3, of='id')

            if verbose > 8:
                sys.stderr.write("\nperform_request_search with values" +
                    " p1=" + str(p1) + " f1=" + str(f1) + " m1=" + str(m1) + " op1=" + str(op1) +
                    " p2=" + str(p2) + " f2=" + str(f2) + " m2=" + str(m2) + " op2=" + str(op2) +
                    " p3=" + str(p3) + " f3=" + str(f3) + " m3=" + str(m3) +
                    " result=" + str(recID_list) + "\n")

            if len(recID_list) > 1:
                #ambig match
                ambiguousrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring,
                    "ambiguous-matched"), ))
                if verbose > 8:
                    sys.stderr.write("ambiguous\n")
            elif len(recID_list) == 1:
                #match
                if modify:
                    _store_matched_recid(rec[0], recID_list[0])
                matchedrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring,
                    "exact-matched"), ))
                if verbose > 8:
                    sys.stderr.write("match\n")
            else:
                #no match.. try fuzzy matching: check if all the words
                #appear in the field of interest
                intersected = None
                for value, field in ((p1, f1), (p2, f2), (p3, f3)):
                    intersected = _intersect_fuzzy_hits(
                        server, main_words_list(value), field,
                        intersected, verbose)

                if intersected:
                    #this was a fuzzy match
                    if modify:
                        _store_matched_recid(rec[0], intersected[0])
                    fuzzyrecs.append(rec + (match_result_output(
                        intersected, server_url, querystring,
                        "fuzzy-matched"), ))
                    if verbose > 8:
                        sys.stderr.write("fuzzy\n")
                else:
                    #no match
                    newrecs.append(rec + (match_result_output(
                        recID_list, server_url, querystring), ))
                    if verbose > 8:
                        sys.stderr.write("new\n")

    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]


def _extract_query_value(record, field):
    """Return the value of one query field taken from the input record."""
    tags = get_field_tags(field)
    if len(tags) > 0:
        # Fetch value from input record of first tag only
        # FIXME: Extracting more then first tag, evaluating each
        field = tags[0]

    ### use expanded tags
    tag = field[0:3]
    ind1 = field[3:4]
    ind2 = field[4:5]
    code = field[5:6]

    # "_" and "%" stand for "no indicator" / "default subfield a"
    if ind1 in ("_", "%"):
        ind1 = ""
    if ind2 in ("_", "%"):
        ind2 = ""
    if code in ("_", "%"):
        code = "a"

    if field != "001":
        finsts = record_get_field_instances(record, tag, ind1, ind2)
        return get_subfield(finsts, code)
    return record_get_field_values(record, field, ind1="", ind2="", code="")


def _store_matched_recid(record, recid):
    """Write recid into controlfield 001 of record, replacing any old value."""
    if record_has_field(record, '001'):
        record_modify_controlfield(record, '001',
                                   controlfield_value=str(recid),
                                   field_position_global=1)
    else:
        record_add_field(record, '001', controlfield_value=str(recid))


def _intersect_fuzzy_hits(server, words, field, intersected, verbose):
    """Search each quoted word in field and intersect the hits with intersected."""
    for word in words:
        word = "'" + word + "'"
        ilist = server.search(p=word, f=field, of="id")
        if verbose > 8:
            sys.stderr.write("fuzzy perform_request_search with values" +
                " p=" + str(word) + " f=" + str(field) + " res " + str(ilist) + "\n")
        if intersected is None:
            # first word searched: start from its hits
            intersected = ilist
        intersected = list(set(ilist) & set(intersected))
    return intersected
def test_search_collections(self):
    """InvenioConnector - collection search

    An empty pattern restricted to the 'Books' collection of the local
    site must return at least one result.
    """
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p='', c=['Books'], of='id')
    failure_note = 'did not get collection search results.'
    self.assertTrue(len(hits) > 0, failure_note)
def test_remote_search(self):
    """InvenioConnector - remote search

    Searching 'ellis' on http://invenio-demo.cern.ch must return at least
    one result.
    """
    remote_site = InvenioConnector("http://invenio-demo.cern.ch")
    hits = remote_site.search(p='ellis', of='id')
    failure_note = 'did not get remote search results from http://invenio-demo.cern.ch'
    self.assertTrue(len(hits) > 0, failure_note)
def test_local_search(self):
    """InvenioConnector - local search

    Searching 'ellis' on the local site must return at least one result.
    """
    local_site = InvenioConnector(CFG_SITE_URL)
    hits = local_site.search(p='ellis', of='id')
    failure_note = 'did not get local search results.'
    self.assertTrue(len(hits) > 0, failure_note)
def match_records(records, qrystrs=None, perform_request_search_mode="eee",
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """
    Match passed records with existing records on a local or remote Invenio
    installation.

    Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result
    of each records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    For every record and every querystring the values of the query fields
    are read from the record, formatted, and searched on the server. More
    than one hit is an ambiguous match, exactly one an exact match. With
    no hit, each main word of the three values is searched on its own and
    the hits are intersected: a non-empty intersection is a fuzzy match,
    otherwise the record is new. A record whose first query value is
    empty is not added to any list for that querystring.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings; None or an empty list means one default
        querystring. The list passed in is not modified.
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
    @type perform_request_search_mode: string

    @param operator: "o" "a"
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param modify: output modified records of matches (the matched record
        id is written to field 001 of the input record)
    @type modify: int

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """
    server = InvenioConnector(server_url)

    # Work on a private list: the original appended "" to an empty list
    # supplied by the caller.
    if qrystrs:
        queries = list(qrystrs)
    else:
        queries = [""]

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    record_counter = 0
    for rec in records:
        record_counter += 1
        if verbose > 1:
            sys.stderr.write("\n Processing record: #%d .." % record_counter)

        for qrystr in queries:
            querystring = Querystring()
            querystring.default()
            if qrystr != "":
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                # NOTE(review): default() already ran above; the second
                # call is kept to preserve the original call sequence
                querystring.default()
            querystring.search_engine_encode()

            ### get field values for record instance
            inst = [_extract_query_value(rec[0], field)
                    for field in querystring.field]

            ### format acquired field values
            for position in range(len(inst)):
                for fmt in querystring.format[position]:
                    inst[position] = bibconvert.FormatField(inst[position],
                                                            fmt)

            ### perform the search
            if inst[0] == "":
                continue

            p1 = inst[0]
            f1 = querystring.field[0]
            m1 = querystring.mode[0]
            op1 = querystring.operator[0]

            p2 = inst[1]
            f2 = querystring.field[1]
            m2 = querystring.mode[1]
            op2 = querystring.operator[1]

            p3 = inst[2]
            f3 = querystring.field[2]
            m3 = querystring.mode[2]

            #1st run the basic perform_req_search
            recID_list = server.search(
                p1=p1, f1=f1, m1=m1, op1=op1,
                p2=p2, f2=f2, m2=m2, op2=op2,
                p3=p3, f3=f3, m3=m3, of='id')

            if verbose > 8:
                sys.stderr.write("\nperform_request_search with values" +
                    " p1=" + str(p1) + " f1=" + str(f1) + " m1=" + str(m1) + " op1=" + str(op1) +
                    " p2=" + str(p2) + " f2=" + str(f2) + " m2=" + str(m2) + " op2=" + str(op2) +
                    " p3=" + str(p3) + " f3=" + str(f3) + " m3=" + str(m3) +
                    " result=" + str(recID_list) + "\n")

            if len(recID_list) > 1:
                #ambig match
                ambiguousrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring,
                    "ambiguous-matched"), ))
                if verbose > 8:
                    sys.stderr.write("ambiguous\n")
            elif len(recID_list) == 1:
                #match
                if modify:
                    _store_matched_recid(rec[0], recID_list[0])
                matchedrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring,
                    "exact-matched"), ))
                if verbose > 8:
                    sys.stderr.write("match\n")
            else:
                #no match.. try fuzzy matching: check if all the words
                #appear in the field of interest
                intersected = None
                for value, field in ((p1, f1), (p2, f2), (p3, f3)):
                    intersected = _intersect_fuzzy_hits(
                        server, main_words_list(value), field,
                        intersected, verbose)

                if intersected:
                    #this was a fuzzy match
                    if modify:
                        _store_matched_recid(rec[0], intersected[0])
                    fuzzyrecs.append(rec + (match_result_output(
                        intersected, server_url, querystring,
                        "fuzzy-matched"), ))
                    if verbose > 8:
                        sys.stderr.write("fuzzy\n")
                else:
                    #no match
                    newrecs.append(rec + (match_result_output(
                        recID_list, server_url, querystring), ))
                    if verbose > 8:
                        sys.stderr.write("new\n")

    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]


def _extract_query_value(record, field):
    """Return the value of one query field taken from the input record."""
    tags = get_field_tags(field)
    if len(tags) > 0:
        # Fetch value from input record of first tag only
        # FIXME: Extracting more then first tag, evaluating each
        field = tags[0]

    ### use expanded tags
    tag = field[0:3]
    ind1 = field[3:4]
    ind2 = field[4:5]
    code = field[5:6]

    # "_" and "%" stand for "no indicator" / "default subfield a"
    if ind1 in ("_", "%"):
        ind1 = ""
    if ind2 in ("_", "%"):
        ind2 = ""
    if code in ("_", "%"):
        code = "a"

    if field != "001":
        finsts = record_get_field_instances(record, tag, ind1, ind2)
        return get_subfield(finsts, code)
    return record_get_field_values(record, field, ind1="", ind2="", code="")


def _store_matched_recid(record, recid):
    """Write recid into controlfield 001 of record, replacing any old value."""
    if record_has_field(record, '001'):
        record_modify_controlfield(record, '001',
                                   controlfield_value=str(recid),
                                   field_position_global=1)
    else:
        record_add_field(record, '001', controlfield_value=str(recid))


def _intersect_fuzzy_hits(server, words, field, intersected, verbose):
    """Search each quoted word in field and intersect the hits with intersected."""
    for word in words:
        word = "'" + word + "'"
        ilist = server.search(p=word, f=field, of="id")
        if verbose > 8:
            sys.stderr.write("fuzzy perform_request_search with values" +
                " p=" + str(word) + " f=" + str(field) + " res " + str(ilist) + "\n")
        if intersected is None:
            # first word searched: start from its hits
            intersected = ilist
        intersected = list(set(ilist) & set(intersected))
    return intersected