def test_search_local_restricted_collections(self):
        """InvenioConnector - local restricted collection search

        The same query on the "Theses" collection is issued twice: once
        without credentials, which must raise InvenioConnectorAuthError, and
        once with credentials against the secure URL, which must yield at
        least one result.
        """
        restricted_query = {"p": "LBL-28106", "c": ["Theses"], "of": "id"}

        # Unauthenticated connector: the search itself has to be refused.
        anonymous = InvenioConnector(CFG_SITE_URL)
        self.assertRaises(InvenioConnectorAuthError, anonymous.search, **restricted_query)

        # Authenticated connector: the identical query must now return hits.
        authenticated = InvenioConnector(CFG_SITE_SECURE_URL, user="******", password="")
        hits = authenticated.search(p="LBL-28106", c=["Theses"], of="id")
        self.assertTrue(len(hits) > 0, "did not get restricted collection search results.")
示例#2
0
def main():
    """Command-line entry point of fix_marc_record.py.

    Reads the options from ``sys.argv`` and fixes exactly one MARCXML record:

    * with ``-r/--recid`` the record is downloaded (``of='xm'``) from the
      site given by ``-s/--site`` (default http://inspireheptest.cern.ch/);
    * otherwise the single positional argument is parsed as a MARCXML file.

    The record is passed through ``fix_authors`` and ``fix_title`` and the
    resulting XML is printed to standard output.

    Exits with status 2 (after printing the error and the usage text) when
    the options are invalid, when more than one positional argument is given,
    when neither a file nor a record id is given, or when the remote search
    returns no ``record`` element.
    """
    usage = """
    save to file:
    python fix_marc_record.py marc_file.xml >> result_file.xml

    print to terminal:
    python fix_marc_record.py marc_file.xml

    options:
    --recid -r
    fix the record with the given record id from https://inspireheptest.cern.ch
    e.g. python fix_marc_record.py --recid=1291107
    --site -s
    specify a different site useful only when option --recid or -r enabled
    e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
        # so repeated ``in`` tests would silently consume it and the
        # --recid option would no longer be detected.
        options = [name for name, _ in opts]
        use_recid = '-r' in options or '--recid' in options
        if len(args) > 1:
            raise getopt.GetoptError("Too many arguments given!!!")
        elif not args and not use_recid:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if use_recid:
        # Imported lazily so that the file-based mode works without Invenio.
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString
        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ['-s', '--site']:
                site = a
            if o in ['-r', '--recid']:
                recid = a
        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p='001:%s' % recid, of='xm')
        marcxml = parseString(record)
        try:
            # Keep only the first <record> element of the answer.
            marcxml = marcxml.getElementsByTagName('record')[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)
    else:
        marcxml = parse(args[0])

    # Both input modes share the same fixing pipeline and output.
    marcxml = fix_authors(marcxml)
    marcxml = fix_title(marcxml)
    print(marcxml.toxml())
    def test_search_remote_restricted_collections(self):
        """InvenioConnector - remote restricted collection search

        Queries the "Theses" collection on invenio-demo.cern.ch: the
        connector created without credentials must raise
        InvenioConnectorAuthError, the one created with credentials over
        https must return at least one result.
        """
        restricted_query = {"p": "LBL-28106", "c": ["Theses"], "of": "id"}

        # Connector without credentials: the search has to be refused.
        anonymous = InvenioConnector("http://invenio-demo.cern.ch")
        self.assertRaises(InvenioConnectorAuthError, anonymous.search, **restricted_query)

        # Connector with credentials: the same query must return hits.
        authenticated = InvenioConnector("https://invenio-demo.cern.ch", user="******", password="******")
        hits = authenticated.search(p="LBL-28106", c=["Theses"], of="id")
        self.assertTrue(len(hits) > 0, "did not get restricted collection search results.")
    def test_search_local_restricted_collections(self):
        """InvenioConnector - local restricted collection search

        Runs one query on the 'Theses' collection twice: without credentials
        it must raise InvenioConnectorAuthError, with credentials on the
        secure URL it must return a non-empty result.
        """
        query = {'p': 'LBL-28106', 'c': ['Theses'], 'of': 'id'}

        # No credentials supplied: expect the auth error from search().
        guest = InvenioConnector(CFG_SITE_URL)
        self.assertRaises(InvenioConnectorAuthError, guest.search, **query)

        # Credentials supplied: expect at least one hit.
        member = InvenioConnector(CFG_SITE_SECURE_URL, user='******', password='')
        found = member.search(p='LBL-28106', c=['Theses'], of='id')
        message = 'did not get restricted collection search results.'
        self.assertTrue(len(found) > 0, message)
    def test_search_remote_restricted_collections(self):
        """InvenioConnector - remote restricted collection search

        Runs one query on the 'Theses' collection of invenio-demo.cern.ch
        twice: without credentials it must raise InvenioConnectorAuthError,
        with credentials over https it must return a non-empty result.
        """
        query = {'p': 'LBL-28106', 'c': ['Theses'], 'of': 'id'}

        # No credentials supplied: expect the auth error from search().
        guest = InvenioConnector("http://invenio-demo.cern.ch")
        self.assertRaises(InvenioConnectorAuthError, guest.search, **query)

        # Credentials supplied: expect at least one hit.
        member = InvenioConnector("https://invenio-demo.cern.ch", user='******', password='******')
        found = member.search(p='LBL-28106', c=['Theses'], of='id')
        message = 'did not get restricted collection search results.'
        self.assertTrue(len(found) > 0, message)
def get_recid_from_sysno(server_url, sysno):
    """
    Look for a record whose field 970 matches sysno on the server at
    server_url and return its record id.

    @param server_url: URL handed to InvenioConnector
    @param sysno: system number to look for; surrounding whitespace is
        stripped before searching

    @return: the first hit converted to a string, or "" when the search
        result has no first element
    """
    server = InvenioConnector(server_url)
    rec = server.search_with_retry(p="970:%s" % (sysno.strip(),), of='id')
    # Function-call form: prints the same text as the old Python 2 statement
    # but is also valid syntax on Python 3.
    print(rec)
    try:
        recid = str(rec[0])
    except (KeyError, IndexError):
        return ""
    return recid
示例#7
0
def get_reference_number(tarball, refno_url):
    """
    Attempts to determine the reference number of the file by searching.

    The base name of the tarball is turned into an arXiv identifier which is
    then looked up in field 037__a on the given repository.

    @param: tarball (string): the name of the tarball as downloaded from
        arXiv
    @param: refno_url (string): url of repository to check for a
        reference number for this record. If not set; returns None

    @return: refno (string): the first record id returned by the search, as
        a string, or None when nothing could be found
    """
    if not refno_url:
        return None

    server = InvenioConnector(refno_url)
    # we just need the name of the file
    tarball = os.path.split(tarball)[1]
    prefix = '037__a:'

    # the name right now looks like arXiv:hep-ph_9703009
    # or arXiv:0910.0476
    if tarball.startswith(ARXIV_HEADER):
        if len(tarball.split('_')) > 1:
            # old-style identifier: drop the header, restore the slash
            tarball = tarball.split(':')[1]
            arXiv_record = tarball.replace('_', '/')
        else:
            arXiv_record = tarball

        result = server.search(p=prefix + arXiv_record, of='id')
        if len(result) == 0:
            return None
        return str(result[0])

    # No known header: look for something shaped like an arXiv identifier,
    # first in the raw name and then with underscores turned into slashes.
    # search()/group(0) is used because findall() on a pattern with groups
    # returns tuples, which cannot be concatenated to the query prefix.
    arxiv_id_re = re.compile(r'[a-zA-Z\-]+/\d+|\d+\.\d+')
    for candidate in (tarball, tarball.replace('_', '/')):
        found = arxiv_id_re.search(candidate)
        if found is None:
            continue
        result = server.search(p=prefix + found.group(0), of='id')
        if len(result) > 0:
            return str(result[0])

    return None
def get_remote_ids(search_terms, collection=''):
    """ Search the remote instance at REMOTE_URL and return what it finds.

    Parameters:
     (string) search_terms - what to search for remotely (sent as 'p')
     (string) collection - collection to search in (sent as 'cc')
    Returns:
     The result of the remote search requested with of='id'
     (presumably a list of RecIDs)
    """
    connector = InvenioConnector(REMOTE_URL)
    _print("Getting records from: %s" % REMOTE_URL)

    remote_recids = connector.search(of='id', p=search_terms, cc=collection)

    # Report how many hits came back before handing them to the caller.
    summary = "Found %d records on %s for search terms '%s' in collection '%s'" % (
        len(remote_recids), REMOTE_INSTANCE, search_terms, collection)
    _print(summary)
    return remote_recids
def retrieve_records(results):
    """
    Fetch the MARCXML (of="xm") of every (url, recid) pair in results.

    A connector is created for the first pair and re-created only when the
    url differs from the one of the previous pair. After every search the
    function sleeps for one second.

    @param results: iterable of (url, recid) pairs

    @return: list holding, for each pair whose search result is not [],
        the first element of create_records(result). Pairs with an empty
        result are reported on standard output and skipped.
    """
    last_url = None
    server = None
    records = []
    for url, recid in results:
        if server is None or url != last_url:
            server = InvenioConnector(url)
            # Remember the url so consecutive records reuse the connector.
            last_url = url
        res = server.search_with_retry(p="001:%s" % (recid,), of="xm")
        time.sleep(1.0)
        if res != []:
            records.append(create_records(res)[0])
        else:
            print("Problem with record: %s" % (recid,))
    return records
示例#10
0
def get_remote_ids(search_terms, collection=''):
    """ Run a search on the remote instance (REMOTE_URL) and return its hits.

    Parameters:
     (string) search_terms - what to search for remotely (sent as 'p')
     (string) collection - collection to search in (sent as 'cc')
    Returns:
     The result of the remote search requested with of='id'
     (presumably a list of RecIDs)
    """
    remote = InvenioConnector(REMOTE_URL)
    _print("Getting records from: %s" % REMOTE_URL)

    query = dict(p=search_terms, cc=collection, of='id')
    hits = remote.search(**query)

    # Log the number of hits together with the query that produced them.
    report_args = (len(hits), REMOTE_INSTANCE, search_terms, collection)
    _print("Found %d records on %s for search terms '%s' in collection '%s'" % report_args)
    return hits
def get_sysno_from_recid(server_url, recid):
    """
    Search the server at server_url for the record with ID recid and return
    its system number, read from subfield 'a' of the first 970 field.

    Returns None when the search result does not contain that value.
    A value containing 'SPIRES' is reduced to the part after the first '-';
    otherwise a value containing 'CER' is reduced to the part before 'CER'.
    """
    connector = InvenioConnector(server_url)
    hits = connector.search_with_retry(p="001:%s" % (recid,))
    try:
        raw_sysno = hits[0][970][0]['a'][0]
    except (KeyError, IndexError):
        return None

    # Strip the catalogue marker, if there is one.
    if 'SPIRES' in raw_sysno:
        return raw_sysno.split("-")[1]
    if 'CER' in raw_sysno:
        return raw_sysno.split("CER")[0]
    return raw_sysno
 def test_remote_search(self):
     """InvenioConnector - remote search

     A plain search for 'ellis' on invenio-demo.cern.ch must return at
     least one result.
     """
     remote = InvenioConnector("http://invenio-demo.cern.ch")
     hits = remote.search(of='id', p='ellis')
     failure_note = 'did not get remote search results from http://invenio-demo.cern.ch'
     self.assertTrue(len(hits) > 0, failure_note)
 def test_local_search(self):
     """InvenioConnector - local search

     A plain search for 'ellis' on the local site (CFG_SITE_URL) must
     return at least one result.
     """
     local = InvenioConnector(CFG_SITE_URL)
     hits = local.search(of='id', p='ellis')
     failure_note = 'did not get local search results.'
     self.assertTrue(len(hits) > 0, failure_note)
示例#14
0
def match_records(records, qrystrs=None, perform_request_search_mode="eee", \
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """ Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    For every record and every querystring the function builds a three-term
    search from the record's own field values, runs it on the server and
    files the record by the number of hits: more than one -> ambiguous,
    exactly one -> matched, none -> a word-by-word "fuzzy" search whose
    intersected hits decide between fuzzy-matched and new.

    NOTE(review): the classification happens inside the loop over qrystrs, so
    a record can be appended once per querystring. A record whose first
    extracted value is "" is not appended to any of the four lists.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings. None or an empty list is replaced by [""],
        which selects the default Querystring. An empty list passed by the
        caller is modified in place.
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
        (handed to Querystring.from_qrystr)
    @type perform_request_search_mode: string

    @param operator: "o" "a" (handed to Querystring.from_qrystr)
    @type operator: str

    @param verbose: be loud; messages go to stderr when > 1, search details
        when > 8
    @type verbose: int

    @param modify: output modified records of matches. When true the 001
        controlfield of a matched or fuzzy-matched record is set to the
        first record id found.
    @type modify: int

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """

    server = InvenioConnector(server_url)

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    record_counter = 0
    for rec in records:
        record_counter += 1
        if (verbose > 1):
            sys.stderr.write("\n Processing record: #%d .." % record_counter)

        if qrystrs == None:
            qrystrs = []

        # An empty querystring stands for "use the default query".
        if len(qrystrs)==0:
            qrystrs.append("")

        more_detailed_info = ""

        for qrystr in qrystrs:
            querystring = Querystring()
            querystring.default()

            if(qrystr != ""):
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                querystring.default()

            querystring.search_engine_encode()

            ### get field values for record instance

            # inst collects one extracted value per entry of querystring.field
            inst = []

            ### get appropriate fields from database
            for field in querystring.field:
                tags = get_field_tags(field)
                if len(tags) > 0:
                    # Fetch value from input record of first tag only
                    # FIXME: Extracting more then first tag, evaluating each
                    field = tags[0]
                ### use expanded tags
                # Slice the field spec into tag (3 chars), the two
                # indicators and the subfield code (1 char each).
                tag  = field[0:3]
                ind1 = field[3:4]
                ind2 = field[4:5]
                code = field[5:6]

                # "_" and "%" mean "no indicator"; for the code they
                # fall back to subfield "a".
                if((ind1 == "_")or(ind1 == "%")):
                    ind1 = ""
                if((ind2 == "_")or(ind2 == "%")):
                    ind2 = ""
                if((code == "_")or(code == "%")):
                    code = "a"

                if(field != "001"):
                    finsts = record_get_field_instances(rec[0], tag, ind1, ind2)
                    sbf = get_subfield(finsts, code)
                    inst.append(sbf)
                elif(field in ["001"]):
                    sbf = record_get_field_values(rec[0], field, ind1="",
                                                  ind2="", code="")
                    inst.append(sbf)
                else:
                    # NOTE(review): unreachable - field is either "001"
                    # or different from "001", both handled above.
                    inst.append("")


            ### format acquired field values

            # Apply every format of querystring.format[i] to the i-th value.
            i = 0
            for instance in inst:
                for format in querystring.format[i]:
                    inst[i] = bibconvert.FormatField(inst[i], format)
                i += 1

            ### perform the search

            # Nothing is searched (and the record is not classified) when
            # the first extracted value is empty.
            if(inst[0] != ""):
                # NOTE(review): indexes 1 and 2 assume the querystring
                # always provides at least three fields - confirm.
                p1 = inst[0]
                f1 = querystring.field[0]
                m1 = querystring.mode[0]
                op1 = querystring.operator[0]

                p2 = inst[1]
                f2 = querystring.field[1]
                m2 = querystring.mode[1]
                op2 = querystring.operator[1]

                p3 = inst[2]
                f3 = querystring.field[2]
                m3 = querystring.mode[2]

                #1st run the basic perform_req_search
                recID_list = server.search(
                    p1=p1, f1=f1, m1=m1, op1=op1,
                    p2=p2, f2=f2, m2=m2, op2=op2,
                    p3=p3, f3=f3, m3=m3, of='id')

                if (verbose > 8):
                    sys.stderr.write("\nperform_request_search with values"+\
                     " p1="+str(p1)+" f1="+str(f1)+" m1="+str(m1)+" op1="+str(op1)+\
                     " p2="+str(p2)+" f2="+str(f2)+" m2="+str(m2)+" op2="+str(op2)+\
                     " p3="+str(p3)+" f3="+str(f3)+" m3="+str(m3)+\
                     " result="+str(recID_list)+"\n")

                if len(recID_list) > 1: #ambig match
                    ambiguousrecs.append(rec + (match_result_output(recID_list, \
                                                server_url, querystring, "ambiguous-matched"), ))
                    if (verbose > 8):
                        sys.stderr.write("ambiguous\n")
                if len(recID_list) == 1: #match
                    if modify:
                        # Write the matched record id into field 001,
                        # replacing an existing 001 or adding a new one.
                        if record_has_field(rec[0], '001'):
                            record_modify_controlfield(rec[0], '001', \
                                                       controlfield_value=str(recID_list[0]), \
                                                       field_position_global=1)
                        else:
                            record_add_field(rec[0], '001', controlfield_value=str(recID_list[0]))
                    matchedrecs.append(rec + (match_result_output(recID_list, \
                                                server_url, querystring, "exact-matched"), ))
                    if (verbose > 8):
                        sys.stderr.write("match\n")
                if len(recID_list) == 0: #no match..
                    #try fuzzy matching
                    # intersected stays None until the first word search
                    # ran, then holds the ids common to all word searches.
                    intersected = None
                    #check if all the words appear in the
                    #field of interest
                    words1 = main_words_list(p1)
                    words2 = main_words_list(p2)
                    words3 = main_words_list(p3)

                    # Each word is searched quoted, in the field its
                    # value came from; the hit lists are intersected.
                    for word in words1:
                        word = "'"+word+"'"
                        ilist = server.search(p=word, f=f1, of="id")
                        if (verbose > 8):
                            sys.stderr.write("fuzzy perform_request_search with values"+\
                                             " p="+str(word)+" f="+str(f1)+" res "+str(ilist)+"\n")
                        if intersected == None:
                            intersected = ilist
                        intersected =  list(set(ilist)&set(intersected))

                    for word in words2:
                        word = "'"+word+"'"
                        ilist = server.search(p=word, f=f2, of="id")
                        if (verbose > 8):
                            sys.stderr.write("fuzzy perform_request_search with values"+\
                                             " p="+str(word)+" f="+str(f2)+" res "+str(ilist)+"\n")
                        if intersected == None:
                            intersected = ilist
                        intersected =  list(set(ilist)&set(intersected))

                    for word in words3:
                        word = "'"+word+"'"
                        ilist = server.search(p=word, f=f3, of="id")
                        if (verbose > 8):
                            sys.stderr.write("fuzzy perform_request_search with values"+\
                                             " p="+str(word)+" f="+str(f3)+" res "+str(ilist)+"\n")
                        if intersected == None:
                            intersected = ilist
                        intersected =  list(set(ilist)&set(intersected))

                    if intersected:
                        #this was a fuzzy match
                        if modify:
                            # As for exact matches, but with the first id
                            # of the intersection (set order, so arbitrary).
                            if record_has_field(rec[0], '001'):
                                record_modify_controlfield(rec[0], '001', \
                                      controlfield_value=str(intersected[0]), field_position_global=1)
                            else:
                                record_add_field(rec[0], '001', controlfield_value=str(intersected[0]))
                        fuzzyrecs.append(rec + (match_result_output(intersected, \
                                                server_url, querystring, "fuzzy-matched"), ))
                        if (verbose > 8):
                            sys.stderr.write("fuzzy\n")
                    else:
                        #no match
                        newrecs.append(rec + (match_result_output(recID_list, \
                                                server_url, querystring), ))
                        if (verbose > 8):
                            sys.stderr.write("new\n")
    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
示例#15
0
def main():
    """Command-line entry point of fix_marc_record.py (multi-file variant).

    Reads the options from ``sys.argv`` and works in one of two modes:

    * with ``-r/--recid`` the record is downloaded (``of='xm'``) from the
      site given by ``-s/--site`` (default http://inspireheptest.cern.ch/),
      fixed and written to standard output;
    * otherwise every positional argument is treated as a MARCXML file; the
      fixed records are written between ``<collection>`` and
      ``</collection>`` lines. A file that raises any exception is reported
      on standard error and skipped.

    Each record goes through ``fix_authors``, ``fix_title`` and ``fix_fft``.

    Exits with status 2 (after printing the error and the usage text) when
    the options are invalid, when neither a file nor a record id is given,
    or when the remote search returns no ``record`` element.
    """
    usage = """
    save to file:
    python fix_marc_record.py marc_file*.xml >> result_file.xml

    print to terminal:
    python fix_marc_record.py marc_file*.xml

    options:
    --recid -r
    fix the record with the given record id from https://inspireheptest.cern.ch
    e.g. python fix_marc_record.py --recid=1291107
    --site -s
    specify a different site useful only when option --recid or -r enabled
    e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
        # so repeated ``in`` tests would silently consume it and the
        # --recid option would no longer be detected.
        options = [name for name, _ in opts]
        use_recid = '-r' in options or '--recid' in options
        if not args and not use_recid:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if use_recid:
        # Imported lazily so that the file-based mode works without Invenio.
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString
        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ['-s', '--site']:
                site = a
            if o in ['-r', '--recid']:
                recid = a
        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p='001:%s' % recid, of='xm')
        marcxml = parseString(record)
        try:
            # Keep only the first <record> element of the answer.
            marcxml = marcxml.getElementsByTagName('record')[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)

        marcxml = fix_authors(marcxml)
        marcxml = fix_title(marcxml)
        marcxml = fix_fft(marcxml)

        sys.stdout.write(marcxml.toxml().encode('utf8'))
    else:
        print("<collection>")
        for filename in args:
            # Best effort per file: one broken input must not stop the
            # others, so any failure is reported and the file skipped.
            try:
                strip_bom(filename)
                marcxml = parse(filename)
                marcxml = fix_authors(marcxml)
                marcxml = fix_title(marcxml)
                marcxml = fix_fft(marcxml)
                sys.stdout.write(marcxml.toxml().encode('utf8'))
            except Exception as err:
                print("ERROR with file %s: %s. Skipping file...." %
                      (filename, err),
                      file=sys.stderr)
        print("</collection>")
def match_records(records, qrystrs=None, search_mode=None, operator="and", verbose=1, \
                  server_url=CFG_SITE_URL, modify=0, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, clean=False):
    """
    Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched.

    Each entry of the returned lists is a 2-tuple (rec[0], text) where text
    is "<!-- BibMatch-Matching-Results: -->" followed by the output of
    match_result_output for that record.

    For every record the querystrings are tried in order. A single hit for a
    complete query is an exact match and stops the search. If no querystring
    produced such a match, every querystring is retried as a series of fuzzy
    sub-queries whose results are combined. The record is finally filed as
    matched, fuzzy, ambiguous or new, in that order of precedence.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: list of tuples (field, querystring). When empty or None,
                    [("", "")] is used.
    @type qrystrs: list

    @param search_mode: if mode is given, the search will perform an advanced query using
                        the desired mode. Otherwise 'simple search' is used.
    @type search_mode: str

    @param operator: operator used to concatenate values of fields occurring more then once.
                     Valid types are: AND, OR. Defaults to AND.
    @type operator: str

    @param verbose: be loud; messages go to stderr when > 1, search details
                    when > 8
    @type verbose: int

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param modify: output modified records of matches (add_recid is called
                   on the record with the id found)
    @type modify: int

    @param sleeptime: amount of time to wait between each query
    @type sleeptime: float

    @param clean: handed to the Querystring constructor as clean=clean
    @type clean: bool

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """
    server = InvenioConnector(server_url)

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    ## Go through each record and try to find matches using defined querystrings
    record_counter = 0
    # One Querystring object is shared by all records and queries.
    querystring = Querystring(operator, clean=clean)
    for rec in records:
        record_counter += 1
        if (verbose > 1):
            sys.stderr.write("\n Processing record: #%d .." % (record_counter,))

        # At least one (field, querystring) tuple is needed for default search query
        if not qrystrs:
            qrystrs = [("", "")]

        # Temporary store result(s) for each record
        # Each entry is a (list of record ids, query text) pair.
        matched_results = []
        ambiguous_results = []
        fuzzy_results = []
        # Go through each querystring, trying to find a matching record
        # Stops on first valid match, if no exact-match we continue with fuzzy match
        for field, qrystr in qrystrs:
            # complete is the second value returned by create_query; a
            # false value downgrades a single hit to "ambiguous" below.
            query, complete = querystring.create_query(rec[0], qrystr)
            if query == "":
                if (verbose > 1):
                    sys.stderr.write("\nEmpty query. Skipping...\n")
                # Empty query, no point searching database
                continue

            if not complete:
                if (verbose > 1):
                    sys.stderr.write("\nQuery not complete. Flagged as uncertain/ambiguous...\n")

            # Determine proper search parameters
            if search_mode != None:
                search_params = dict(p1=query, f1=field, m1=search_mode, of='id')
            else:
                search_params = dict(p=query, f=field, of='id')

            ## Perform the search with retries
            result_recids = server.search_with_retry(**search_params)
            if (verbose > 8):
                if len(result_recids) > 10:
                    sys.stderr.write("\nSearching with values %s result=%s\n" %
                                 (search_params, "More then 10 results..."))
                else:
                    sys.stderr.write("\nSearching with values %s result=%s\n" %
                                 (search_params, result_recids))
            sleep(sleeptime)
            ## Check results:
            # Ambiguous match
            # Only 2 to 10 hits count as ambiguous; more than 10 hits fall
            # through to the "No match" branch at the bottom.
            if len(result_recids) > 1 and len(result_recids) < 11:
                ambiguous_results.append((result_recids, query))
                if (verbose > 8):
                    sys.stderr.write("Ambiguous\n")
            # Match
            elif len(result_recids) == 1:
                # NOTE(review): the record id is added even when the query
                # is not complete and the hit ends up as ambiguous.
                if modify:
                    add_recid(rec[0], result_recids[0])
                if complete:
                    matched_results.append((result_recids, query))
                    if (verbose > 8):
                        sys.stderr.write("Match\n")
                    # This was a complete match, so let's break out to avoid fuzzy search
                    break
                else:
                    # We treat the result as ambiguous (uncertain) when query is not complete
                    ambiguous_results.append((result_recids, query))
                    if (verbose > 8):
                        sys.stderr.write("Ambiguous\n")
            # No match
            else:
                if (verbose > 8):
                    sys.stderr.write("New (no matches)\n")
        # No complete matches, lets try fuzzy matching of all the queries
        # (for/else: this branch runs only when the loop above was not left
        # through the break of a complete match)
        else:
            ## Fuzzy matching: Analyze all queries and perform individual searches, then intersect results.
            for field, qrystr in qrystrs:
                query, complete = querystring.create_query(rec[0], qrystr)
                if query == "":
                    if (verbose > 1):
                        sys.stderr.write("\nEmpty query. Skipping...\n")
                    # Empty query, no point searching database
                    continue
                result_hitset = None
                # presumably derived from the query just created by
                # create_query - verify against Querystring
                fuzzy_query_list = querystring.fuzzy_queries()
                empty_results = 0
                # Go through every expression in the query and generate fuzzy searches
                for current_operator, qry in fuzzy_query_list:
                    current_resultset = None
                    search_params = dict(p=qry, f=field, of='id')
                    current_resultset = server.search_with_retry(**search_params)
                    # NOTE(review): len() runs before the None check below,
                    # so a None result raises TypeError when verbose > 8.
                    if (verbose > 8):
                        if len(current_resultset) > 10:
                            sys.stderr.write("\nSearching with values %s result=%s\n" %
                                         (search_params, "More then 10 results..."))
                        else:
                            sys.stderr.write("\nSearching with values %s result=%s\n" %
                                         (search_params, current_resultset))
                    sleep(sleeptime)
                    if current_resultset == None:
                        continue
                    if current_resultset == [] and empty_results < CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT:
                        # Allows some empty results
                        # (they are counted but not combined into the hitset)
                        empty_results += 1
                    else:
                        # Intersect results with previous results depending on current operator
                        # The first combined result seeds the hitset.
                        if result_hitset == None:
                            result_hitset = current_resultset
                        if current_operator == '+':
                            result_hitset = list(set(result_hitset) & set(current_resultset))
                        elif current_operator == '-':
                            result_hitset = list(set(result_hitset) - set(current_resultset))
                        elif current_operator == '|':
                            result_hitset = list(set(result_hitset) | set(current_resultset))

                # Hitsets that are empty or have 10 or more ids are ignored.
                if result_hitset and len(result_hitset) < 10:
                    # This was a fuzzy match
                    query_out = " #Fuzzy# ".join([q for dummy, q in fuzzy_query_list])
                    if len(result_hitset) == 1 and complete:
                        if modify:
                            add_recid(rec[0], result_hitset[0])
                        fuzzy_results.append((result_hitset, query_out))
                        if (verbose > 8):
                            sys.stderr.write("Fuzzy: %s\n" % (result_hitset,))
                    else:
                        # We treat the result as ambiguous (uncertain) when:
                        # - query is not complete
                        # - more then one result
                        ambiguous_results.append((result_hitset, query_out))
                        if (verbose > 8):
                            sys.stderr.write("Ambiguous\n")

        ## Evaluate final results for record
        # Add matched record iff number found is equal to one, otherwise return fuzzy, ambiguous or no match
        if len(matched_results) == 1:
            results, query = matched_results[0]
            matchedrecs.append((rec[0], "<!-- BibMatch-Matching-Results: -->\n%s" % (match_result_output(results, server_url, \
                                                                                                         query, "exact-matched"))))
            if (verbose > 1):
                sys.stderr.write("Final result: match\n")
        else:
            if len(fuzzy_results) > 0:
                # Find common record-id for all fuzzy results and grab first query as "representative" query
                # (the ids of all fuzzy results are merged into one set)
                query = fuzzy_results[0][1]
                result_lists = []
                for res, dummy in fuzzy_results:
                    result_lists.extend(res)
                results = set([res for res in result_lists])
                fuzzyrecs.append((rec[0], "<!-- BibMatch-Matching-Results: -->\n%s" % (match_result_output(results, server_url, \
                                                                                            query, "fuzzy-matched"),)))
                if (verbose > 1):
                    sys.stderr.write("Final result: fuzzy\n")
            elif len(ambiguous_results) > 0:
                # Find common record-id for all ambiguous results and grab first query as "representative" query
                # (the ids of all ambiguous results are merged into one set)
                query = ambiguous_results[0][1]
                result_lists = []
                for res, dummy in ambiguous_results:
                    result_lists.extend(res)
                results = set([res for res in result_lists])
                ambiguousrecs.append((rec[0], "<!-- BibMatch-Matching-Results: -->\n%s" % (match_result_output(results, server_url, \
                                                                                            query, "ambiguous-matched"),)))
                if (verbose > 1):
                    sys.stderr.write("Final result: ambiguous\n")
            else:
                # Nothing found at all: report the record as new, with the
                # whole list of querystrings as the "query".
                newrecs.append((rec[0], "<!-- BibMatch-Matching-Results: -->\n%s" % (match_result_output([], server_url, str(qrystrs)),)))
                if (verbose > 1):
                    sys.stderr.write("Final result: new\n")
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
 def test_remote_search(self):
     """InvenioConnector - remote search

     A plain search for "ellis" on inspirebeta.net must return at least
     one result.
     """
     remote = InvenioConnector("http://inspirebeta.net")
     hits = remote.search(of="id", p="ellis")
     failure_note = "did not get remote search results from http://inspirebeta.net."
     self.assertTrue(len(hits) > 0, failure_note)
 def test_search_collections(self):
     """InvenioConnector - collection search

     An empty query restricted to the 'Books' collection of the local site
     (CFG_SITE_URL) must return at least one result.
     """
     local = InvenioConnector(CFG_SITE_URL)
     hits = local.search(of='id', p='', c=['Books'])
     failure_note = 'did not get collection search results.'
     self.assertTrue(len(hits) > 0, failure_note)
示例#19
0
def match_records(records, qrystrs=None, search_mode=None, operator="and", verbose=1, \
                  server_url=CFG_SITE_SECURE_URL, modify=0, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \
                  clean=False, collections=None, user="", password=""):
    """
    Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    record's matching is appended to each record tuple:
    (record, status_code, list_of_errors, result)

    If the connection to the server fails with an authentication error, the
    error is reported on stderr (when verbose > 0) and four empty lists are
    returned.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: list of tuples (field, querystring)
    @type qrystrs: list

    @param search_mode: if mode is given, the search will perform an advanced query using
                        the desired mode. Otherwise 'simple search' is used.
    @type search_mode: str

    @param operator: operator used to concatenate values of fields occurring more than once.
                     Valid types are: AND, OR. Defaults to AND.
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param modify: output modified records of matches
    @type modify: int

    @param sleeptime: amount of time to wait between each query
    @type sleeptime: float

    @param clean: should the search queries be cleaned before passing them along?
    @type clean: bool

    @param collections: list of collections to search, if specified.
                        Defaults to an empty list.
    @type collections: list

    @param user: username in case of authenticated search requests
    @type user: string

    @param password: password in case of authenticated search requests
    @type password: string

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """
    # A fresh list per call: a mutable default would be shared between calls.
    if collections is None:
        collections = []

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    try:
        server = InvenioConnector(server_url, user=user, password=password)
    except InvenioConnectorAuthError as error:
        # Without a working connection nothing can be matched; hand back the
        # (empty) result lists instead of propagating the error.
        if verbose > 0:
            sys.stderr.write("Authentication error when connecting to server: %s" \
                             % (str(error),))
        return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
    # NOTE(review): this excerpt stops here, so the per-record matching that
    # would use `server` is not visible -- confirm against the full function.
示例#20
0
def _set_record_id(record, recid):
    """Write recid into the 001 controlfield of record, adding the field if absent."""
    if record_has_field(record, '001'):
        record_modify_controlfield(record, '001', \
                                   controlfield_value=str(recid), \
                                   field_position_global=1)
    else:
        record_add_field(record, '001', controlfield_value=str(recid))


def _intersect_word_hits(server, words, field, intersected, verbose):
    """Search each quoted word in field and intersect the hits with intersected.

    intersected may be None, meaning "no search done yet"; the hits of the
    first word then seed the running intersection. The updated intersection
    is returned (still None when words is empty and nothing was seeded).
    """
    for word in words:
        quoted = "'"+word+"'"
        ilist = server.search(p=quoted, f=field, of="id")
        if verbose > 8:
            sys.stderr.write("fuzzy perform_request_search with values"+\
                             " p="+str(quoted)+" f="+str(field)+" res "+str(ilist)+"\n")
        if intersected is None:
            intersected = ilist
        intersected = list(set(ilist) & set(intersected))
    return intersected


def match_records(records, qrystrs=None, perform_request_search_mode="eee", \
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """ Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    For every record and every querystring a three-field search is run on the
    server. More than one hit is "ambiguous", exactly one is "matched"; with
    no hit a fuzzy search intersects the hits of the individual main words of
    each search value. A record whose first extracted field value is empty is
    not searched and ends up in none of the returned lists.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings. None or an empty list means one default
                    query; the passed list is not modified.
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
    @type perform_request_search_mode: string

    @param operator: "o" "a"
    @type operator: str

    @param verbose: be loud
    @type verbose: int

    @param modify: output modified records of matches
    @type modify: int

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """

    server = InvenioConnector(server_url)

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    # An empty string stands for "use the default query". Use a local list so
    # the caller's (possibly empty) list is never appended to.
    if qrystrs:
        querystrings = qrystrs
    else:
        querystrings = [""]

    for record_counter, rec in enumerate(records, 1):
        if verbose > 1:
            sys.stderr.write("\n Processing record: #%d .." % record_counter)

        for qrystr in querystrings:
            querystring = Querystring()
            querystring.default()

            if qrystr != "":
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                querystring.default()

            querystring.search_engine_encode()

            ### get field values for record instance
            inst = []

            ### get appropriate fields from database
            for field in querystring.field:
                tags = get_field_tags(field)
                if tags:
                    # Fetch value from input record of first tag only
                    # FIXME: Extracting more then first tag, evaluating each
                    field = tags[0]
                ### use expanded tags: 3 chars tag, then ind1, ind2, subfield code
                tag = field[0:3]
                ind1 = field[3:4]
                ind2 = field[4:5]
                code = field[5:6]

                # "_" and "%" are placeholders: blank indicators, subfield "a"
                if ind1 in ("_", "%"):
                    ind1 = ""
                if ind2 in ("_", "%"):
                    ind2 = ""
                if code in ("_", "%"):
                    code = "a"

                if field != "001":
                    finsts = record_get_field_instances(rec[0], tag, ind1, ind2)
                    inst.append(get_subfield(finsts, code))
                else:
                    # 001 is a controlfield: no indicators, no subfield code
                    inst.append(record_get_field_values(rec[0], field, ind1="",
                                                        ind2="", code=""))

            ### format acquired field values
            for idx, value in enumerate(inst):
                for fmt in querystring.format[idx]:
                    value = bibconvert.FormatField(value, fmt)
                inst[idx] = value

            ### perform the search (only when the first value is non-empty)
            if inst[0] == "":
                continue

            p1 = inst[0]
            f1 = querystring.field[0]
            m1 = querystring.mode[0]
            op1 = querystring.operator[0]

            p2 = inst[1]
            f2 = querystring.field[1]
            m2 = querystring.mode[1]
            op2 = querystring.operator[1]

            p3 = inst[2]
            f3 = querystring.field[2]
            m3 = querystring.mode[2]

            #1st run the basic perform_req_search
            recID_list = server.search(
                p1=p1, f1=f1, m1=m1, op1=op1,
                p2=p2, f2=f2, m2=m2, op2=op2,
                p3=p3, f3=f3, m3=m3, of='id')

            if verbose > 8:
                sys.stderr.write("\nperform_request_search with values"+\
                 " p1="+str(p1)+" f1="+str(f1)+" m1="+str(m1)+" op1="+str(op1)+\
                 " p2="+str(p2)+" f2="+str(f2)+" m2="+str(m2)+" op2="+str(op2)+\
                 " p3="+str(p3)+" f3="+str(f3)+" m3="+str(m3)+\
                 " result="+str(recID_list)+"\n")

            if len(recID_list) > 1: #ambig match
                ambiguousrecs.append(rec + (match_result_output(recID_list, \
                                            server_url, querystring, "ambiguous-matched"), ))
                if verbose > 8:
                    sys.stderr.write("ambiguous\n")
            elif len(recID_list) == 1: #match
                if modify:
                    _set_record_id(rec[0], recID_list[0])
                matchedrecs.append(rec + (match_result_output(recID_list, \
                                            server_url, querystring, "exact-matched"), ))
                if verbose > 8:
                    sys.stderr.write("match\n")
            else: #no match..
                #try fuzzy matching: check if all the words appear in the
                #field of interest
                word_lists = [main_words_list(phrase) for phrase in (p1, p2, p3)]
                intersected = None
                for words, search_field in zip(word_lists, (f1, f2, f3)):
                    intersected = _intersect_word_hits(server, words, search_field,
                                                       intersected, verbose)

                if intersected:
                    #this was a fuzzy match
                    if modify:
                        _set_record_id(rec[0], intersected[0])
                    fuzzyrecs.append(rec + (match_result_output(intersected, \
                                            server_url, querystring, "fuzzy-matched"), ))
                    if verbose > 8:
                        sys.stderr.write("fuzzy\n")
                else:
                    #no match
                    newrecs.append(rec + (match_result_output(recID_list, \
                                            server_url, querystring), ))
                    if verbose > 8:
                        sys.stderr.write("new\n")
    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
 def test_local_search(self):
     """InvenioConnector - local search

     Queries the site at CFG_SITE_URL for the pattern "ellis" and expects
     a non-empty result.
     """
     search_params = dict(p="ellis", of="id")
     connector = InvenioConnector(CFG_SITE_URL)
     hits = connector.search(**search_params)
     self.assertTrue(
         len(hits) > 0,
         "did not get local search results.",
     )
 def test_local_search(self):
     """InvenioConnector - local search

     Looks up 'ellis' on the site at CFG_SITE_URL; the search must return
     something non-empty.
     """
     failure_message = 'did not get local search results.'
     record_ids = InvenioConnector(CFG_SITE_URL).search(p='ellis', of='id')
     found_any = len(record_ids) > 0
     self.assertTrue(found_any, failure_message)
 def test_search_collections(self):
     """InvenioConnector - collection search

     Searches the "Books" collection on the site at CFG_SITE_URL with an
     empty pattern and expects a non-empty result.
     """
     search_params = dict(p="", c=["Books"], of="id")
     connector = InvenioConnector(CFG_SITE_URL)
     hits = connector.search(**search_params)
     self.assertTrue(
         len(hits) > 0,
         "did not get collection search results.",
     )
 def test_remote_search(self):
     """InvenioConnector - remote search

     Looks up 'ellis' on the remote demo site; the search must return
     something non-empty.
     """
     remote_url = "http://invenio-demo.cern.ch"
     failure_message = 'did not get remote search results from http://invenio-demo.cern.ch'
     connector = InvenioConnector(remote_url)
     record_ids = connector.search(p='ellis', of='id')
     self.assertTrue(len(record_ids) > 0, failure_message)
 def test_search_collections(self):
     """InvenioConnector - collection search

     An empty-pattern search limited to the 'Books' collection of the site
     at CFG_SITE_URL must return something non-empty.
     """
     connector = InvenioConnector(CFG_SITE_URL)
     wanted_collections = ['Books']
     hits = connector.search(p='', c=wanted_collections, of='id')
     has_results = len(hits) > 0
     self.assertTrue(has_results, 'did not get collection search results.')
示例#26
0
def main():
    """Command-line entry point: fix MARCXML records and print them to stdout.

    Two modes, selected from sys.argv:
      * -r/--recid given: fetch that record as MARCXML from an Invenio site
        (-s/--site overrides the default test site), fix it and write it out.
      * otherwise: treat every positional argument as a MARCXML file, fix
        each one and write them out wrapped in a <collection> element. A file
        that fails is reported on stderr and skipped.

    Exits with status 2 on bad options, when neither a file nor a record id
    is given, or when the requested record is not found.
    """
    usage = """
    save to file:
    python fix_marc_record.py marc_file*.xml >> result_file.xml

    print to terminal:
    python fix_marc_record.py marc_file*.xml

    options:
    --recid -r
    fix the record with the given record id from https://inspireheptest.cern.ch
    e.g. python fix_marc_record.py --recid=1291107
    --site -s
    specify a different site useful only when option --recid or -r enabled
    e.g. python fix_marc_record.py -r 1291107 -s http://inspirehep.net
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:", ["recid=", "site="])
        # A real list, not map(): it is tested for membership several times,
        # and a Python 3 map iterator would be exhausted after the first test.
        options = [name for name, _ in opts]
        recid_requested = "-r" in options or "--recid" in options
        if not args and not recid_requested:
            raise getopt.GetoptError("Missing argument record to fix")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)

    if recid_requested:
        from invenio.invenio_connector import InvenioConnector
        from xml.dom.minidom import parseString

        site = "http://inspireheptest.cern.ch/"
        for o, a in opts:
            if o in ["-s", "--site"]:
                site = a
            if o in ["-r", "--recid"]:
                recid = a
        inspiretest = InvenioConnector(site)
        record = inspiretest.search(p="001:%s" % recid, of="xm")
        marcxml = parseString(record)
        try:
            marcxml = marcxml.getElementsByTagName("record")[0]
        except IndexError:
            print("Record not found")
            sys.exit(2)

        marcxml = fix_authors(marcxml)
        marcxml = fix_title(marcxml)
        marcxml = fix_fft(marcxml)

        # NOTE(review): this writes UTF-8 encoded bytes; a Python 3 text-mode
        # stdout would reject them -- confirm the target interpreter.
        sys.stdout.write(marcxml.toxml().encode("utf8"))
    else:
        print("<collection>")
        for filename in args:
            try:
                strip_bom(filename)
                marcxml = parse(filename)
                marcxml = fix_authors(marcxml)
                marcxml = fix_title(marcxml)
                marcxml = fix_fft(marcxml)
                sys.stdout.write(marcxml.toxml().encode("utf8"))
            except Exception as err:
                # Deliberate best effort: one broken file must not abort the
                # whole batch, so report it and carry on with the next one.
                print("ERROR with file %s: %s. Skipping file...." % (filename, err), file=sys.stderr)
        print("</collection>")