示例#1
0
 def test_limw_default(self):
     """bibconvert - LIMW(,)

     With an empty delimiter argument the value is expected to come back
     unchanged, both without a direction flag and with the "R" flag.
     """
     original = "ab cd xx 12 34"
     for spec in ("LIMW(,)", "LIMW(,R)"):
         formatted = bibconvert.FormatField(original, spec)
         self.assertEqual(original, formatted)
 def test_limw_left_regex(self):
     """bibconvert - LIMW(c,L) with regular expression

     The delimiter is given as a ``//regex//`` pattern.  A pattern that
     matches is expected to cut the value after the first match; a pattern
     that never matches is expected to leave the value untouched.

     NOTE(review): despite the method name, both assertions pass the "R"
     direction flag - confirm whether an "L" variant was intended.
     """
     test_input = "ab cd xx 12 34"
     # Raw string: the backslash must reach bibconvert literally.  In a
     # plain literal "\s" is an invalid escape sequence, which newer Python
     # versions warn about.
     self.assertEqual("ab ",
             bibconvert.FormatField(test_input, r"LIMW(//\s//,R)"))
     # No character of the input matches [!_-], so nothing is cut.
     self.assertEqual(test_input,
             bibconvert.FormatField(test_input, "LIMW(//[!_-]//,R)"))
示例#3
0
 def test_words_right(self):
     """bibconvert - WORDS(n,R)

     Expects WORDS(n,R) to yield the leading n words of the input.
     """
     # (expected result, input value, format specification)
     cases = (
         ("ab cd", "ab cd xx 12 34", "WORDS(2,R)"),
         ("Sep", "Sep 1999", "WORDS(1,R)"),
     )
     for expected, value, spec in cases:
         self.assertEqual(expected, bibconvert.FormatField(value, spec))
示例#4
0
 def test_words_left(self):
     """bibconvert - WORDS(n,L)

     Expects WORDS(n,L) to yield the trailing n words of the input.
     """
     # (expected result, input value, format specification)
     cases = (
         ("12 34", "ab cd xx 12 34", "WORDS(2,L)"),
         ("1999", "Sep 1999", "WORDS(1,L)"),
     )
     for expected, value, spec in cases:
         self.assertEqual(expected, bibconvert.FormatField(value, spec))
示例#5
0
 def test_limw_right(self):
     """bibconvert - LIMW(c,R)

     Expects LIMW(c,R) to keep the input up to and including the first
     occurrence of the delimiter character c.
     """
     # (expected result, input value, format specification)
     cases = (
         ("ab ", "ab cd xx 12 34", "LIMW( ,R)"),
         ("sep_", "sep_1999", "LIMW(_,R)"),
     )
     for expected, value, spec in cases:
         self.assertEqual(expected, bibconvert.FormatField(value, spec))
示例#6
0
 def test_lim_left(self):
     """bibconvert - LIM(n,L)

     Expects LIM(n,L) to keep only the last n characters of the input.
     """
     spec = "LIM(4,L)"
     # (expected result, input value)
     cases = (
         ("2 34", "ab cd xx 12 34"),
         ("1999", "sep_1999"),
     )
     for expected, value in cases:
         self.assertEqual(expected, bibconvert.FormatField(value, spec))
示例#7
0
 def test_ff_regex(self):
     """bibconvert - formatting functions with regular expression

     Exercises REP, EXP and IF with ``//regex//`` arguments, each with a
     pattern that matches the input and one that does not.
     """
     # (expected result, input value, format specification)
     cases = (
         ("Hello world!", "Hellx wyrld!", "REP(//[xy]//,o)"),
         ("Hello world!", "Hello world!", "REP(//[abc]//,o)"),
         ("Hello world!", "Hello world! @", "EXP(//[@_]//,1)"),
         ("Hello world!", "Hello world! abc", "EXP(//[oz]+//,0)"),
         ("Hello world!", "Hello world!", "EXP(//[abc]+//,1)"),
         ("lala", "Hello world!", "IF(//^Hello .*!$//,lala,lolo)"),
         ("lolo", "Hello world!", "IF(//^Hello .*x$//,lala,lolo)"),
     )
     for expected, value, spec in cases:
         self.assertEqual(expected, bibconvert.FormatField(value, spec))
示例#8
0
    def create_query(self, record, qrystr="[title]"):
        """
        Build a search query for a record from a query-string template.

        Every ``[reference]`` found in the template is resolved to one or
        more MARC tags, the matching values are read from the record
        (optionally run through BibConvert formats) and substituted back
        into the template. One query is produced per combination of found
        values; the distinct queries are joined with self.operator.

        Sets self.pattern, self.fields and self.query as a side effect.

        @param record: bibrecord to retrieve field-values from
        @type record: dict

        @param qrystr: query-string template (i.e. title:[245__a]).
                       An empty string is treated as "[title]".
        @type qrystr: str

        @return: (query-string, complete flag); the flag is False when at
                 least one referenced field yielded no non-blank value
        @rtype: tuple
        """
        if qrystr == "":
            qrystr = "[title]"
        # Templates with "||" or without any "[" are taken to be old-style
        # query-strings and are converted first.
        if "||" in qrystr or "[" not in qrystr:
            qrystr = self._convert_qrystr(qrystr)

        # FIXME: The pattern is lower-cased to account for fuzzy_parser,
        # which treats everything lower-case, and to avoid KeyError when
        # retrieving data from the self.fields dict.
        # BibConvert formats, however, are currently case sensitive.
        self.pattern = qrystr.lower()
        self.fields = {}
        referenced_names = []

        for reference in re_querystring.findall(qrystr):
            # Formats are extracted from the reference in its original case,
            # because BibConvert formats are case-sensitive.
            fieldname = self._extract_formats(reference)
            old_marker = "[%s]" % (reference.lower(), )
            new_marker = "[%s]" % (fieldname, )
            self.pattern = self.pattern.replace(old_marker, new_marker)

            # Resolve the fieldname to MARC tag(s); fall back to the name
            # itself when no mapping is known.
            candidates = get_field_tags_from_fieldname(fieldname)
            if len(candidates) == 0:
                candidates = [fieldname]

            for candidate in candidates:
                # Skip anything that is not really a tag reference
                # (e.g. regex syntax that happens to use brackets).
                if re_valid_tag.match(candidate) is None:
                    continue

                tag = candidate[0:3]
                ind1 = candidate[3:4]
                ind2 = candidate[4:5]
                code = candidate[5:6]
                if ind1 in ("_", "%"):
                    ind1 = ""
                if ind2 in ("_", "%"):
                    ind2 = ""

                found_values = record_get_field_values(
                    record, tag, ind1, ind2, code)
                for found in found_values:
                    if found.strip() == "":
                        continue
                    # Apply formats if applicable
                    formatted = found
                    for aformat in self.formats.get(fieldname, []):
                        formatted = bibconvert.FormatField(formatted, aformat)
                    self.fields.setdefault(fieldname, []).append(
                        (fieldname, formatted))

                # Remember the reference so completeness can be checked below
                referenced_names.append(fieldname)

        # Complete means: data was found for every referenced field-name
        complete = all(name in self.fields for name in referenced_names)

        # One query per element of the Cartesian product over all values
        expanded_queries = []
        for combination in cproduct(self.fields.values()):
            filled = self.pattern
            for name, value in combination:
                filled = filled.replace("[%s]" % (name, ), value)
            expanded_queries.append(filled)

        # Concatenate the distinct queries, delimited by the chosen operator
        self.query = self.operator.join(set(expanded_queries))

        if not complete:
            # Clean away field-name references that were not resolved
            for name in referenced_names:
                self.query = self.query.replace("[%s]" % (name, ), "")

        if self.clean:
            self._clean_query()
        return self.query, complete
示例#9
0
    def test_gff(self):
        """bibconvert - global formatting functions

        Expects DEFP() to hand back "Hello world!" unchanged.
        """
        formatted = bibconvert.FormatField("Hello world!", "DEFP()")
        self.assertEqual("Hello world!", formatted)
示例#10
0
    def test_ff(self):
        """bibconvert - formatting functions

        Runs one representative call for each basic formatting function
        and compares the result against the expected string.
        """
        # (expected result, input value, format specification)
        cases = (
            ("Hello world!", "ello world", "ADD(H,!)"),
            ("Hello world!", "Hello world", "ABR(11,!)"),
            ("Hello world!", "xHello world!x", "CUT(x,x)"),
            ("Hello world!", "He11o wor1d!", "REP(1,l)"),
            ("Hello world!", "Hello world!", "SUP(NUM)"),
            ("Hello world!", "Hello world!", "LIM(12,R)"),
            ("Hello world!", "Hello world!", "WORDS(2)"),
            ("Hello world!", "Hello world!", "MINL(5)"),
            ("Hello world!", "Hello world!", "MAXL(12)"),
            ("Hello world!", "Hello world! @", "EXP(@,1)"),
            ("Hello world!", "Hello world!", "IF(Hello world!,ORIG,)"),
            ("", "Hello world!", "NUM()"),
            ("Hello world!", "Hello world!   ", "SHAPE()"),
            ("HELLO WORLD!", "Hello world!", "UP()"),
            ("hello world!", "Hello world!", "DOWN()"),
            ("Hello World!", "Hello world!", "CAP()"),
        )
        for expected, value, spec in cases:
            self.assertEqual(expected, bibconvert.FormatField(value, spec))
示例#11
0
 def test_words_exceed_wordcount(self):
     """bibconvert - WORDS(2,R) when less then 2 words in value

     A value with fewer words than requested is expected to come back
     unchanged.
     """
     single_word = "ab"
     formatted = bibconvert.FormatField(single_word, "WORDS(2,R)")
     self.assertEqual(single_word, formatted)
示例#12
0
 def test_words_default(self):
     """bibconvert - WORDS(,)

     With both arguments left empty the value is expected to come back
     unchanged.
     """
     original = "ab cd xx 12 34"
     formatted = bibconvert.FormatField(original, "WORDS(,)")
     self.assertEqual(original, formatted)
示例#13
0
 def test_limw_left(self):
     """bibconvert - LIMW(c,L)

     Expects LIMW(c,L) to keep the input from the first occurrence of the
     delimiter character c (inclusive) to the end.
     """
     formatted = bibconvert.FormatField("ab cd xx 12 34", "LIMW( ,L)")
     self.assertEqual(" cd xx 12 34", formatted)
示例#14
0
    def test_regexp(self):
        """bibconvert - regular expressions

        Expects RE(...) with a pattern matching the whole value to return
        that value.
        """
        formatted = bibconvert.FormatField("Hello world!",
                                           "RE([A-Z][a-z].*!)")
        self.assertEqual("Hello world!", formatted)
示例#15
0
def match_records(records, qrystrs=None, perform_request_search_mode="eee",
                  operator="a", verbose=1, server_url=CFG_SITE_URL, modify=0):
    """ Match passed records with existing records on a local or remote Invenio
    installation. Returns which records are new (no match), which are matched,
    which are ambiguous and which are fuzzy-matched. A formatted result of each
    records matching are appended to each record tuple:
    (record, status_code, list_of_errors, result)

    Each record is tried against every querystring in turn, so a record may
    be appended to more than one result list. A querystring whose first
    field yields an empty value for the record is skipped without searching;
    a record for which that happens with every querystring ends up in none
    of the returned lists.

    @param records: records to analyze
    @type records: list of records

    @param qrystrs: Querystrings. None or an empty list means: use the
                    Querystring defaults. The passed list is not modified.
    @type qrystrs: list of object

    @param server_url: which server to search on. Local installation by default
    @type server_url: str

    @param perform_request_search_mode: run the query in this mode
    @type perform_request_search_mode: string

    @param operator: "o" "a"
    @type operator: str

    @param verbose: be loud (> 1 reports progress, > 8 dumps every search)
    @type verbose: int

    @param modify: output modified records of matches; when true, field 001
                   of an exact- or fuzzy-matched record is set to the
                   matched record ID
    @type modify: int

    @rtype: list of lists
    @return an array of arrays of records, like this [newrecs,matchedrecs,
                                                      ambiguousrecs,fuzzyrecs]
    """

    server = InvenioConnector(server_url)

    # Fall back to a single empty querystring (which selects the Querystring
    # defaults). A fresh list is bound here rather than appending to the
    # argument, so an empty list owned by the caller is never mutated.
    if not qrystrs:
        qrystrs = [""]

    newrecs = []
    matchedrecs = []
    ambiguousrecs = []
    fuzzyrecs = []

    for position, rec in enumerate(records):
        if verbose > 1:
            sys.stderr.write("\n Processing record: #%d .." % (position + 1))

        for qrystr in qrystrs:
            querystring = Querystring()
            querystring.default()

            if qrystr != "":
                querystring.from_qrystr(qrystr,
                                        perform_request_search_mode,
                                        operator)
            else:
                # NOTE(review): default() was already called above; the
                # second call is kept from the original - confirm redundant.
                querystring.default()

            querystring.search_engine_encode()

            ### get and format the field values for this record instance
            inst = _match_records_field_values(rec[0], querystring)

            ### perform the search (only if the first field has a value)
            if inst[0] == "":
                continue

            p1 = inst[0]
            f1 = querystring.field[0]
            m1 = querystring.mode[0]
            op1 = querystring.operator[0]

            p2 = inst[1]
            f2 = querystring.field[1]
            m2 = querystring.mode[1]
            op2 = querystring.operator[1]

            p3 = inst[2]
            f3 = querystring.field[2]
            m3 = querystring.mode[2]

            #1st run the basic perform_req_search
            recID_list = server.search(
                p1=p1, f1=f1, m1=m1, op1=op1,
                p2=p2, f2=f2, m2=m2, op2=op2,
                p3=p3, f3=f3, m3=m3, of='id')

            if verbose > 8:
                sys.stderr.write(
                    "\nperform_request_search with values" +
                    " p1=" + str(p1) + " f1=" + str(f1) + " m1=" + str(m1) +
                    " op1=" + str(op1) +
                    " p2=" + str(p2) + " f2=" + str(f2) + " m2=" + str(m2) +
                    " op2=" + str(op2) +
                    " p3=" + str(p3) + " f3=" + str(f3) + " m3=" + str(m3) +
                    " result=" + str(recID_list) + "\n")

            if len(recID_list) > 1: #ambig match
                ambiguousrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring,
                    "ambiguous-matched"), ))
                if verbose > 8:
                    sys.stderr.write("ambiguous\n")
            elif len(recID_list) == 1: #match
                if modify:
                    _match_records_set_recid(rec[0], recID_list[0])
                matchedrecs.append(rec + (match_result_output(
                    recID_list, server_url, querystring,
                    "exact-matched"), ))
                if verbose > 8:
                    sys.stderr.write("match\n")
            else: #no match..
                #try fuzzy matching: check if all the words appear in the
                #field of interest
                intersected = _match_records_fuzzy_search(
                    server, ((p1, f1), (p2, f2), (p3, f3)), verbose)

                if intersected:
                    #this was a fuzzy match
                    if modify:
                        # The first ID of the intersection is used; the
                        # intersection is built from sets, so its order is
                        # not defined by this code.
                        _match_records_set_recid(rec[0], intersected[0])
                    fuzzyrecs.append(rec + (match_result_output(
                        intersected, server_url, querystring,
                        "fuzzy-matched"), ))
                    if verbose > 8:
                        sys.stderr.write("fuzzy\n")
                else:
                    #no match
                    newrecs.append(rec + (match_result_output(
                        recID_list, server_url, querystring), ))
                    if verbose > 8:
                        sys.stderr.write("new\n")
    #return results
    return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]


def _match_records_field_values(record, querystring):
    """Return one formatted value per querystring field, read from record."""
    values = []

    ### get appropriate fields from database
    for field in querystring.field:
        tags = get_field_tags(field)
        if len(tags) > 0:
            # Fetch value from input record of first tag only
            # FIXME: Extracting more then first tag, evaluating each
            field = tags[0]
        ### use expanded tags
        tag = field[0:3]
        ind1 = field[3:4]
        ind2 = field[4:5]
        code = field[5:6]

        if ind1 in ("_", "%"):
            ind1 = ""
        if ind2 in ("_", "%"):
            ind2 = ""
        if code in ("_", "%"):
            code = "a"

        if field != "001":
            field_instances = record_get_field_instances(record, tag,
                                                         ind1, ind2)
            values.append(get_subfield(field_instances, code))
        else:
            # Field 001 is read directly through record_get_field_values
            values.append(record_get_field_values(record, field, ind1="",
                                                  ind2="", code=""))

    ### format acquired field values
    for index, value in enumerate(values):
        for field_format in querystring.format[index]:
            value = bibconvert.FormatField(value, field_format)
        values[index] = value
    return values


def _match_records_set_recid(record, recid):
    """Store recid in controlfield 001 of record, replacing or adding it."""
    if record_has_field(record, '001'):
        record_modify_controlfield(record, '001',
                                   controlfield_value=str(recid),
                                   field_position_global=1)
    else:
        record_add_field(record, '001', controlfield_value=str(recid))


def _match_records_fuzzy_search(server, criteria, verbose):
    """Return the IDs found for every main word of every (pattern, field)
    pair in criteria, or None when there was no word to search for."""
    # Split all patterns into words before the first search is issued
    word_lists = [(main_words_list(pattern), field)
                  for pattern, field in criteria]

    intersected = None
    for words, field in word_lists:
        for word in words:
            word = "'" + word + "'"
            ilist = server.search(p=word, f=field, of="id")
            if verbose > 8:
                sys.stderr.write(
                    "fuzzy perform_request_search with values" +
                    " p=" + str(word) + " f=" + str(field) +
                    " res " + str(ilist) + "\n")
            if intersected is None:
                intersected = ilist
            intersected = list(set(ilist) & set(intersected))
    return intersected