def are_zbl_records_similar(rec1, rec2):
    """Return True if (we believe that) rec1 describes the same data as rec2.

    Decision order, as implemented below:

    1. If both records have zbl_io.ZBL_ID_FIELD and the values are equal,
       the records are considered similar.
    2. Otherwise, if both records have "zb", the answer is whether the two
       "zb" values are equal (no further checks are made).
    3. Otherwise, if both records have "mr", the answer is whether the two
       "mr" values are equal (no further checks are made).
    4. Otherwise the records are similar only if all of these hold:
       - "py" values agree whenever both records have one,
       - both records have "au" and the unpacked author lists pass
         are_lists_almost_equal(au1, au2, 4, 2),
       - the lower-cased "ti" values (a missing title counts as '') pass
         are_elements_almost_equal(ti1, ti2, 6, 4).

    rec1, rec2 -- dict-like records (membership test, indexing and .get()
    are used on them).
    """
    # `in` replaces the deprecated dict.has_key(), which no longer exists
    # in Python 3; the rest of this file already uses `in` on records.
    id_field = zbl_io.ZBL_ID_FIELD
    if id_field in rec1 and id_field in rec2:
        if rec1[id_field] == rec2[id_field]:
            return True

    # The first identifier present in both records decides the result outright.
    for key in ("zb", "mr"):
        if key in rec1 and key in rec2:
            return rec1[key] == rec2[key]

    # If present, publication years must agree.
    if "py" in rec1 and "py" in rec2:
        if rec1["py"] != rec2["py"]:
            return False

    # Both records must have authors.
    if "au" not in rec1 or "au" not in rec2:
        return False
    au1 = zbl_io.unpack_multivalue_field(rec1["au"])
    au2 = zbl_io.unpack_multivalue_field(rec2["au"])
    if not are_lists_almost_equal(au1, au2, 4, 2):
        return False

    # Are titles similar?
    ti1 = rec1.get('ti', '').lower()
    ti2 = rec2.get('ti', '').lower()
    if not are_elements_almost_equal(ti1, ti2, 6, 4):
        return False

    return True
Exemplo n.º 2
0
def _update_zbl_record_history_(main_zbl_record, aux_zbl_record):
    """Merge the <zz> field of aux_zbl_record into the <zz> field of main_zbl_record.

    <zz> - field that identifies a record's source (should be merged instead
    of being overwritten).

    When both records have "zz", the unpacked values of aux_zbl_record are
    appended after the unpacked values of main_zbl_record and the result is
    packed back into main_zbl_record["zz"]. When either record lacks "zz",
    main_zbl_record is left untouched.

    main_zbl_record is modified in place and also returned.
    """
    # `in` replaces the deprecated dict.has_key(), which was removed in Python 3.
    if "zz" in main_zbl_record and "zz" in aux_zbl_record:
        main_zz_list = zbl_io.unpack_multivalue_field(main_zbl_record["zz"])
        aux_zz_list = zbl_io.unpack_multivalue_field(aux_zbl_record["zz"])
        main_zz_list.extend(aux_zz_list)
        main_zbl_record["zz"] = zbl_io.pack_multivalue_field(main_zz_list)
    return main_zbl_record
Exemplo n.º 3
0
def _update_zbl_record_history_(main_zbl_record, aux_zbl_record):
    """Update field <zz> in main_zbl_record from its previous value and the <zz> value in aux_zbl_record.

    <zz> - field that identifies a record's source (should be merged instead
    of being overwritten).

    The merge happens only when both records have "zz": the unpacked values
    from aux_zbl_record are appended to those of main_zbl_record and the
    combined list is packed back into main_zbl_record["zz"]. Otherwise
    main_zbl_record is not changed.

    Returns main_zbl_record (the same object, modified in place).
    """
    # Membership test via `in`: dict.has_key() is deprecated and absent in Python 3.
    both_have_history = "zz" in main_zbl_record and "zz" in aux_zbl_record
    if both_have_history:
        merged = zbl_io.unpack_multivalue_field(main_zbl_record["zz"])
        merged.extend(zbl_io.unpack_multivalue_field(aux_zbl_record["zz"]))
        main_zbl_record["zz"] = zbl_io.pack_multivalue_field(merged)
    return main_zbl_record
def calc_msc2count(fin, src_field='mc'):
    """Count how often each value of src_field occurs across the records read from fin.

    Records are read with zbl_io.read_zbl_records(fin); records without
    src_field are skipped. Progress is logged every 10000 records.

    Returns a dictionary mapping each unpacked src_field value to its count.
    """
    counts = {}
    for rec_no, rec in enumerate(zbl_io.read_zbl_records(fin)):
        if rec_no % 10000 == 0:
            logging.info("[calc_msc_model] " + str(rec_no) + " records processed")
        if src_field in rec:
            for code in zbl_io.unpack_multivalue_field(rec[src_field]):
                if code in counts:
                    counts[code] += 1
                else:
                    counts[code] = 1
    return counts
def calc_msc2count(fin, src_field='mc'):
    """Build a dictionary of occurrence counts for the values stored in src_field.

    Every record yielded by zbl_io.read_zbl_records(fin) that has src_field
    contributes one count per unpacked value. A progress line is logged at
    record 0 and then every 10000 records.

    Returns the value -> count dictionary.
    """
    tally = {}
    records = zbl_io.read_zbl_records(fin)
    for index, entry in enumerate(records):
        if not index % 10000:
            progress = "[calc_msc_model] " + str(index) + " records processed"
            logging.info(progress)
        if src_field not in entry:
            continue
        for code in zbl_io.unpack_multivalue_field(entry[src_field]):
            tally[code] = 1 + tally.get(code, 0)
    return tally
def filter_af(fin, fout):
    """Copy records from fin to fout, dropping "af" fields that are empty.

    An "af" field is treated as empty when every unpacked value equals "-"
    (a field that unpacks to no values at all is also treated as empty).
    Every record, modified or not, is written to fout followed by a newline.

    Returns the number of removed fields.
    """
    counter = 0
    for record in zbl_io.read_zbl_records(fin):
        # `in` replaces the deprecated dict.has_key(), removed in Python 3.
        if "af" in record:
            af = zbl_io.unpack_multivalue_field(record["af"])
            # all() is True for an empty list, matching the original
            # "number of '-' values == len(af)" test.
            if all(a == '-' for a in af):
                record.pop("af")
                counter += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def filter_af(fin, fout):
    """Copy records from fin to fout but remove empty (only "-" values) af fields.

    For each record read from fin that has an "af" field, the field is
    unpacked; when none of its values differs from "-" the field is removed
    from the record. Each record is then written to fout, followed by a
    newline.

    Returns number of removed fields.
    """
    removed = 0
    for record in zbl_io.read_zbl_records(fin):
        # Use `in` rather than dict.has_key(), which does not exist in Python 3.
        if "af" in record:
            values = zbl_io.unpack_multivalue_field(record["af"])
            # Equivalent to the original count comparison, including for
            # an empty value list (which counts as "only '-' values").
            only_placeholders = all(value == '-' for value in values)
            if only_placeholders:
                record.pop("af")
                removed += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return removed
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Copy records from fin to fout, adding a membership vector built from the codes in src_field.

    known_msc_codes is filtered with filter_msccodes(..., re_leaf_pattern);
    the surviving codes are indexed (calc_msc2ix) and grouped by prefix
    (group_by_prefix).

    For every record that has src_field:
      - its codes are unpacked and filtered with the same pattern,
      - candidate codes are collected from the prefix groups stored under
        the first 2 and the first 3 characters of each record code,
      - msccode_membership(record codes, candidate) is evaluated for every
        candidate,
      - if there is at least one candidate, dst_field is set to the packed,
        sorted (index, membership) pairs and dbg_field to the same pairs
        with the index replaced by the code itself.

    Every record is written to fout followed by a newline, whether or not
    it was updated.

    Returns the number of updated records.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    # items() yields the same pairs as the Python-2-only iteritems() and
    # is available on both Python 2 and Python 3.
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.items())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] " + str(i) +
                         " records processed. " + str(counter) + "updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            # We only look at the codes that share a prefix fragment with
            # one of the record's codes.
            compared_codes = set()
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                compared_codes.update(prefix2msc[prefix2])
                compared_codes.update(prefix2msc[prefix3])

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))

            if len(mscmembership) > 0:  # store the results
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                # Debug field: the same pairs, with the code instead of its index.
                record[dbg_field] = zbl_io.pack_listpairs_field([
                    (ix2msc[ix], m) for ix, m in mscmembership
                ])
                counter = counter + 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Update records with an additional dst_field holding a membership vector
    calculated from the msc codes read from src_field.

    Setup: known_msc_codes is filtered with filter_msccodes(..., re_leaf_pattern),
    then indexed with calc_msc2ix and grouped with group_by_prefix.

    Per record read from fin (only when src_field is present):
      1. Unpack the record's codes and filter them with re_leaf_pattern.
      2. Gather candidate codes from the prefix groups found under each
         record code's 2-character and 3-character prefix.
      3. Compute msccode_membership(record codes, candidate) per candidate.
      4. If any candidates exist, store the sorted (index, membership) pairs
         in dst_field and the corresponding (code, membership) pairs in
         dbg_field.

    All records are written to fout, each followed by a newline.

    Returns the number of records that received dst_field.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    # iteritems() exists only on Python 2 dicts; items() gives the same
    # pairs and works on both major versions.
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.items())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] "+str(i)+" records processed. "+str(counter)+"updated.")
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            # We only look at the codes whose prefix fragments match.
            compared_codes = set()
            for record_msccode in record_msccodes:
                prefix2 = record_msccode[:2]
                prefix3 = record_msccode[:3]
                compared_codes.update(prefix2msc[prefix2])
                compared_codes.update(prefix2msc[prefix3])

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))

            if len(mscmembership) > 0:  # save the results
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                # Same pairs keyed by code rather than by index, for debugging.
                record[dbg_field] = zbl_io.pack_listpairs_field([(ix2msc[ix], m) for ix, m in mscmembership])
                counter = counter + 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter