def gensim_mapfields_dict(record, fields, filter_by_fields, dictionary, dst_field, id2token={}, dbg_field_name="g_"):
    """Map the words of selected record fields to gensim dictionary ids.

    Only a record that contains every name in filter_by_fields is processed.
    The whitespace-separated words of each field from `fields` that is
    present in the record are concatenated (in `fields` order) and handed to
    dictionary.doc2bow(). A non-empty result is packed with
    zbl_io.pack_listpairs_field() and stored in the record.

    Arguments:
        record -- mapping of field name to field text; updated in place.
        fields -- names of the fields whose words are merged.
        filter_by_fields -- names that must all be present in the record.
        dictionary -- object exposing doc2bow(list_of_words).
        dst_field -- name of the field receiving the packed doc2bow result.
        id2token -- mapping of id to token, used only for the debug field;
            ids missing from it are rendered as '?'.
        dbg_field_name -- name of the field receiving the packed
            (id, token) pairs, kept for debugging purposes.

    Returns:
        The same record object (modified or not).
    """
    # NOTE: id2token is only read below, so the shared default dict is never mutated.
    if all(field in record for field in filter_by_fields):
        # A flat comprehension instead of reduce(): reduce() without an
        # initial value raises TypeError when none of `fields` is present,
        # and repeated list concatenation is quadratic.
        words = [word
                 for field in fields if field in record
                 for word in record[field].split()]
        words_ids = dictionary.doc2bow(words)
        logging.debug("[gensim_mapfields_dict]%s -> %s", words, words_ids)
        if len(words_ids) > 0:
            record[dst_field] = zbl_io.pack_listpairs_field(words_ids)
            # Debug companion field: the same ids paired with their tokens.
            record[dbg_field_name] = zbl_io.pack_listpairs_field(
                (idx, id2token.get(idx, '?')) for idx, _count in words_ids)
    return record
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Copy records from fin to fout, adding an MSC membership vector.

    Every record read with zbl_io.read_zbl_records(fin) is written to fout
    (followed by a newline), whether or not it was updated. For a record that
    has src_field, its codes are unpacked and filtered with re_leaf_pattern;
    the known codes grouped under the 2- and 3-character prefixes of those
    codes are then compared with msccode_membership().

    Arguments:
        fin -- input handed to zbl_io.read_zbl_records().
        fout -- output handed to zbl_io.write_zbl_record(); must support write().
        known_msc_codes -- MSC codes from which the index space is built
            (after filtering with re_leaf_pattern).
        src_field -- record field holding the record's MSC codes.
        dst_field -- record field receiving the packed, sorted
            (code index, membership) pairs.
        re_leaf_pattern -- pattern passed to filter_msccodes().
        dbg_field -- record field receiving the same pairs with the code
            itself in place of its index.

    Returns:
        The number of records that were updated.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    # items() instead of the Python-2-only iteritems().
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.items())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            # Lazy %-formatting; also restores the space that was missing
            # before "updated." in the original message.
            logging.info("[calc_msc_membership] %d records processed. %d updated.",
                         i, counter)
        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            # Only codes sharing a 2- or 3-character prefix with one of the
            # record's codes are compared.
            # NOTE(review): if group_by_prefix() returns a plain dict, an
            # unknown prefix raises KeyError here -- confirm against its definition.
            compared_codes = set()
            for record_msccode in record_msccodes:
                for prefix_len in (2, 3):
                    compared_codes.update(prefix2msc[record_msccode[:prefix_len]])

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))

            if mscmembership:  # store the results
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field(
                    [(ix2msc[ix], m) for ix, m in mscmembership])
                counter += 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Update records with a membership vector computed from their MSC codes.

    Records are read from fin with zbl_io.read_zbl_records() and each one is
    written to fout with zbl_io.write_zbl_record() plus a trailing newline.
    When a record contains src_field, the codes stored there are unpacked,
    filtered with re_leaf_pattern, and compared (via msccode_membership())
    against the known codes that group_by_prefix() files under the same
    2- or 3-character prefix.

    Arguments:
        fin -- source passed to zbl_io.read_zbl_records().
        fout -- destination passed to zbl_io.write_zbl_record(); needs write().
        known_msc_codes -- codes that, once filtered, define the index space.
        src_field -- field the record's MSC codes are read from.
        dst_field -- field that receives the packed list of sorted
            (code index, membership) pairs.
        re_leaf_pattern -- pattern handed to filter_msccodes().
        dbg_field -- field that receives the packed (code, membership) pairs.

    Returns:
        How many records received dst_field/dbg_field.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    # items() works on both Python 2 and 3, unlike iteritems().
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.items())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            # The original message lacked a space before "updated.".
            logging.info("[calc_msc_membership] %d records processed. %d updated.", i, counter)

        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            # We look only at the known codes whose prefix fragments match.
            # NOTE(review): prefix2msc[...] raises KeyError for an unknown
            # prefix unless group_by_prefix() returns a defaulting mapping -- confirm.
            compared_codes = set()
            for record_msccode in record_msccodes:
                compared_codes.update(prefix2msc[record_msccode[:2]])
                compared_codes.update(prefix2msc[record_msccode[:3]])

            mscmembership = []
            for compared_code in compared_codes:
                membership = msccode_membership(record_msccodes, compared_code)
                mscmembership.append((msc2ix[compared_code], membership))

            if mscmembership:  # save the results
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field(
                    [(ix2msc[ix], m) for ix, m in mscmembership])
                counter += 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
# Example #4
# 0
def gensim_mapfields_dict(record,
                          fields,
                          filter_by_fields,
                          dictionary,
                          dst_field,
                          id2token={},
                          dbg_field_name="g_"):
    """Merge record fields, map them with a gensim dictionary, store the result.

    A record is processed only when it has every field named in
    filter_by_fields. The words (split on whitespace) of those `fields` that
    the record contains are joined into one list, in `fields` order, and
    converted with dictionary.doc2bow(). When the conversion yields anything,
    it is packed with zbl_io.pack_listpairs_field() into dst_field.

    Arguments:
        record -- mapping of field name to field text; modified in place.
        fields -- field names whose words are merged.
        filter_by_fields -- field names that all have to be in the record.
        dictionary -- object providing doc2bow(list_of_words).
        dst_field -- field name under which the packed result is stored.
        id2token -- id-to-token mapping for the debug field; an id that is
            not in it is shown as '?'.
        dbg_field_name -- field name under which packed (id, token) pairs
            are stored for debugging purposes.

    Returns:
        The record that was passed in.
    """
    # NOTE: id2token is read-only here, so the shared default dict stays empty.
    missing_filter_field = any(field not in record
                               for field in filter_by_fields)
    if missing_filter_field:
        return record

    # Build the word list directly: reduce() with no initial value raises
    # TypeError when the record has none of `fields`, and summing lists
    # pairwise copies them repeatedly.
    merged_words = []
    for field in fields:
        if field in record:
            merged_words.extend(record[field].split())

    bow = dictionary.doc2bow(merged_words)
    logging.debug("[gensim_mapfields_dict]%s -> %s", merged_words, bow)
    if len(bow) > 0:
        record[dst_field] = zbl_io.pack_listpairs_field(bow)
        # Debug companion field: each id next to its token.
        record[dbg_field_name] = zbl_io.pack_listpairs_field(
            (idx, id2token.get(idx, '?')) for idx, _count in bow)
    return record