def gensim_mapfields_dict(record, fields, filter_by_fields, dictionary, dst_field, id2token=None, dbg_field_name="g_"):
    """Map the words of selected record fields to ids using a gensim dictionary.

    Only records that contain every field listed in filter_by_fields are
    processed; any other record is returned untouched.

    Args:
        record: dict-like record; modified in place.
        fields: names of the fields whose whitespace-separated words are
            merged (fields missing from the record are skipped).
        filter_by_fields: names of fields that must all be present in the
            record for it to be processed.
        dictionary: object exposing doc2bow(list_of_words) that returns
            (id, count) pairs.
        dst_field: name of the field receiving the packed (id, count) pairs.
        id2token: optional mapping id -> token, used only for the debug
            field; unknown ids are rendered as '?'.
        dbg_field_name: name of the field receiving the packed (id, token)
            pairs.

    Returns:
        The same record object (updated only when doc2bow returned a
        non-empty result).
    """
    if not all(field in record for field in filter_by_fields):
        return record
    if id2token is None:
        id2token = {}

    # Collect the words with extend(): unlike a reduce() without an initial
    # value, this copes with a record that has none of `fields`, and it
    # avoids quadratic list concatenation.
    words = []
    for field in fields:
        if field in record:
            words.extend(record[field].split())

    word_ids = dictionary.doc2bow(words)
    # Lazy %-formatting: the (potentially long) lists are only rendered when
    # DEBUG logging is actually enabled.
    logging.debug("[gensim_mapfields_dict]%s -> %s", words, word_ids)

    if word_ids:
        record[dst_field] = zbl_io.pack_listpairs_field(word_ids)
        # Human-readable twin of dst_field, kept for debugging purposes.
        record[dbg_field_name] = zbl_io.pack_listpairs_field(
            (idx, id2token.get(idx, '?')) for idx, _count in word_ids)
    return record
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Add an MSC membership vector to the records read from fin.

    Every record read from fin is written to fout. Records that contain
    src_field additionally get dst_field (packed (code index, membership)
    pairs) and dbg_field (the same pairs with the MSC code instead of its
    index), provided at least one membership value was computed.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        fout: output handed to zbl_io.write_zbl_record; must also support
            write().
        known_msc_codes: MSC codes used to build the index; filtered with
            re_leaf_pattern first.
        src_field: record field holding the record's MSC codes.
        dst_field: record field receiving the packed membership vector.
        re_leaf_pattern: pattern passed to filter_msccodes for both the
            known codes and the codes of each record.
        dbg_field: record field receiving the debug variant of the vector.

    Returns:
        Number of records that received a membership vector.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    # items() instead of the Python-2-only iteritems() keeps this portable.
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.items())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] %s records processed. %s updated.", i, counter)

        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            # Only compare against known codes that share a 2- or 3-character
            # prefix with one of the record's codes.
            # NOTE(review): this raises KeyError for a prefix absent from
            # prefix2msc unless group_by_prefix returns a defaulting mapping
            # - confirm.
            compared_codes = set()
            for record_msccode in record_msccodes:
                compared_codes.update(prefix2msc[record_msccode[:2]])
                compared_codes.update(prefix2msc[record_msccode[:3]])

            mscmembership = [
                (msc2ix[compared_code], msccode_membership(record_msccodes, compared_code))
                for compared_code in compared_codes
            ]

            if mscmembership:
                # Store the results, de-duplicated and ordered by code index.
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field(
                    [(ix2msc[ix], m) for ix, m in mscmembership])
                counter += 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def calc_msc_membership(fin, fout, known_msc_codes,
                        src_field='mc', dst_field='m0',
                        re_leaf_pattern=msc_processing.MSC_ORDINARY_LEAF_PATTERN_RE, dbg_field='m_'):
    """Add an MSC membership vector to the records read from fin.

    Every record read from fin is written to fout. Records that contain
    src_field additionally get dst_field (packed (code index, membership)
    pairs) and dbg_field (the same pairs with the MSC code instead of its
    index), provided at least one membership value was computed.

    Args:
        fin: input handed to zbl_io.read_zbl_records.
        fout: output handed to zbl_io.write_zbl_record; must also support
            write().
        known_msc_codes: MSC codes used to build the index; filtered with
            re_leaf_pattern first.
        src_field: record field holding the record's MSC codes.
        dst_field: record field receiving the packed membership vector.
        re_leaf_pattern: pattern passed to filter_msccodes for both the
            known codes and the codes of each record.
        dbg_field: record field receiving the debug variant of the vector.

    Returns:
        Number of records that received a membership vector.
    """
    msccodes = filter_msccodes(known_msc_codes, re_leaf_pattern)
    msc2ix = calc_msc2ix(msccodes)
    # items() instead of the Python-2-only iteritems() keeps this portable.
    ix2msc = dict((ix, msc) for msc, ix in msc2ix.items())
    prefix2msc = group_by_prefix(msccodes)

    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 10000 == 0:
            logging.info("[calc_msc_membership] %s records processed. %s updated.", i, counter)

        if src_field in record:
            record_msccodes = zbl_io.unpack_multivalue_field(record[src_field])
            record_msccodes = filter_msccodes(record_msccodes, re_leaf_pattern)

            # Only compare against known codes that share a 2- or 3-character
            # prefix with one of the record's codes.
            # NOTE(review): this raises KeyError for a prefix absent from
            # prefix2msc unless group_by_prefix returns a defaulting mapping
            # - confirm.
            compared_codes = set()
            for record_msccode in record_msccodes:
                compared_codes.update(prefix2msc[record_msccode[:2]])
                compared_codes.update(prefix2msc[record_msccode[:3]])

            mscmembership = [
                (msc2ix[compared_code], msccode_membership(record_msccodes, compared_code))
                for compared_code in compared_codes
            ]

            if mscmembership:
                # Store the results, de-duplicated and ordered by code index.
                mscmembership = sorted(set(mscmembership))
                record[dst_field] = zbl_io.pack_listpairs_field(mscmembership)
                record[dbg_field] = zbl_io.pack_listpairs_field(
                    [(ix2msc[ix], m) for ix, m in mscmembership])
                counter += 1

        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter
def gensim_mapfields_dict(record, fields, filter_by_fields, dictionary, dst_field, id2token=None, dbg_field_name="g_"):
    """Map the words of selected record fields to ids using a gensim dictionary.

    Only records that contain every field listed in filter_by_fields are
    processed; any other record is returned untouched.

    Args:
        record: dict-like record; modified in place.
        fields: names of the fields whose whitespace-separated words are
            merged (fields missing from the record are skipped).
        filter_by_fields: names of fields that must all be present in the
            record for it to be processed.
        dictionary: object exposing doc2bow(list_of_words) that returns
            (id, count) pairs.
        dst_field: name of the field receiving the packed (id, count) pairs.
        id2token: optional mapping id -> token, used only for the debug
            field; unknown ids are rendered as '?'.
        dbg_field_name: name of the field receiving the packed (id, token)
            pairs.

    Returns:
        The same record object (updated only when doc2bow returned a
        non-empty result).
    """
    if not all(field in record for field in filter_by_fields):
        return record
    if id2token is None:
        id2token = {}

    # Collect the words with extend(): unlike a reduce() without an initial
    # value, this copes with a record that has none of `fields`, and it
    # avoids quadratic list concatenation.
    words = []
    for field in fields:
        if field in record:
            words.extend(record[field].split())

    word_ids = dictionary.doc2bow(words)
    # Lazy %-formatting: the (potentially long) lists are only rendered when
    # DEBUG logging is actually enabled.
    logging.debug("[gensim_mapfields_dict]%s -> %s", words, word_ids)

    if word_ids:
        record[dst_field] = zbl_io.pack_listpairs_field(word_ids)
        # Human-readable twin of dst_field, kept for debugging purposes.
        record[dbg_field_name] = zbl_io.pack_listpairs_field(
            (idx, id2token.get(idx, '?')) for idx, _count in word_ids)
    return record