Exemplo n.º 1
0
def build_wordsmodel(fin, fout, src_field = "g0"):
    """Builds a WordsModel from the src_field of every record read from fin.

    Records are read with zbl_io.read_zbl_records(fin). For each record that
    contains src_field, the field value is unpacked with
    zbl_io.unpack_dictionary_field, passed through _di_ and handed to
    WordsModel.update. Once all records are consumed, finish_updates() is
    called on the model.

    fin       -- input passed to zbl_io.read_zbl_records.
    fout      -- accepted but never used by this function.
    src_field -- name of the record field holding the packed dictionary.

    Progress is logged every 500 records.
    Returns the populated WordsModel instance.
    """
    model = WordsModel()
    records = zbl_io.read_zbl_records(fin)
    for idx, rec in enumerate(records):
        if idx % 500 == 0:
            logging.info("[build_wordsmodel] "+str(idx)+" records processed")
        # Records lacking the source field contribute nothing to the model.
        if src_field not in rec:
            continue
        packed_counts = rec[src_field]
        model.update(_di_(zbl_io.unpack_dictionary_field(packed_counts)))
    model.finish_updates()
    return model
Exemplo n.º 2
0
def map_wordsmodel_overall_weighting(fin, fout, wordsmodel, src_field="g0", dst_field="g1",
                                     weight = lambda wordsmodel,doc_wordid2count,wordid: tf(doc_wordid2count, wordid)*idf(wordsmodel, wordid) ):
    """Maps the value of src_field using wordsmodel and a weighting function.

    Every record read from fin (via zbl_io.read_zbl_records) is written to
    fout (via zbl_io.write_zbl_record) followed by a newline. Records that
    contain src_field additionally get dst_field set to the packed, sorted
    list of (wordid, weight) pairs; records without src_field are written
    through unchanged.

    fin        -- input passed to zbl_io.read_zbl_records.
    fout       -- output object; must support write() and be accepted by
                  zbl_io.write_zbl_record.
    wordsmodel -- model object forwarded as first argument to weight.
    src_field  -- name of the record field holding the packed dictionary.
    dst_field  -- name of the record field that receives the weights.
    weight     -- callable(wordsmodel, doc_wordid2count, wordid) returning the
                  weight of wordid in the document; defaults to tf * idf.

    Progress is logged every 100 records.
    Returns the number of records that were enriched with dst_field.
    """
    counter = 0
    for i, record in enumerate(zbl_io.read_zbl_records(fin)):
        if i % 100 == 0:
            logging.info("[map_wordsmodel_overall_weighting] %d records processed. %d enriched.", i, counter)
        if src_field in record:
            doc_wordid2count = _di_(zbl_io.unpack_dictionary_field(record[src_field]))
            # Iterate over keys only: the per-word count is not needed here and
            # key iteration works on both Python 2 and Python 3 (no iteritems()).
            doc_wordid2weight = [(wordid, weight(wordsmodel, doc_wordid2count, wordid))
                                 for wordid in doc_wordid2count]
            # Pairs are sorted so the packed field is ordered by wordid.
            record[dst_field] = zbl_io.pack_listpairs_field(sorted(doc_wordid2weight))
            counter += 1
        zbl_io.write_zbl_record(fout, record)
        fout.write("\n")
    return counter