Exemplo n.º 1
0
def write_ds_names(dsfile, mye2name2freq, nameoutfile, max_name=10, fb_ent2name=None):
    """Write one tab-separated line per dataset entity together with its names.

    Every line starts with four fields: the entity id, the entity's first
    type, all of its types joined by single spaces, and its frequency.
    When names are available a ``####`` field follows, and after it a run of
    ``name<TAB>freq<TAB>`` pairs (note the trailing tab, kept for
    compatibility with the existing output format).

    Args:
        dsfile: Dataset path handed to ``load_dataset``.
        mye2name2freq: Mapping entity id -> {name: frequency}. Names are
            written in order of decreasing frequency.
        nameoutfile: Path of the output file; it is overwritten.
        max_name: Maximum number of name/frequency pairs written per entity.
        fb_ent2name: Optional fallback mapping consulted for entities missing
            from ``mye2name2freq``. Assumed to map entity id -> a single name
            string (TODO confirm against the caller).

    Entities found in neither mapping are printed to stdout and written
    without a names section.
    """
    e2types, _t2ents, e2freq = load_dataset(dsfile)
    limit = max(max_name, 0)
    # The context manager closes the file even if a lookup below raises.
    with open(nameoutfile, 'w') as out:
        for mye in e2types:
            types = e2types[mye]
            outstr = '\t'.join([mye, types[0], ' '.join(types), str(e2freq[mye])])
            if mye in mye2name2freq:
                ranked = sorted(mye2name2freq[mye].items(),
                                key=operator.itemgetter(1), reverse=True)
                outstr += '\t####\t'
                # Slice to exactly `limit` pairs; the previous counter-based
                # loop broke one iteration late and emitted max_name + 1.
                for name, freq in ranked[:limit]:
                    outstr += name + '\t' + str(freq) + '\t'
            elif fb_ent2name and mye in fb_ent2name:
                # Look the name up explicitly: the old code reused whatever
                # `name` was left over from a previous entity (or hit a
                # NameError when there was none).
                fallback_name = fb_ent2name[mye]
                outstr += '\t####\t'
                # The fallback name is repeated with a fixed pseudo-frequency.
                for _ in range(limit):
                    outstr += fallback_name + '\t' + '1000' + '\t'
            else:
                # No name known for this entity: report it on stdout.
                print(mye)
            out.write(outstr + '\n')
Exemplo n.º 2
0
def write_ds_names(dsfile,
                   mye2name2freq,
                   nameoutfile,
                   max_name=10,
                   fb_ent2name=None):
    """Dump every dataset entity with its types, frequency and known names.

    Output format, one line per entity, tab-separated:
    ``entity, first type, space-joined types, frequency``; if names exist
    this is followed by ``####`` and then ``name<TAB>freq<TAB>`` pairs (the
    trailing tab after the last pair is part of the existing format).

    Args:
        dsfile: Dataset path passed to ``load_dataset``.
        mye2name2freq: Mapping entity id -> {name: frequency}; names are
            emitted from most to least frequent.
        nameoutfile: Output path, opened for writing (truncated).
        max_name: Upper bound on the name/frequency pairs per entity.
        fb_ent2name: Optional fallback mapping used when an entity is not in
            ``mye2name2freq``. Presumably entity id -> one name string
            (TODO confirm with the caller).

    An entity present in neither mapping is printed to stdout and its line
    carries no names section.
    """
    (e2types, _t2ents, e2freq) = load_dataset(dsfile)
    limit = max(max_name, 0)
    # `with` guarantees the handle is closed if anything below raises.
    with open(nameoutfile, 'w') as f:
        for mye in e2types:
            mytypes = e2types[mye]
            outstr = '\t'.join(
                [mye, mytypes[0], ' '.join(mytypes),
                 str(e2freq[mye])])
            if mye in mye2name2freq:
                name2freq = mye2name2freq[mye]
                sorted_by_freq = sorted(name2freq.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)
                outstr += '\t####\t'
                # Take at most `limit` names. The earlier loop tested the
                # counter after appending, so it wrote max_name + 1 pairs.
                for name, freq in sorted_by_freq[:limit]:
                    outstr += '\t'.join([name, str(freq)])
                    outstr += '\t'
            elif fb_ent2name and mye in fb_ent2name:
                # Fetch the fallback name for *this* entity; previously the
                # loop variable `name` from an earlier entity was reused, or
                # was unbound altogether.
                fb_name = fb_ent2name[mye]
                outstr += '\t####\t'
                # Repeat the fallback name with a fixed pseudo-frequency.
                for _ in range(limit):
                    outstr += '\t'.join([fb_name, '1000'])
                    outstr += '\t'
            else:
                # Unknown entity: surface it on stdout.
                print(mye)
            f.write(outstr + '\n')
def extend_dstypes_to_parents(fig2parents, entdsdir, sampledlines_dir, outdir, edsname, linesname):
    """Rewrite an entity dataset and its lines with parent-extended types.

    Steps, as performed below:
      1. Load the entity dataset from ``entdsdir + edsname``.
      2. Load the sampled lines from ``sampledlines_dir + linesname``.
      3. Call ``extend_write_eds`` with ``fig2parents`` to obtain the extended
         entity->types mapping, targeting ``outdir + edsname``.
      4. Call ``writelines`` for the loaded lines, targeting
         ``outdir + linesname``.
      5. Write every element of ``lines`` (stripped, newline-terminated) to
         ``outdir + '/ds_lines_figer'``.

    All directory arguments are joined to file names by plain string
    concatenation, so callers are responsible for separators.
    """
    # NOTE(review): another call in this file unpacks three values from
    # load_dataset(dsfile, logger); confirm which arity is right here.
    e2types, t2ents = load_dataset(entdsdir + edsname, logger)
    e2lines, t2lines_train, e2freq = load_lines(sampledlines_dir + linesname, e2types, -1)
    e2types_extended = extend_write_eds(fig2parents, e2types, outdir + edsname, e2freq)
    writelines(e2lines, outdir + linesname, e2types_extended)

    # Build the path once so the log reports the file that is actually opened
    # (the log previously omitted the '/' used in the open() call).
    figer_lines_path = outdir + '/ds_lines_figer'
    # Context manager closes the file even if writing fails part-way.
    with open(figer_lines_path, 'w') as f:
        logger.info('write test lines in %s', figer_lines_path)
        # NOTE(review): `lines` is not defined in this function; it must be a
        # module-level name, otherwise this raises NameError. Verify.
        for myline in lines:
            f.write(myline.strip() + '\n')
        

# Script entry point.
# Usage (from the argv reads below): <script> DSFILE DS_LINESFILE OUTDIR
# Filters an entity dataset and its lines down to FIGER types and writes the
# results under OUTDIR. OUTDIR is concatenated directly with file names
# (e.g. outdir + 'Eds_figer'), so it presumably needs a trailing separator.
if __name__ == '__main__':
    # Static resources loaded from hard-coded NFS paths.
    # NOTE(review): `laod_figermapping` looks like a misspelling of "load";
    # confirm it matches the helper's actual name before renaming anything.
    fbname2figer = laod_figermapping('/nfs/data3/yadollah/nlptools_resources/figer/config/types.map')
    fig2parents = load_parents('/nfs/data3/yadollah/nlptools_resources/figer/config/yy_type2parents')
    mid2name = load_type_names('/nfs/data1/proj/yadollah/cluewebwork/nlu/dataForImport/type.name')
    logger.info('size types with names: %d', len(mid2name))
    # Positional command-line arguments; no validation is done, so a missing
    # argument raises IndexError here (after the resources above were loaded).
    dsfile = sys.argv[1]
    ds_linesfile = sys.argv[2]
    outdir = sys.argv[3]
    
    # Load the full entity dataset and log how many entities it contains.
    (e2types, t2ents, e2freq) = load_dataset(dsfile, logger)
    logger.info(len(e2types))
    
    # Filter the dataset using the FIGER mapping, type names and parents.
    newConvertedDs = filter_write_ds(e2types, fbname2figer, mid2name, outdir, fig2parents)
    logger.info('size of dataset after filtering to figer type: %d', len(newConvertedDs))
    
    # Filter the lines against the converted dataset. Note that this rebinds
    # e2freq, replacing the value returned by load_dataset above.
    (new_dslines, e2freq) = filter_ds_lines(ds_linesfile, newConvertedDs, outdir)
    logger.info('#lines after filtering to figer type: %d', len(new_dslines))
    # Persist the filtered dataset and lines using the rebound e2freq.
    write_ds(newConvertedDs, e2freq, outdir + 'Eds_figer')
    write_lines(new_dslines, e2freq, outdir)