def write_ds_names(dsfile, mye2name2freq, nameoutfile, max_name=10, fb_ent2name=None):
    """Write one tab-separated line per dataset entity, followed by its names.

    Every line starts with four fields: the entity id, its first type, all of
    its types joined by spaces, and its frequency. When names are available a
    '####' field follows, then alternating name / frequency fields, each pair
    terminated by a tab (so the line ends with a trailing tab).

    Args:
        dsfile: Path handed to ``load_dataset``; it must yield
            ``(e2types, t2ents, e2freq)``.
        mye2name2freq: Mapping entity -> {name: frequency}. Preferred source
            of names; names are written in descending frequency order.
        nameoutfile: Path of the file to (over)write.
        max_name: Maximum number of names written per entity (expected to be
            non-negative).
        fb_ent2name: Optional fallback mapping used for entities missing from
            ``mye2name2freq``.

    Entities found in neither mapping are printed to stdout and are still
    written, without a names section.
    """
    # NOTE(review): an identical `write_ds_names` is defined a second time in
    # this file; the later definition is the one callers get. Consider
    # deleting one of them.
    (e2types, _t2ents, e2freq) = load_dataset(dsfile)

    # `with` guarantees the output file is closed even if a lookup below raises.
    with open(nameoutfile, 'w') as f:
        for mye in e2types:
            types = e2types[mye]
            outstr = '\t'.join([mye, types[0], ' '.join(types), str(e2freq[mye])])

            if mye in mye2name2freq:
                sorted_by_freq = sorted(mye2name2freq[mye].items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)
                outstr += '\t####\t'
                # Slice to exactly `max_name` entries. The previous counter
                # check ran after the write and therefore emitted
                # max_name + 1 names.
                for name, freq in sorted_by_freq[:max_name]:
                    outstr += '\t'.join([name, str(freq)]) + '\t'
            elif fb_ent2name and mye in fb_ent2name:
                # Previously `name` was never assigned in this branch, which
                # raised NameError or reused a name left over from another
                # entity. Presumably fb_ent2name maps an entity to a single
                # name string -- TODO confirm against the caller.
                name = fb_ent2name[mye]
                outstr += '\t####\t'
                # The fallback name is repeated `max_name` times with the
                # fixed pseudo-frequency '1000', as in the original loop.
                outstr += ('\t'.join([name, '1000']) + '\t') * max_name
            else:
                # Single-argument parenthesised form prints identically on
                # Python 2 and Python 3.
                print(mye)

            f.write(outstr + '\n')
def write_ds_names(dsfile, mye2name2freq, nameoutfile, max_name=10, fb_ent2name=None):
    """Write one tab-separated line per dataset entity, followed by its names.

    Every line starts with four fields: the entity id, its first type, all of
    its types joined by spaces, and its frequency. When names are available a
    '####' field follows, then alternating name / frequency fields, each pair
    terminated by a tab (so the line ends with a trailing tab).

    Args:
        dsfile: Path handed to ``load_dataset``; it must yield
            ``(e2types, t2ents, e2freq)``.
        mye2name2freq: Mapping entity -> {name: frequency}. Preferred source
            of names; names are written in descending frequency order.
        nameoutfile: Path of the file to (over)write.
        max_name: Maximum number of names written per entity (expected to be
            non-negative).
        fb_ent2name: Optional fallback mapping used for entities missing from
            ``mye2name2freq``.

    Entities found in neither mapping are printed to stdout and are still
    written, without a names section.
    """
    # NOTE(review): this redefines a `write_ds_names` that already appears
    # earlier in this file with the same body; consider deleting one of them.
    (e2types, _t2ents, e2freq) = load_dataset(dsfile)

    # `with` guarantees the output file is closed even if a lookup below raises.
    with open(nameoutfile, 'w') as f:
        for mye in e2types:
            types = e2types[mye]
            outstr = '\t'.join([mye, types[0], ' '.join(types), str(e2freq[mye])])

            if mye in mye2name2freq:
                sorted_by_freq = sorted(mye2name2freq[mye].items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)
                outstr += '\t####\t'
                # Slice to exactly `max_name` entries. The previous counter
                # check ran after the write and therefore emitted
                # max_name + 1 names.
                for name, freq in sorted_by_freq[:max_name]:
                    outstr += '\t'.join([name, str(freq)]) + '\t'
            elif fb_ent2name and mye in fb_ent2name:
                # Previously `name` was never assigned in this branch, which
                # raised NameError or reused a name left over from another
                # entity. Presumably fb_ent2name maps an entity to a single
                # name string -- TODO confirm against the caller.
                name = fb_ent2name[mye]
                outstr += '\t####\t'
                # The fallback name is repeated `max_name` times with the
                # fixed pseudo-frequency '1000', as in the original loop.
                outstr += ('\t'.join([name, '1000']) + '\t') * max_name
            else:
                # Single-argument parenthesised form prints identically on
                # Python 2 and Python 3.
                print(mye)

            f.write(outstr + '\n')
def extend_dstypes_to_parents(fig2parents, entdsdir, sampledlines_dir, outdir, edsname, linesname):
    """Re-write an entity dataset and its lines file with extended types.

    Steps, in order:
      1. load the entity dataset from ``entdsdir + edsname``;
      2. load the lines from ``sampledlines_dir + linesname``;
      3. call ``extend_write_eds`` with ``fig2parents`` to obtain the extended
         entity-to-types mapping, targeting ``outdir + edsname``;
      4. call ``writelines`` for ``outdir + linesname`` with that mapping.

    All paths are built by plain string concatenation, so the directory
    arguments presumably need to end with a path separator -- confirm with
    the callers.
    """
    # NOTE(review): only two values are unpacked from load_dataset here, while
    # other call sites in this file unpack three (e2types, t2ents, e2freq).
    # Confirm which arity load_dataset actually returns.
    eds_path = entdsdir + edsname
    ent2types, _type2ents = load_dataset(eds_path, logger)

    lines_path = sampledlines_dir + linesname
    ent2lines, _type2lines_train, ent2freq = load_lines(lines_path, ent2types, -1)

    extended_types = extend_write_eds(fig2parents, ent2types, outdir + edsname, ent2freq)
    writelines(ent2lines, outdir + linesname, extended_types)
# NOTE(review): the statements down to f.close() are the tail of a function
# whose `def` line is not part of the visible source; `outdir` and `lines` are
# not defined in the visible part. They are kept indented as a function body.
    # Write every entry of `lines` to '<outdir>/ds_lines_figer', one per line.
    f = open(outdir + '/ds_lines_figer', 'w')
    # NOTE(review): the logged path omits the '/' used in the open() call above.
    logger.info('write test lines in %s', outdir + 'ds_lines_figer')
    for myline in lines:
        # Strip surrounding whitespace, then terminate with a single newline.
        f.write(myline.strip())
        f.write('\n')
    f.close()


if __name__ == '__main__':
    # Script entry point. Usage: <script> DSFILE DS_LINESFILE OUTDIR
    # Three resources are loaded from hard-coded absolute paths.
    fbname2figer = laod_figermapping('/nfs/data3/yadollah/nlptools_resources/figer/config/types.map')
    fig2parents = load_parents('/nfs/data3/yadollah/nlptools_resources/figer/config/yy_type2parents')
    mid2name = load_type_names('/nfs/data1/proj/yadollah/cluewebwork/nlu/dataForImport/type.name')
    logger.info('size types with names: %d', len(mid2name))

    # Positional command-line arguments; no validation is performed, so a
    # missing argument raises IndexError.
    dsfile = sys.argv[1]
    ds_linesfile = sys.argv[2]
    outdir = sys.argv[3]

    # Load the input dataset and log how many entities it contains.
    (e2types, t2ents, e2freq) = load_dataset(dsfile, logger)
    logger.info(len(e2types))

    # Filter the dataset (see filter_write_ds) and log the size of the result.
    newConvertedDs = filter_write_ds(e2types, fbname2figer, mid2name, outdir, fig2parents)
    logger.info('size of dataset after filtering to figer type: %d', len(newConvertedDs))

    # Filter the lines file against the filtered dataset; note that `e2freq`
    # is rebound to the value returned by filter_ds_lines.
    (new_dslines, e2freq) = filter_ds_lines(ds_linesfile, newConvertedDs, outdir)
    logger.info('#lines after filtering to figer type: %d', len(new_dslines))

    # NOTE(review): outdir is concatenated without a separator here, so it
    # presumably has to end with '/' -- confirm.
    write_ds(newConvertedDs, e2freq, outdir + 'Eds_figer')
    write_lines(new_dslines, e2freq, outdir)