def split_long_short(file_paths, prefix='default', threshhold=20):
    """Partition AMR entries into short and long sets by token count.

    Reads every path in *file_paths* with ``utils.read_amr_format`` and
    writes entries whose ``'text'`` field has more than *threshhold*
    whitespace-separated tokens to ``tmp/<prefix>.long_data.amr.txt``,
    the rest to ``tmp/<prefix>.short_data.amr.txt``.

    Note: the misspelled parameter name ``threshhold`` is kept unchanged
    for backward compatibility with existing callers.
    """
    data = []
    # Renamed loop variable: the original used `file`, shadowing a builtin.
    for file_path in file_paths:
        data.append(utils.read_amr_format(file_path))
    short_data, long_data = [], []
    for d in data:
        # BUG FIX: the original compared the token *list* itself to an int
        # (`d['text'].split(' ') > threshhold`), a TypeError on Python 3.
        # Compare the number of tokens instead.
        if len(d['text'].split(' ')) > threshhold:
            long_data.append(d)
        else:
            short_data.append(d)
    utils.save_amr_format(short_data, 'tmp/%s.short_data.amr.txt' % (prefix))
    utils.save_amr_format(long_data, 'tmp/%s.long_data.amr.txt' % (prefix))
# NOTE(review): this appears to be a truncated/abandoned earlier draft of
# wordsense_observation — a complete version of the same function follows
# immediately below. The body cuts off mid-statement at the trailing bare
# `for` and does not compile; kept verbatim here (tokens unchanged) rather
# than guessed at. Consider deleting it in favor of the complete copy.
def wordsense_observation(path):
    data = []
    for fname in os.listdir(path):
        print('Read file: %s'%(fname))
        x= utils.read_amr_format(join(path, fname))
        print(type(x))
        data += x
    pool = Pool(20)
    counter = Counter()
    result = pool.map(analyze, data)
    for c in result:
        counter.update(c)
    word_counter = Counter()
    for
def wordsense_observation(path):
    """Read AMR files under *path* and print word-sense statistics.

    Every file in *path* is loaded with ``utils.read_amr_format`` and the
    entries are analyzed by a 20-worker process pool via ``analyze``, whose
    results are folded into one frequency Counter. ``split_sense`` is
    expected to map a token to ``(word, is_sense)`` — TODO confirm against
    its definition. Prints: distinct sense count, distinct word count,
    sense/word ratio, and the number (and fraction) of words that occur
    with more than one sense.
    """
    data = []
    for fname in os.listdir(path):
        print('Read file: %s' % (fname))
        x = utils.read_amr_format(join(path, fname))
        data += x

    pool = Pool(20)
    counter = Counter()
    # Removed leftover debug prints (`print(type(x))`, `print(type(counter))`)
    # from the original.
    for c in pool.map(analyze, data):
        counter.update(c)

    # Aggregate per-word and per-sense frequencies over sensed tokens only.
    word_counter = Counter()
    sense_counter = Counter()
    for sense, freq in counter.items():
        word, is_sense = split_sense(sense)
        if is_sense:
            word_counter.update({word: freq})
            sense_counter.update({sense: freq})

    print('Number of sense: %d' % (len(sense_counter)))
    print('Number of word: %d' % (len(word_counter)))
    print('Sense/word: %f' % (float(len(sense_counter)) / len(word_counter)))

    # Group every observed sense under its base word; setdefault replaces
    # the original's read-modify-write dance on the dict.
    mul_sense_counter = dict()
    for sense in sense_counter:
        word, _ = split_sense(sense)
        mul_sense_counter.setdefault(word, set()).add(sense)

    x = [
        word
        for word, sense_set in mul_sense_counter.items()
        if len(sense_set) > 1
    ]
    print('Multiple-sense word: %d' % (len(x)))
    print(' percentage: %f' % (float(len(x)) / len(word_counter)))
# BUG FIX: the original declared --smatch with BOTH a string path default and
# action='store_true', so passing the flag replaced the path with True and the
# option could never carry the "SMATCH root directory" its help text promises.
# It now takes a directory argument, defaulting to the original path.
parser.add_argument('--smatch',
                    default='/home/vietld/jaist/fairseq/smatch',
                    help='SMATCH root directory')
parser.add_argument('-i', '--input', required=True, help='Input file path')
parser.add_argument('-o', '--output', help='Output file path')
args = parser.parse_args()
if args.linearize:
    p = Pool(20)
    print('Linearize file: %s' % (args.input))
    filename = basename(args.input)
    directory = args.input[:-len(filename)]
    data = read_amr_format(args.input, return_dict=False)
    sentences = [x['snt'] for x in data]
    # Linearize each AMR, skipping (but reporting) entries that fail so one
    # bad graph does not abort the whole file.
    amrs_linearized = []
    for x in data:
        try:
            amrs_linearized.append(linearize(x['doc']))
        except Exception:  # narrowed from bare `except:` (kept best-effort skip)
            print('Error at linearizing: ' + x['id'])
    # Output names come from --output; removed unused locals `amrs`/`prefix`.
    save(sentences, join(directory, '%s.snt' % (args.output)))
    save(amrs_linearized, join(directory, '%s.amr' % (args.output)))
elif args.delinearize:
    p = Pool(20)
if __name__ == '__main__':
    CORPUS = 'corpus/'
    # The original assigned OUTPUT and data_file twice, silently letting the
    # second assignment win. The superseded LDC2014 configuration is kept
    # here as a comment so it can be toggled back on deliberately:
    # OUTPUT = 'data/LDC2014.snt-amr/'
    # data_file = [
    #     'amr-release-1.0-bolt.txt',
    #     'amr-release-1.0-consensus.txt',
    #     'amr-release-1.0-dfa.txt',
    #     'amr-release-1.0-mt09sdl.txt',
    #     'amr-release-1.0-proxy.txt',
    #     'amr-release-1.0-xinhua.txt',
    # ]
    OUTPUT = 'data/civilcode.snt-amr/'
    data_file = ['civilcode-1.0.txt']

    # Load every corpus file into one flat list of AMR entries.
    data = []
    for fname in data_file:
        data += read_amr_format(join(CORPUS, fname))

    # Preprocess in parallel; each result is unpacked as (u, l, a) triples —
    # presumably (sentence, linearized AMR, raw AMR), TODO confirm against
    # `preprocess`.
    p = Pool(20)
    result = p.map(preprocess, data)

    # Whole corpus is written as a single test split (no train/valid/test
    # partitioning in this configuration).
    save([u for u, l, a in result], join(OUTPUT, 'amr-test.snt'))
    save([l for u, l, a in result], join(OUTPUT, 'amr-test.amr'))