Пример #1
0
def split_long_short(file_paths, prefix='default', threshhold=20):
  """Split records into a "short" and a "long" set by token count and save both.

  Every path in ``file_paths`` is read with ``utils.read_amr_format``. A record
  whose ``'text'`` field contains more than ``threshhold`` space-separated
  tokens is placed in the long set; every other record goes to the short set.
  The two sets are written with ``utils.save_amr_format`` to
  ``tmp/<prefix>.short_data.amr.txt`` and ``tmp/<prefix>.long_data.amr.txt``.

  Args:
    file_paths: Iterable of input file paths.
    prefix: Prefix used for the two output file names.
    threshhold: Maximum number of tokens a record may have and still count as
      short (the comparison is strictly greater-than for "long").
  """
  # NOTE(review): each element of `data` is whatever read_amr_format returns
  # for one file and is indexed with ['text'] below -- confirm that this is a
  # single record and not a collection of records.
  data = [utils.read_amr_format(path) for path in file_paths]

  short_data, long_data = [], []
  for d in data:
    # Compare the number of tokens, not the token list itself: list > int
    # raises TypeError on Python 3 and is always True on Python 2.
    if len(d['text'].split(' ')) > threshhold:
      long_data.append(d)
    else:
      short_data.append(d)

  utils.save_amr_format(short_data, 'tmp/%s.short_data.amr.txt' % (prefix))
  utils.save_amr_format(long_data, 'tmp/%s.long_data.amr.txt' % (prefix))
Пример #2
0
def wordsense_observation(path):
  data = []
  for fname in os.listdir(path):
    print('Read file: %s'%(fname))
    x= utils.read_amr_format(join(path, fname))
    print(type(x))
    data += x
  pool = Pool(20)
  counter = Counter()
  result = pool.map(analyze, data)
  for c in result:
    counter.update(c)

  word_counter = Counter()
  for 
Пример #3
0
def wordsense_observation(path):
    """Print word-sense statistics for all files in a directory.

    Every entry of ``path`` is read with ``utils.read_amr_format`` and the
    resulting records are concatenated. ``analyze`` is mapped over the records
    in a pool of 20 worker processes and the per-record results are merged
    into one ``Counter``. Each counted key is passed to ``split_sense``; keys
    flagged as senses contribute to the per-word and per-sense totals.

    The function prints the number of distinct senses and words, the
    sense/word ratio, and the number and share of words that have more than
    one sense. It returns nothing.

    Args:
        path: Directory whose entries are all read as input files.
    """
    data = []
    for fname in os.listdir(path):
        print('Read file: %s' % (fname))
        data += utils.read_amr_format(join(path, fname))

    # Always release the worker processes, even if analyze() raises.
    pool = Pool(20)
    try:
        result = pool.map(analyze, data)
    finally:
        pool.close()
        pool.join()

    counter = Counter()
    for c in result:
        counter.update(c)

    word_counter = Counter()
    sense_counter = Counter()
    senses_by_word = {}
    print(type(counter))

    # One pass: accumulate frequencies and group senses under their word, so
    # split_sense() is not run a second time over the same keys.
    for sense, freq in counter.items():
        word, is_sense = split_sense(sense)
        if not is_sense:
            continue
        word_counter[word] += freq
        sense_counter[sense] += freq
        senses_by_word.setdefault(word, set()).add(sense)

    n_senses = len(sense_counter)
    n_words = len(word_counter)
    multi_sense_words = [
        word for word, sense_set in senses_by_word.items()
        if len(sense_set) > 1
    ]

    # Guard the ratios: with no sense-tagged words the divisions would raise
    # ZeroDivisionError; report 0.0 instead.
    sense_per_word = float(n_senses) / n_words if n_words else 0.0
    multi_ratio = float(len(multi_sense_words)) / n_words if n_words else 0.0

    print('Number of sense: %d' % (n_senses))
    print('Number of word: %d' % (n_words))
    print('Sense/word: %f' % (sense_per_word))
    print('Multiple-sense word: %d' % (len(multi_sense_words)))
    print('   percentage: %f' % (multi_ratio))
Пример #4
0
# --smatch takes a directory path. It previously also carried
# action='store_true', which made it a value-less flag that stored True
# instead of a path whenever it was given on the command line.
parser.add_argument('--smatch',
                    default='/home/vietld/jaist/fairseq/smatch',
                    help='SMATCH root directory')

# Input is mandatory; output is optional.
parser.add_argument('-i', '--input', required=True, help='Input file path')
parser.add_argument('-o', '--output', help='Output file path')

args = parser.parse_args()

if args.linearize:
    # NOTE(review): this pool is not used anywhere in the visible part of this
    # branch -- confirm whether later code relies on `p` before removing it.
    p = Pool(20)
    print('Linearize file: %s' % (args.input))
    filename = basename(args.input)
    # Directory part of the input path (everything before the file name).
    directory = args.input[:-len(filename)]
    data = read_amr_format(args.input, return_dict=False)

    # Build sentences and linearized AMRs in lockstep: a record that fails to
    # linearize is dropped from BOTH lists, so line i of the .snt file always
    # corresponds to line i of the .amr file.
    sentences = []
    amrs_linearized = []
    for x in data:
        try:
            linearized = linearize(x['doc'])
        except Exception:
            print('Error at linearizing: ' + x['id'])
            continue
        sentences.append(x['snt'])
        amrs_linearized.append(linearized)

    # Fall back to the input file's base name when --output is omitted;
    # otherwise the files would be named 'None.snt' / 'None.amr'.
    prefix = filename.split('.')[0]
    output_name = args.output or prefix
    save(sentences, join(directory, '%s.snt' % (output_name)))
    save(amrs_linearized, join(directory, '%s.amr' % (output_name)))

elif args.delinearize:
    p = Pool(20)
Пример #5
0
if __name__ == '__main__':
    # Script entry point: read the configured corpus files, run preprocess()
    # over every record in a pool of 20 worker processes, and write the first
    # two fields of each result as parallel 'amr-test.snt' / 'amr-test.amr'
    # files under OUTPUT.
    CORPUS = 'corpus/'

    # Active dataset. The LDC2014 configuration used to be assigned here and
    # was immediately overwritten (dead code). To switch back, set
    #   OUTPUT = 'data/LDC2014.snt-amr/'
    #   data_file = ['amr-release-1.0-%s.txt' % name for name in
    #                ('bolt', 'consensus', 'dfa', 'mt09sdl', 'proxy', 'xinhua')]
    OUTPUT = 'data/civilcode.snt-amr/'
    data_file = ['civilcode-1.0.txt']

    data = []
    for fname in data_file:
        data += read_amr_format(join(CORPUS, fname))

    # Always release the worker processes, even if preprocess() raises.
    p = Pool(20)
    try:
        result = p.map(preprocess, data)
    finally:
        p.close()
        p.join()

    # Train/valid/test splitting is currently disabled:
    # train, valid, test = split(result)
    # save([u for u, l, a in train], join(OUTPUT, 'train.snt'))
    # save([l for u, l, a in train], join(OUTPUT, 'train.amr'))
    # save([u for u, l, a in valid], join(OUTPUT, 'valid.snt'))
    # save([l for u, l, a in valid], join(OUTPUT, 'valid.amr'))
    # save([u for u, l, a in test], join(OUTPUT, 'test.snt'))
    # save([l for u, l, a in test], join(OUTPUT, 'test.amr'))

    # Each result is unpacked as a 3-tuple; only the first two fields are
    # written out.
    save([snt for snt, _, _ in result], join(OUTPUT, 'amr-test.snt'))
    save([lin for _, lin, _ in result], join(OUTPUT, 'amr-test.amr'))

    #save_amr_format([a for u, l, a in test], join(OUTPUT, 'test.amr.txt'),end='')