def main():
    """Compare ISF annotations against human annotations and report the overlap.

    Writes three tab-separated outputs: the matched tags, the missing tags,
    and the missing tags tallied by lemma (most frequent first).
    """
    print("Script to compare annotations")
    isf_tags = read_data(TAG_ISF)
    human_tags = read_data(TAG_HUMAN)
    matched = []
    unmatched = []
    # A human tag matches an ISF tag when every column except the last two
    # agrees exactly and the synset-ID column (second from last) is
    # considered equivalent by same_synsetid().
    for human_tag in human_tags:
        for isf_tag in isf_tags:
            if isf_tag[:-2] == human_tag[:-2] and same_synsetid(isf_tag[-2], human_tag[-2]):
                matched.append(human_tag)
                break
        else:
            # No ISF tag matched this human tag.
            unmatched.append(human_tag)
    # Tally the unmatched tags by their last column (presumably the lemma —
    # TODO confirm against the TAG_HUMAN file layout).
    lemma_counter = Counter()
    for row in unmatched:
        lemma_counter.count(row[-1])
    lemma_counter.summarise()
    missing_by_lemma = [(lemma, lemma_counter[lemma]) for lemma in lemma_counter.count_map]
    missing_by_lemma.sort(key=itemgetter(1))
    print("Match: %s" % (len(matched),))
    writelines(['\t'.join(x) for x in matched], TAG_MATCH)
    writelines(['\t'.join(x) for x in unmatched], TAG_MISSING)
    # reversed() turns the ascending count sort into most-frequent-first.
    writelines(['\t'.join([str(col) for col in row]) for row in reversed(missing_by_lemma)],
               TAG_MISSING_BY_LEMMA)
    print("Done!")
def main():
    """Convert Viet SentiWordnet 1.0 into the Open Multilingual Wordnet tab format.

    Runs in three passes: (1) repair known formatting glitches in the VSW
    source and write VSW_FIXED, (2) re-read the repaired file into Sense
    records, (3) emit OMW-style lemma/def/exe rows plus two side files of
    all examples and all definitions.
    """
    print("Script to convert Viet SentiWordnet to Open Multilingual Wordnet format")
    stats = Counter()

    # ---- Pass 1: fix the VSW source format --------------------------------
    # A well-formed gloss separates definition from examples with ';'.
    # Several VSW rows use ', "' / ',"' / 'như: "' instead; patch the first
    # occurrence so the gloss splits cleanly later.
    with open(VSW_DATA, 'r') as vsw_input, open(VSW_FIXED, 'w') as vsw_fixed:
        for raw in vsw_input.readlines():
            if raw.startswith('#'):
                vsw_fixed.write(raw)
                # Right after the upstream "Web:" header line, inject a note
                # about this fixed edition.
                if raw.startswith('# Web: https://sourceforge.net/projects/vietsentiwordne/'):
                    vsw_fixed.write('#\n# Some bugs fixed by Le Tuan Anh <*****@*****.**>\n')
                    vsw_fixed.write('# Latest version is available at: https://github.com/letuananh/omwtk\n#\n')
                continue
            stats.count('processed')
            record = Sense(*raw.split('\t'))
            if record.Gloss.find(';') < 0:
                # Gloss lacks the ';' separator — try the known repairs.
                stats.count("error")
                patched = raw
                if raw.find(', "') > 0:
                    patched = raw.replace(', "', '; "', 1)
                elif raw.find('"') < 0:
                    # No quoted example at all; nothing to repair.
                    stats.count("No example")
                elif raw.find(',"') > 0:
                    patched = raw.replace(',"', '; "', 1)
                elif raw.find('như: "') > 0:
                    patched = raw.replace('như: "', '; "', 1)
                vsw_fixed.write(patched)
            else:
                stats.count("ok")
                vsw_fixed.write(raw)
    stats.summarise()

    # ---- Pass 2: read the repaired file back ------------------------------
    with open(VSW_FIXED, 'r') as vsw_input:
        data_lines = [x for x in vsw_input.readlines() if not x.startswith('#')]
    senses = [Sense(*line.split('\t')) for line in data_lines]

    # ---- Pass 3: write the OMW-format output ------------------------------
    all_examples = []
    all_definitions = []
    with open(OMW_DATA, 'w') as omw_output:
        omw_output.write('# Prepared by Le Tuan Anh <*****@*****.**>\n')
        omw_output.write('# Based on Viet SentiWordnet 1.0\n')
        omw_output.write('# Latest version is available at: https://github.com/letuananh/omwtk\n')
        # Output row shapes, e.g.:
        #   001937986-a  vie:lemma  giỏi
        #   001937986-a  vie:def    có trình độ cao, đáng được khâm phục, khen ngợi
        #   001937986-a  vie:exe    0  giáo viên dạy giỏi
        for sense in senses:
            synset_id = '%s-%s' % (sense.SenseID, sense.POS)
            # Some lemmas are wrong: only the first '#'-separated term is kept.
            lemma = sense.SynsetTerms.split('#')[0]
            definition = ''
            example = ''
            # NOTE(review): '> 0' (not '>= 0') skips a ';' at index 0 — kept
            # as-is to preserve the original behavior.
            sep = sense.Gloss.find(';')
            if sep > 0:
                definition = sense.Gloss[:sep].strip()
                example = sense.Gloss[sep + 1:].strip()
            omw_output.write('%s\tvie:lemma\t%s\n' % (synset_id, lemma))
            if definition:
                omw_output.write('%s\tvie:def\t%s\n' % (synset_id, definition))
                all_definitions.append(definition)
            if example:
                # Examples are quoted; splitting on '"' yields the quoted
                # snippets (fragments of length <= 1 are separators/noise).
                snippets = [x.strip() for x in example.split('"') if len(x.strip()) > 1]
                all_examples += snippets
                for i, val in enumerate(snippets):
                    omw_output.write('%s\tvie:exe\t%s\t%s\n' % (synset_id, i, val))
    all_examples.sort(key=len)
    for example in all_examples:
        print(example)
    writelines(all_examples, 'data/examples.txt')
    writelines(sorted(all_definitions, key=len), 'data/defs.txt')