def bag_of_words(defs, stem_flag, quiet=False):
    '''convert dictionary definitions into bags of words'''

    # convert to bag of words, count words

    if not quiet:
        print "Converting defs to bags of words"

    count = {}
    pr = progressbar.ProgressBar(len(defs), quiet)
    empty_keys = set()

    for lemma in defs:
        pr.advance()

        defs[lemma] = [
            tesslang.standardize('any', w)
            for w in pat.clean['any'].split(defs[lemma])
            if not w.isspace() and w != ''
        ]

        if len(defs[lemma]) > 0:
            for d in defs[lemma]:
                if d in count:
                    count[d] += 1
                else:
                    count[d] = 1
        else:
            empty_keys.add(lemma)

    if not quiet:
        print "Removing hapax legomena"

    pr = progressbar.ProgressBar(len(defs), quiet)

    for lemma in defs:
        pr.advance()

        defs[lemma] = [w for w in defs[lemma] if count[w] > 1]

        if defs[lemma] == []:
            empty_keys.add(lemma)

    if not quiet:
        print 'Lost {0} empty definitions'.format(len(empty_keys))

    for k in empty_keys:
        del defs[k]

    return defs
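# A self-contained sketch of the two passes bag_of_words makes, using plain
# string splitting in place of pat.clean / tesslang.standardize. The toy
# definitions below are made up for illustration only.

defs_demo = {
    'amo':  'to love; to be fond of',
    'amor': 'love; affection',
    'odi':  'to hate',
}

# pass 1: tokenize and count every word across all definitions
count_demo = {}
for lemma in defs_demo:
    defs_demo[lemma] = defs_demo[lemma].replace(';', ' ').split()
    for w in defs_demo[lemma]:
        count_demo[w] = count_demo.get(w, 0) + 1

# pass 2: drop hapax legomena (words seen only once)
for lemma in defs_demo:
    defs_demo[lemma] = [w for w in defs_demo[lemma] if count_demo[w] > 1]

print defs_demo    # only 'to' and 'love' survive; 'odi' keeps just ['to']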
def export_dict(defs, name, quiet):
    '''export definitions as a text file'''

    dir_export = os.path.join(fs['data'], 'synonymy', 'dict-diagnostic')

    # start from a clean export directory
    if os.path.exists(dir_export):
        shutil.rmtree(dir_export)
    os.mkdir(dir_export)

    if not quiet:
        print 'Exporting plain-text definitions to {0}'.format(dir_export)

    keychar = None
    f = None

    pr = progressbar.ProgressBar(len(defs), quiet)

    for head in defs:
        if len(head) < 1:
            continue

        # definitions are grouped into one file per initial letter;
        # close the previous file before opening the next
        if head[0] != keychar:
            if f is not None:
                f.close()
            keychar = head[0]
            file_export = os.path.join(dir_export, keychar)
            f = open(file_export, 'a')

        f.write('{0}::{1}\n'.format(head.encode('utf8'), defs[head].encode('utf8')))

        pr.advance()

    if f is not None:
        f.close()
def parse_stop_list(lang, name, quiet):
    '''read a stem frequency table; return log-rank scores by lemma'''

    # open stoplist file

    filename = None

    if name == '*':
        filename = os.path.join(fs['data'], 'common', lang + '.stem.freq')
    else:
        filename = os.path.join(fs['data'], 'v3', lang, name,
                                name + '.freq_stop_stem')

    if not quiet:
        print 'Reading stoplist {0}'.format(filename)

    pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)

    try:
        f = codecs.open(filename, encoding='utf_8')
    except IOError as err:
        print "Can't read {0}: {1}".format(filename, str(err))
        sys.exit(1)

    # read stoplist header to get total token count

    head = f.readline()

    m = re.compile(r'#\s+count:\s+(\d+)', re.U).match(head)

    if m is None:
        print "Can't find header in {0}".format(filename)
        sys.exit(1)

    total = int(m.group(1))

    pr.advance(len(head.encode('utf-8')))

    # read the lemmata in descending frequency order;
    # score each one by the log of its rank

    rank = {}
    n = 1

    for line in f:
        lemma, count = line.split('\t')

        lemma = tesslang.standardize(lang, lemma)
        lemma = pat.number.sub('', lemma)

        rank[lemma] = math.log(n)
        n += 1

        pr.advance(len(line.encode('utf-8')))

    return rank
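# A sketch of the stoplist format parse_stop_list expects: a header line with
# the total token count, then tab-separated "lemma<TAB>count" lines in
# descending frequency order. The sample lines below are made up.

import math
import re

lines_demo = [u'# count: 1000', u'et\t250', u'qui\t180', u'sum\t120']

m_demo = re.compile(r'#\s+count:\s+(\d+)', re.U).match(lines_demo[0])
total_demo = int(m_demo.group(1))                 # 1000

rank_demo = {}
for n, line in enumerate(lines_demo[1:], start=1):
    lemma, count = line.split('\t')
    rank_demo[lemma] = math.log(n)                # count itself unused; score by rank

print total_demo, rank_demo                       # 1000 {u'et': 0.0, u'qui': 0.69..., u'sum': 1.09...}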
def parse_stem_dict(lang, quiet):
    '''parse the csv stem dictionaries of Helma Dik'''

    filename = os.path.join(fs['data'], 'common', lang + '.lexicon.csv')

    if not quiet:
        print 'Reading lexicon {0}'.format(filename)

    pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)

    try:
        f = codecs.open(filename, encoding='utf_8')
    except IOError as err:
        print "Can't read {0}: {1}".format(filename, str(err))
        sys.exit(1)

    pos = dict()
    heads = dict()

    for line in f:
        pr.advance(len(line.encode('utf-8')))

        line = line.strip().lower().replace('"', '')

        # each row is: inflected token, morphological code, lemma
        try:
            token, code, lemma = line.split(',')
        except ValueError:
            continue

        lemma = tesslang.standardize(lang, lemma)
        lemma = pat.number.sub('', lemma)

        # the first two characters of a full 10-character code
        # give the part of speech
        if len(code) == 10:
            if lemma in pos:
                pos[lemma].append(code[:2])
            else:
                pos[lemma] = [code[:2]]

        heads[lemma] = 1

    success = 0

    for lemma in heads:
        if lemma in pos:
            success += 1

    if not quiet:
        print 'pos success: {0}%'.format(100.0 * success / len(heads))

    return pos
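# A minimal sketch of the row handling parse_stem_dict performs, using two
# made-up lines in the assumed "token,code,lemma" layout. The 10-character
# morphological codes here are invented for illustration only.

rows_demo = [
    '"amo","v1spia----","amo"',
    '"amor","n-s---mn--","amor"',
]

pos_demo = {}
for line in rows_demo:
    line = line.strip().lower().replace('"', '')
    token, code, lemma = line.split(',')
    if len(code) == 10:                       # full code: first two chars = part of speech
        pos_demo.setdefault(lemma, []).append(code[:2])

print pos_demo                                # {'amo': ['v1'], 'amor': ['n-']}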
def build_corpus(defs, quiet=False):
    '''Create a "corpus" of the type expected by Gensim'''

    if not quiet:
        print 'Generating Gensim-style corpus'

    pr = progressbar.ProgressBar(len(defs), quiet)

    corpus = []

    for lemma in defs:
        pr.advance()
        corpus.append(defs[lemma])

    return corpus
def make_index(defs, quiet=False):
    '''Create two look-up tables: one by id and one by headword'''

    if not quiet:
        print 'Creating indices'

    by_word = {}
    by_id = []

    pr = progressbar.ProgressBar(len(defs), quiet)

    for lemma in defs:
        pr.advance()

        by_id.append(lemma)
        by_word[lemma] = len(by_id) - 1

    return (by_word, by_id)
def make_index(defs, quiet):
    '''Create two look-up tables: one by id and one by headword'''

    if not quiet:
        print 'Creating indices'

    by_word = {}
    by_id = []

    pr = progressbar.ProgressBar(len(defs), quiet)

    for lemma in defs:
        pr.advance()

        by_id.append(lemma)
        by_word[lemma] = len(by_id) - 1

    # save the lookup table

    file_lookup_word = os.path.join(fs['data'], 'synonymy', 'lookup_word.pickle')

    if not quiet:
        print 'Saving index ' + file_lookup_word

    f = open(file_lookup_word, 'wb')
    pickle.dump(by_word, f)
    f.close()

    # save the id lookup

    file_lookup_id = os.path.join(fs['data'], 'synonymy', 'lookup_id.pickle')

    if not quiet:
        print 'Saving index ' + file_lookup_id

    f = open(file_lookup_id, 'wb')
    pickle.dump(by_id, f)
    f.close()
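# A minimal illustration of the relationship between the two indices built
# above: by_word maps a headword to its integer id, by_id maps that id back
# to the headword. The toy headwords are illustrative only.

defs_toy = {'amo': ['love'], 'amor': ['love', 'desire'], 'bellum': ['war']}

by_word_toy = {}
by_id_toy = []
for lemma in defs_toy:
    by_id_toy.append(lemma)
    by_word_toy[lemma] = len(by_id_toy) - 1

for lemma in defs_toy:
    assert by_id_toy[by_word_toy[lemma]] == lemma    # the round trip always holds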
def main():

    #
    # check for options
    #

    parser = argparse.ArgumentParser(
        description='Query the headword similarities matrix')
    parser.add_argument('-q', '--query', metavar='LANG', type=str,
                        choices=["greek", "latin"], default="greek",
                        help='Language to translate from')
    parser.add_argument('-c', '--corpus', metavar='LANG', type=str,
                        choices=["greek", "latin"], default="latin",
                        help='Language to translate to')
    parser.add_argument('-o', '--output', metavar='FILE', type=str,
                        default="trans.csv",
                        help='Destination file')
    parser.add_argument('-t', '--topics', metavar='N', type=int, default=0,
                        help='Reduce to N topics using LSI; 0=disabled')
    parser.add_argument('-r', '--results', metavar='N', type=int, default=2,
                        help='Max number of results to produce for each query')
    parser.add_argument('-w', '--weight', metavar='F', type=float, default=0,
                        help='Weight scores by inverse log-rank, coefficient F.'
                             ' Suggested range 0-1. Default is no weighting')
    parser.add_argument('--child', metavar='I:N', type=validate_arg_child,
                        default=None,
                        help='This is child I of N; only do part of the work')
    parser.add_argument('--quiet', action='store_const', const=1,
                        help="Don't print status messages to stderr")

    opt = parser.parse_args()

    #
    # load data created by read_lexicon.py
    #

    # the index by word
    by_word = load_dict('lookup_word.json', opt.quiet)

    # the index by id
    by_id = np.array(load_dict('lookup_id.json', opt.quiet))

    # the corpus
    corpus = load_dict("defs_bow.json", opt.quiet)

    #
    # use gensim to calculate similarities
    #

    # NOTE: When similarities.Similarity is asked for a fixed number of
    # similarities (num_best), its results differ from what you get if you
    # leave that parameter out (i.e. calculate for all documents). With no
    # num_best, each query yields a numpy array with one element per document,
    # in corpus order, where each element is that document's similarity score.
    # With num_best set, each query yields a list of (document position,
    # score) tuples, apparently always in order of decreasing score.
    #
    # Older versions of this script expected the list of tuples, but didn't
    # assume any order and re-sorted them by score. The current version
    # expects the numpy array. Thanks to numpy, that array can be used like a
    # vector in R. For example,
    #     sims = sims[mask]
    # subsets sims using another array, this time of boolean values. Likewise,
    #     sims -= np.absolute((rank[q_id] - rank[mask]) * opt.weight)
    # subtracts from every element of sims the absolute difference between one
    # specific rank, that of document q_id, and each element of rank selected
    # by the mask. The arrays rank and sims, each subset by the same mask,
    # have the same number of elements.
    #
    # (A self-contained toy version of this pipeline and of these numpy
    # idioms follows main(), below.)
    # create dictionary

    if not opt.quiet:
        print 'Creating dictionary'

    dictionary = corpora.Dictionary(corpus)

    # convert each sample to a bag of words

    if not opt.quiet:
        print 'Converting each doc to bag-of-words'

    corpus = [dictionary.doc2bow(doc) for doc in corpus]

    # calculate tf-idf scores

    if not opt.quiet:
        print 'Creating tf-idf model'

    tfidf = models.TfidfModel(corpus)

    if not opt.quiet:
        print 'Transforming the corpus to tf-idf'

    corpus_tfidf = tfidf[corpus]

    # perform lsi transformation

    corpus_final = corpus_tfidf

    if opt.topics > 0:
        if not opt.quiet:
            print 'Performing LSI with {0} topics'.format(opt.topics)

        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                              num_topics=opt.topics)
        corpus_final = lsi[corpus_tfidf]

    # calculate similarities

    if not opt.quiet:
        print 'Calculating similarities (please be patient)'

    dir_calc = os.path.join(fs['data'], 'synonymy', 'sims')

    # num_features is the dimensionality of the vectors:
    # the number of LSI topics, or the vocabulary size for plain tf-idf
    num_features = opt.topics if opt.topics > 0 else len(dictionary)

    index = similarities.Similarity(dir_calc, corpus_final, num_features)

    # consider frequency distribution

    rank = load_ranks(by_id, opt.quiet)

    # determine translation candidates, write output

    file_out = codecs.open(opt.output, "w", encoding="utf_8")

    if not opt.quiet:
        print 'Writing translation candidates to {0}'.format(opt.output)

    # optional filter by language

    filter = np.array([r is not None for r in rank])

    if opt.corpus == "latin":
        filter = filter & np.invert(np.array([is_greek(lem) for lem in by_id]))
    elif opt.corpus == "greek":
        filter = filter & np.array([is_greek(lem) for lem in by_id])

    # take each headword in turn as a query

    pr = progressbar.ProgressBar(len(by_word), opt.quiet)

    results = []

    for q_id, sims in enumerate(index):
        pr.advance()

        q = by_id[q_id]

        if opt.query == "greek" and not is_greek(q):
            continue
        if opt.query == "latin" and is_greek(q):
            continue
        if rank[q_id] is None:
            continue

        # if child, only do every ith query
        if opt.child is not None:
            child_id, nchildren = opt.child
            if q_id % nchildren != child_id % nchildren:
                continue

        # exclude the query word itself, without permanently
        # altering the shared language filter
        mask = filter.copy()
        mask[q_id] = False

        # apply filter
        sims = sims[mask]

        # apply distribution difference metric
        sims -= np.absolute((rank[q_id] - rank[mask]) * opt.weight)

        # add result words and sort by score
        sims = zip(by_id[mask], sims)
        sims = sorted(sims, key=lambda res: res[1], reverse=True)

        results = [u"{0}:{1}".format(res, sim) for res, sim in sims[:opt.results]]

        file_out.write(u"{0},".format(q))
        file_out.write(u",".join(results))
        file_out.write(u"\n")

    file_out.close()
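# A self-contained sketch of the pipeline main() follows, on a toy corpus of
# three "definitions", together with the numpy masking idioms described in the
# note above. All data here is made up; MatrixSimilarity stands in for the
# sharded similarities.Similarity class because the toy corpus fits in memory.

import numpy as np
from gensim import corpora, models, similarities

docs_demo = [['love', 'desire'], ['war', 'battle'], ['love', 'war']]
by_id_demo = np.array(['amor', 'bellum', 'amo'])
rank_d = np.array([0.0, 0.7, 1.1])                     # toy log-rank scores
weight_demo = 0.5

dict_demo = corpora.Dictionary(docs_demo)              # token <-> id map
bow_demo = [dict_demo.doc2bow(d) for d in docs_demo]   # sparse bag-of-words vectors
tfidf_demo = models.TfidfModel(bow_demo)               # fit idf weights
index_demo = similarities.MatrixSimilarity(tfidf_demo[bow_demo],
                                            num_features=len(dict_demo))

for q_id, sims in enumerate(index_demo):               # sims: one score per document
    mask = np.ones(len(by_id_demo), dtype=bool)
    mask[q_id] = False                                 # exclude the query itself
    sims = sims[mask]                                  # boolean subsetting
    sims -= np.absolute((rank_d[q_id] - rank_d[mask]) * weight_demo)
    print by_id_demo[q_id], zip(by_id_demo[mask], sims)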
def parse_XML_dictionaries(langs, quiet=False):
    '''Create a dictionary of english translations for each lemma'''

    defs = dict()

    # process latin, greek lexica in turn

    for lang in langs:
        filename = os.path.join(fs['data'], 'common', lang + '.lexicon.xml')

        if not quiet:
            print 'Reading lexicon {0}'.format(filename)

        pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)

        try:
            f = codecs.open(filename, encoding='utf_8')
        except IOError as err:
            print "Can't read {0}: {1}".format(filename, str(err))
            sys.exit(1)

        #
        # Each line in the lexicon is one entry.
        # Process one at a time to extract headword, definition.
        #

        for line in f:
            pr.advance(len(line.encode('utf-8')))

            # skip lines that don't conform with the expected entry structure
            m = pat.entry.search(line)

            if m is None:
                continue

            lemma, entry = m.group(1, 2)

            # standardize the headword
            lemma = pat.clean[lang].sub('', lemma)
            lemma = pat.number.sub('', lemma)
            lemma = tesslang.standardize(lang, lemma)

            # remove elements on the stoplist
            for stop in pat.stop:
                entry = stop.sub('', entry)

            # transliterate betacode to unicode chars in foreign tags
            entry = pat.foreign.sub(mo_beta2uni, entry)

            # extract strings marked as translations of the headword
            def_strings = pat.definition[lang].findall(entry)

            # drop empty defs
            def_strings = [d for d in def_strings if not d.isspace()]

            # skip lemmata for which no translation can be extracted
            if not def_strings:
                continue

            if lemma in defs and defs[lemma] is not None:
                defs[lemma].extend(def_strings)
            else:
                defs[lemma] = def_strings

    if not quiet:
        print 'Read {0} entries'.format(len(defs))
        print 'Flattening entries with multiple definitions'

    pr = progressbar.ProgressBar(len(defs), quiet)

    empty_keys = set()

    for lemma in defs:
        pr.advance()

        if defs[lemma] is None or defs[lemma] == []:
            empty_keys.add(lemma)
            continue

        defs[lemma] = '; '.join(defs[lemma])

    if not quiet:
        print 'Lost {0} empty definitions'.format(len(empty_keys))

    for k in empty_keys:
        del defs[k]

    if "" in defs:
        del defs[""]

    return defs
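# A self-contained sketch of the re.sub-with-callback idiom that
# pat.foreign.sub(mo_beta2uni, entry) relies on above. The pattern, the tag
# name, and the tiny betacode table below are illustrative only.

import re

foreign_demo = re.compile(r'<foreign>(.*?)</foreign>')
beta2uni_demo = {'a': u'\u03b1', 'b': u'\u03b2', 'g': u'\u03b3'}

def mo_demo(match):
    '''replace the matched betacode span with unicode characters'''
    return u''.join(beta2uni_demo.get(c, c) for c in match.group(1))

entry_demo = u'the letter <foreign>b</foreign> of the alphabet'
print foreign_demo.sub(mo_demo, entry_demo).encode('utf8')    # the letter β of the alphabet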
def main():

    #
    # check for options
    #

    parser = argparse.ArgumentParser(
        description='Query the headword similarities matrix')
    parser.add_argument('-n', '--results', metavar='N', default=2, type=int,
                        help='Display top N results')
    parser.add_argument('-t', '--translate', metavar='MODE', default=1, type=int,
                        help='Translation mode: 1=Greek to Latin; 2=Latin to Greek')
    parser.add_argument('-l', '--lsi', action='store_const', const=1,
                        help='Use LSI to reduce dimensionality')
    parser.add_argument('-f', '--feature', metavar='FEAT', default='trans2', type=str,
                        help='Name of feature dictionary to create')
    parser.add_argument('-c', '--cutoff', metavar='C', default=None, type=float,
                        help='Similarity threshold for synonymy (range: 0-1)')
    parser.add_argument('-w', '--weighted', action='store_const', const=1,
                        help='Weight results by rank difference from query')
    parser.add_argument('--scores', action='store_const', const=1,
                        help='Export scores along with translations')
    parser.add_argument('-q', '--quiet', action='store_const', const=1,
                        help="Don't print status messages to stderr")

    opt = parser.parse_args()

    if opt.translate not in [1, 2]:
        opt.translate = 0

    #
    # load data created by read_lexicon.py
    #

    # the text-only defs
    # global full_def
    #
    # full_def = load_dict('full_defs.pickle', opt.quiet)

    # the index by word
    global by_word
    by_word = load_dict('lookup_word.pickle', opt.quiet)

    # the index by id
    global by_id
    by_id = load_dict('lookup_id.pickle', opt.quiet)

    # the corpus
    global corpus

    if opt.lsi is None:
        file_corpus = os.path.join(fs['data'], 'synonymy', 'gensim.corpus_tfidf.mm')
    else:
        file_corpus = os.path.join(fs['data'], 'synonymy', 'gensim.corpus_lsi.mm')

    if not opt.quiet:
        print 'Loading corpus ' + file_corpus

    corpus = corpora.MmCorpus(file_corpus)

    # the similarities index
    global index

    file_index = os.path.join(fs['data'], 'synonymy', 'gensim.index')

    if not opt.quiet:
        print 'Loading similarity index ' + file_index

    index = similarities.Similarity.load(file_index)

    # optional: consider frequency distribution
    global rank

    if opt.weighted == 1:
        rank = dict(parse_stop_list('la', '*', opt.quiet),
                    **parse_stop_list('grc', '*', opt.quiet))

    #
    # determine translation candidates, write output
    #

    if not opt.quiet:
        print 'Exporting dictionary'

    filename_csv = os.path.join(fs['data'], 'synonymy', opt.feature + '.csv')

    file_output = codecs.open(filename_csv, 'w', encoding='utf_8')

    pr = progressbar.ProgressBar(len(by_word), opt.quiet)

    # take each headword in turn as a query

    for q in by_word:
        pr.advance()

        # mode 1 keeps Greek queries, mode 2 keeps Latin queries;
        # mode 0 keeps everything
        if opt.translate and (is_greek(q) == opt.translate - 1):
            continue

        if q not in by_word:
            continue

        # query the similarity matrix
        sims = get_results(q)

        # filter out query word, query language
        sims = filter_results(sims, q, opt.translate)

        # optional: apply distribution difference metric
        if opt.weighted == 1:
            sims = apply_freq_diff(sims, q)

        # keep only the best results, top n or above cutoff
        sims = cull(sims, opt.results, opt.cutoff)

        # print row
        export_row(file_output, q, sims, opt.scores)

    file_output.close()
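# A hypothetical sketch of what a cull-style helper might look like: keep the
# top-N results and, if a cutoff is given, drop anything scoring below it.
# The actual cull() used above is defined elsewhere in this script; this is
# only an illustration of the behaviour described in the comment.

def cull_sketch(sims, n, cutoff=None):
    '''sims is a list of (word, score) pairs'''
    sims = sorted(sims, key=lambda res: res[1], reverse=True)[:n]
    if cutoff is not None:
        sims = [(w, s) for w, s in sims if s >= cutoff]
    return sims

print cull_sketch([(u'amor', 0.8), (u'bellum', 0.2), (u'ira', 0.5)], 2, 0.4)
# [(u'amor', 0.8), (u'ira', 0.5)]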