# NOTE(review): truncated fragment of a CLI script, collapsed onto one line by a
# whitespace-mangling paste. It starts without its imports (argparse, Dictionary,
# load_mat, SVDModel, check_input_file_exists are all used but never imported
# here) and ends mid-call inside the SVDModel(...) constructor. What is visible:
# it parses matrix/dictionary/target_words/output_file arguments, validates that
# the input files exist, loads a Dictionary, reads one target word per line, and
# begins constructing an SVDModel from data['U'] and data['s'] with
# caron_p=0.15. Recover the complete script (L13 appears to be a fuller copy)
# before editing behavior.
parser = argparse.ArgumentParser(description='TODO') parser.add_argument('matrix', type=str, help='File containing U and S matrix in npz format') parser.add_argument('dictionary', type=str, help='File with saved dictionary') parser.add_argument( 'target_words', type=str, help='File containing targets word, each word on one line') parser.add_argument('output_file', type=str, help='Name of the output file') args = parser.parse_args() check_input_file_exists(args.matrix) check_input_file_exists(args.dictionary) check_input_file_exists(args.target_words) dictionary = Dictionary(filename=args.dictionary) words = [] with open(args.target_words) as f: for line in f: words.append(line.strip()) data = load_mat(args.matrix) model = SVDModel(data['U'], data['s'], dictionary, caron_p=0.15,
#!/usr/bin/env python3
"""Filter a plain-text corpus, keeping only words found in a dictionary."""
from synonyms.dictionary import Dictionary
from synonyms.in_out.readers import open_gz

__author__ = 'veselt12'

import argparse

from synonyms.in_out.utils import check_input_file_exists

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="TODO")
    parser.add_argument('input_file', type=str,
                        help='Input file with corpus in plain text')
    parser.add_argument('dictionary', type=str,
                        help='Input file with dictionary')
    parser.add_argument('output_file', type=str,
                        help='Output file where filtered version of corpus will be stored')
    args = parser.parse_args()

    check_input_file_exists(args.input_file)
    check_input_file_exists(args.dictionary)

    dictionary = Dictionary(filename=args.dictionary)
    # Stream the (possibly gzipped) corpus line by line; keep only the
    # lower-cased tokens that the dictionary contains.
    with open_gz(args.output_file, 'w+', encoding='utf-8') as writer, \
            open_gz(args.input_file, encoding='utf-8') as reader:
        for line in reader:
            kept = [word for word in line.lower().split() if word in dictionary]
            writer.write(' '.join(kept) + '\n')
# NOTE(review): truncated fragment of a Python-2-era evaluation script (uses the
# `unicode` type and u'' literals) collapsed onto one line. `argparse` and
# `check_input_file_exists` are used but not imported in the visible part, and
# the text ends mid-loop right after `model.caron_p = caron_p` — the body that
# presumably evaluates the model and writes results is missing. Visible
# behavior: loads U and s via scipy.io.mmread, builds an SVDModel, and sweeps
# caron_p over a fixed list while writing to output_file. L19 is a near-identical
# copy of this fragment. Recover the full script before editing.
from synonyms.dictionary import Dictionary from synonyms.evaluation.test import Test from synonyms.synonyms import SVDModel from io import open from scipy.io import mmread if __name__ == u'__main__': parser = argparse.ArgumentParser(description=u'TODO') parser.add_argument(u'u', type=unicode, help=u'File ') parser.add_argument(u's', type=unicode, help=u'File ') parser.add_argument(u'dictionary', type=unicode, help=u'File with saved dictionary') parser.add_argument(u'tests', type=unicode, help=u'File with saved tests') parser.add_argument(u'output_file', type=unicode, help=u'Name of the output file') parser.add_argument(u'--verbose', action=u'store_true', default=False) args = parser.parse_args() check_input_file_exists(args.s) check_input_file_exists(args.u) check_input_file_exists(args.dictionary) check_input_file_exists(args.tests) dictionary = Dictionary(filename=args.dictionary) tests = Test.load_tests(args.tests) with open(args.u) as uu, open(args.s) as ss: u = mmread(uu) s = mmread(ss) model = SVDModel(u, s, dictionary) with open(args.output_file, u'w') as file: file.write(u'# caron_p dimensions r_precision ndcg\n') for caron_p in [0.1, 0.15, 0.2, 0.25, 0.35, 0.5, 0.7, 1, 1.2, 1.5, 2]: model.caron_p = caron_p
# NOTE(review): truncated fragment — this line starts in the middle of an
# ArgumentParser(...) call (the `description=` keyword argument), so the
# parser construction, shebang and imports are missing from view. The visible
# tail parses input/output paths plus -k/-t/-r switches and delegates to
# syn2_to_plain(...). L11 contains what appears to be the complete version of
# this same script; prefer editing that copy.
description="Converts syn_v2 corpus in xml format to plain txt format") parser.add_argument('input_file', type=str, help='Input file with syn_v2 corpus in xml format') parser.add_argument( 'output_file', type=str, help='Output file where plain version of corpus will be stored') parser.add_argument('-k', '--keep-punctuation', dest='keep', action='store_true', default=False, help='Use this switch if you want to keep punctuation') parser.add_argument('-t', '--keep-tags', dest='keep_tags', action='store_true', default=False, help='Use this switch if you want to keep tags') parser.add_argument( '-r', '--raw', dest='raw', action='store_true', default=False, help='Use this switch if you want to keep output raw text') args = parser.parse_args() check_input_file_exists(args.input_file) syn2_to_plain(args.input_file, args.output_file, args.keep, args.keep_tags, args.raw)
# NOTE(review): truncated fragment. Only `import sys` survives of the import
# block (argparse, Dictionary, load_mat, SVDModel, check_input_file_exists are
# all used but not imported here), and the text ends mid-call in
# `model.get_synonyms(word,` — the remainder of the per-word loop is missing.
# Visible behavior: loads data['U']/data['s'] from a .npz via load_mat, builds
# an SVDModel, fixes caron_p=0.25 and dimensions=400, then iterates over every
# dictionary word printing a progress counter. Recover the full script before
# editing; note the dead commented-out mmread lines could be dropped then.
import sys if __name__ == '__main__': parser = argparse.ArgumentParser(description='TODO') parser.add_argument('matrix', type=str, help='File containing U and S matrix in npz format') parser.add_argument('dictionary', type=str, help='File with saved dictionary') parser.add_argument('output_file', type=str, help='Name of the output file') parser.add_argument('--verbose', action='store_true', default=False) args = parser.parse_args() check_input_file_exists(args.matrix) check_input_file_exists(args.dictionary) dictionary = Dictionary(filename=args.dictionary) # with open(args.u) as uu, open(args.s) as ss: # u = mmread(args.u) # s = mmread(args.s) data = load_mat(args.matrix) model = SVDModel(data['U'], data['s'], dictionary) with open(args.output_file, 'w') as file: model.caron_p = 0.25 model.dimensions = 400 count = 0 for word in dictionary.all_words(): print("\r %d" % count, end='') synonyms, score = model.get_synonyms(word,
# NOTE(review): truncated fragment. It opens with a bare
# `return synonyms_map_cutoff` — the tail of a function whose definition is not
# visible — then a main block that loads a dictionary, strips its word keys via
# strip_word (not imported here), and parses a "target : syn1 syn2 ..." synonyms
# file into synonyms_map. It ends right after filling synonyms_map; whatever
# uses stop_list/output_file afterwards is cut off. L9 appears to be a longer
# copy of the same script — recover that before editing. The bare
# `dictionary.size()` call's result is discarded; presumably called for a side
# effect — TODO confirm against Dictionary.size().
return synonyms_map_cutoff if __name__ == '__main__': parser = argparse.ArgumentParser(description='TODO') parser.add_argument('synonyms', type=str, help='File ') parser.add_argument('stop_list', type=str, help='File ') parser.add_argument('dictionary', type=str, help='File with saved dictionary') parser.add_argument('output_file', type=str, help='Name of the output file') parser.add_argument('--verbose', action='store_true', default=False) args = parser.parse_args() check_input_file_exists(args.synonyms) check_input_file_exists(args.stop_list) check_input_file_exists(args.dictionary) dictionary = Dictionary(filename=args.dictionary) dictionary.size() dictionary.word2count = { strip_word(word): count for word, count in dictionary.word2count.items() } synonyms_map = {} with open(args.synonyms) as file: for line in file: # print(line.split(' : ')) target, synonyms = line.strip().split(' : ', 1) synonyms = synonyms.strip().split(' ') synonyms_map[target] = synonyms
# NOTE(review): truncated fragment of an evaluation script. The imports and
# argument parsing are complete, but the text ends inside the nested loop at
# `model.to_one = to_one` — the evaluation/writing body, and presumably the
# whole 'Normal'-model branch of the output section, are missing. Visible
# behavior: loads U/s from a .npz, builds either SVDToOneModel or SVDModel
# depending on the positional `model` choice, loads saved tests, and (for
# 'ToOne') sweeps the to_one parameter over 10..100 while writing results with
# the header '# to_one dimensions r_precision ndcg'. Recover the full script
# before changing behavior.
import argparse from synonyms.in_out.utils import check_input_file_exists, load_mat from synonyms.dictionary import Dictionary from synonyms.evaluation.test import Test from synonyms.synonyms import SVDModel, SVDToOneModel if __name__ == '__main__': parser = argparse.ArgumentParser(description='TODO') parser.add_argument('matrix', type=str, help='File containing U and S matrix in npz format') parser.add_argument('dictionary', type=str, help='File with saved dictionary') parser.add_argument('tests', type=str, help='File with saved tests') parser.add_argument('output_file', type=str, help='Name of the output file') parser.add_argument('model', choices=['Normal', 'ToOne'], help='Context type') parser.add_argument('--verbose', action='store_true', default=False) args = parser.parse_args() check_input_file_exists(args.matrix) check_input_file_exists(args.dictionary) check_input_file_exists(args.tests) dictionary = Dictionary(filename=args.dictionary) tests = Test.load_tests(args.tests) data = load_mat(args.matrix) if args.model == 'ToOne': model = SVDToOneModel(data['U'], data['s'], dictionary) else: model = SVDModel(data['U'], data['s'], dictionary) with open(args.output_file, 'w') as file: if args.model == 'ToOne': file.write('# to_one dimensions r_precision ndcg\n') for to_one in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: model.to_one = to_one
# NOTE(review): almost certainly a truncated fragment of a corpus-splitting
# script. The visible part sets up word_count/file_count/line_1/end_of_file and
# `max_count = args.word_count`, yet none of those are updated in the loop we
# can see, and the loop body ends abruptly at a `continue` — the branch that
# writes regular lines, rotates output files at max_count words, and closes
# `output` is missing. Visible behavior: reads the corpus via open_gz, lazily
# opens a gzipped output shard named output_file + zero-padded file_count
# (width = postfix_length) + '.gz', and forwards blank lines. Note `input` is
# shadowed by the with-target; also the `line == '\n'` test can never be true
# after `line = line.strip()` — both look like defects to fix once the full
# script is recovered.
#!/usr/bin/env python3 from synonyms.in_out.readers import open_gz __author__ = 'veselt12' import argparse from synonyms.in_out.utils import check_input_file_exists if __name__ == '__main__': parser = argparse.ArgumentParser(description="Computes counts and ppmi matrix for given corpus and dictionary") parser.add_argument('corpus', type=str, help='Corpus') parser.add_argument('word_count', type=int, help='Word count') parser.add_argument('postfix_length', type=int) parser.add_argument('output_file', type=str, help='Name of the output file') args = parser.parse_args() check_input_file_exists(args.corpus) max_count = args.word_count with open_gz(args.corpus) as input: word_count = 0 file_count = 0 line_1 = None end_of_file = False output = None for line in input: line = line.strip() if not output: output = open_gz(args.output_file + ('.%0' + str(args.postfix_length) + 'd') % file_count + '.gz', 'w') if line == '\n': output.write('\n') continue
# NOTE(review): truncated fragment, cut at both ends. It begins mid-`if` inside
# a cutoff-building function (appending unique non-target synonyms and
# returning synonyms_map_cutoff), then runs a main block that loads a
# dictionary, normalizes its word2count keys via strip_word (not imported in
# view), parses a "target : syn1 syn2 ..." synonyms file, and starts reading a
# stop-list file — ending mid `for line in file:` with no body. L6 is a shorter
# copy of the same script. Recover the complete file before editing behavior.
if synonym not in synonyms_cutoff and synonym != target: synonyms_cutoff.append(synonym) if len(synonyms_cutoff) != 0: synonyms_map_cutoff[target] = synonyms_cutoff return synonyms_map_cutoff if __name__ == '__main__': parser = argparse.ArgumentParser(description='TODO') parser.add_argument('synonyms', type=str, help='File ') parser.add_argument('stop_list', type=str, help='File ') parser.add_argument('dictionary', type=str, help='File with saved dictionary') parser.add_argument('output_file', type=str, help='Name of the output file') parser.add_argument('--verbose', action='store_true', default=False) args = parser.parse_args() check_input_file_exists(args.synonyms) check_input_file_exists(args.stop_list) check_input_file_exists(args.dictionary) dictionary = Dictionary(filename=args.dictionary) dictionary.size() dictionary.word2count = {strip_word(word): count for word, count in dictionary.word2count.items()} synonyms_map = {} with open(args.synonyms) as file: for line in file: # print(line.split(' : ')) target, synonyms = line.strip().split(' : ', 1) synonyms = synonyms.strip().split(' ') synonyms_map[target] = synonyms stop_list = set() with open(args.stop_list) as file: for line in file:
# NOTE(review): truncated fragment — it starts in the middle of an
# add_argument(...) call (the `choices=range(1, 4)` keyword for what is
# presumably a context_size argument; parser construction and imports are out
# of view). The visible tail processes multiple numbered corpus shards: for
# each corpus it extracts the shard number with regex ".*\.([0-9]*)\..*",
# builds a reset copy of the dictionary for that shard, computes a counts
# matrix via create_counts, and saves both the per-shard dictionary
# ('.dict') and counts matrix ('.counts') under output_file + '.' + number.
# Recover the script head before editing.
choices=range(1, 4), help='Context size') parser.add_argument( 'context_type', choices=[CONTEXT_TYPE.L, CONTEXT_TYPE.R, CONTEXT_TYPE.LR], help='Context type') parser.add_argument('output_file', type=str, help='Name of the output file') parser.add_argument('corpuses', type=str, nargs='+', help='Corpus in plain format') args = parser.parse_args() for corpus in args.corpuses: check_input_file_exists(corpus) check_input_file_exists(args.dictionary) dictionary = Dictionary(filename=args.dictionary) regex = re.compile(".*\.([0-9]*)\..*") for corpus in args.corpuses: number = regex.findall(corpus)[0] print('Processing:' + corpus) new_filename = args.output_file + '.' + number new_dict = dictionary.create_reseted_copy(corpus) counts_matrix = create_counts(corpus, new_dict, args.context_size, args.context_type) print(counts_matrix.shape) new_dict.save(new_filename + '.dict') save_mat(counts_matrix, new_filename + '.counts')
#!/usr/bin/env python3
"""Convert a syn_v2 corpus from XML to plain text."""
__author__ = 'veselt12'

import argparse

from synonyms.in_out.utils import check_input_file_exists
from synonyms.corpus.utils import syn2_to_plain

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Converts syn_v2 corpus in xml format to plain txt format")
    parser.add_argument('input_file', type=str,
                        help='Input file with syn_v2 corpus in xml format')
    parser.add_argument('output_file', type=str,
                        help='Output file where plain version of corpus will be stored')
    parser.add_argument('-k', '--keep-punctuation', dest='keep',
                        action='store_true', default=False,
                        help='Use this switch if you want to keep punctuation')
    parser.add_argument('-t', '--keep-tags', dest='keep_tags',
                        action='store_true', default=False,
                        help='Use this switch if you want to keep tags')
    parser.add_argument('-r', '--raw', dest='raw',
                        action='store_true', default=False,
                        help='Use this switch if you want to keep output raw text')
    args = parser.parse_args()

    check_input_file_exists(args.input_file)
    # All conversion logic lives in the corpus utility; this script is only
    # the CLI front end.
    syn2_to_plain(args.input_file, args.output_file, args.keep,
                  args.keep_tags, args.raw)
#!/usr/bin/env python3
"""Build a part-of-speech model from a data file and save it to disk."""
from synonyms.part_of_speech.model import POS

__author__ = 'veselt12'

import argparse

from synonyms.in_out.utils import check_input_file_exists

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Creates pos model for data")
    parser.add_argument('data', type=str, help='Data')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()

    check_input_file_exists(args.data)
    # Train on the input data, then persist the resulting model.
    pos_model = POS.create(args.data)
    pos_model.save(args.output_file)
# NOTE(review): truncated fragment — the script head is intact (shebang,
# imports, argument parsing, file checks, dictionary + target-word loading,
# SVDModel construction with caron_p=0.15 and dimensions=2500) but the text
# ends at `for target_word in words:` with no loop body; the part that queries
# the model and writes results (and whatever uses the Test import and the
# counter `i`) is missing. L1 is a shorter copy of the same head. Recover the
# full script before editing behavior.
#!/usr/bin/env python3 __author__ = 'veselt12' import argparse from synonyms.in_out.utils import check_input_file_exists, load_mat from synonyms.dictionary import Dictionary from synonyms.evaluation.test import Test from synonyms.synonyms import SVDModel if __name__ == '__main__': parser = argparse.ArgumentParser(description='TODO') parser.add_argument('matrix', type=str, help='File containing U and S matrix in npz format') parser.add_argument('dictionary', type=str, help='File with saved dictionary') parser.add_argument('target_words', type=str, help='File containing targets word, each word on one line') parser.add_argument('output_file', type=str, help='Name of the output file') args = parser.parse_args() check_input_file_exists(args.matrix) check_input_file_exists(args.dictionary) check_input_file_exists(args.target_words) dictionary = Dictionary(filename=args.dictionary) words = [] with open(args.target_words) as f: for line in f: words.append(line.strip()) data = load_mat(args.matrix) model = SVDModel(data['U'], data['s'], dictionary, caron_p=0.15, dimensions=2500) i = 1 with open(args.output_file, 'w') as file: for target_word in words:
#!/usr/bin/env python3
"""Compute co-occurrence counts and a PPMI matrix for a corpus + dictionary."""
__author__ = 'veselt12'

import argparse

from synonyms.in_out.utils import check_input_file_exists, save_mat
from synonyms.ppmi import create_counts, transform, CONTEXT_TYPE
from synonyms.dictionary import Dictionary

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Computes counts and ppmi matrix for given corpus and dictionary")
    parser.add_argument('corpus', type=str, help='Corpus in plain format')
    parser.add_argument('dictionary', type=str, help='File with saved dictionary')
    parser.add_argument('context_size', type=int, choices=range(1, 4),
                        help='Context size')
    parser.add_argument('context_type',
                        choices=[CONTEXT_TYPE.L, CONTEXT_TYPE.R, CONTEXT_TYPE.LR],
                        help='Context type')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()

    check_input_file_exists(args.corpus)
    check_input_file_exists(args.dictionary)

    dictionary = Dictionary(filename=args.dictionary)
    # Raw co-occurrence counts first, then the PPMI transform of those counts;
    # each result is saved under output_file with its own suffix.
    counts_matrix = create_counts(args.corpus, dictionary,
                                  args.context_size, args.context_type)
    save_mat(counts_matrix, args.output_file + '.counts')
    ppmi_matrix = transform(counts_matrix, dictionary)
    save_mat(ppmi_matrix, args.output_file + '.ppmi')
#!/usr/bin/env python3
"""Intersect two saved test sets and save the common subset."""
__author__ = 'veselt12'

import argparse

from synonyms.in_out.utils import check_input_file_exists
from synonyms.evaluation.test import Test

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('tests1', type=str, help='File with saved tests1')
    parser.add_argument('tests2', type=str, help='File with saved tests2')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()

    check_input_file_exists(args.tests1)
    check_input_file_exists(args.tests2)

    first = Test.load_tests(args.tests1)
    second = Test.load_tests(args.tests2)
    # Only tests present in both input sets survive the intersection.
    intersection = Test.intersect(first, second)
    Test.save_tests(intersection, args.output_file)
# NOTE(review): truncated fragment — it begins mid-script with
# parser.add_argument calls (the parser construction and imports for argparse,
# np, Dictionary, Test, SVDModelMemMap, check_input_file_exists are out of
# view) and ends inside a commented-out parameter-sweep block. Visible
# behavior: loads U and s with np.load(..., mmap_mode='r') so the large
# matrices stay memory-mapped rather than resident, builds an SVDModelMemMap,
# and prints 'model loaded'; the actual evaluation loop is entirely commented
# out here. Recover the complete script before editing.
parser.add_argument('matrix_U', type=str, help='File containing U matrix in npy format') parser.add_argument('matrix_s', type=str, help='File containing S matrix in npy format') parser.add_argument('dictionary', type=str, help='File with saved dictionary') parser.add_argument('tests', type=str, help='File with saved tests') parser.add_argument('output_file', type=str, help='Name of the output file') parser.add_argument('--verbose', action='store_true', default=False) args = parser.parse_args() check_input_file_exists(args.matrix_U) check_input_file_exists(args.matrix_s) check_input_file_exists(args.dictionary) check_input_file_exists(args.tests) dictionary = Dictionary(filename=args.dictionary) tests = Test.load_tests(args.tests) U = np.load(args.matrix_U, mmap_mode='r') s = np.load(args.matrix_s, mmap_mode='r') model = SVDModelMemMap(U, s, dictionary) print('model loaded') #with open(args.output_file, 'w') as file: # file.write('# caron_p dimensions r_precision ndcg\n') # for caron_p in [0.15, 0.25, 0.35, 0.5, 0.7, 1, 1.2, 1.5, 2]: # model.caron_p = caron_p # for dimensions in [2000, 1000, 500, 200, 100]: #[10000, 8000, 6000, 4000, 2500, 2000, 1000, 500, 200, 100]:
#!/usr/bin/env python3
"""Transform saved co-occurrence count matrices into PPMI matrices.

For every '<name>.counts.mat' path given on the command line a matching
'<name>.dict' dictionary file must exist; the PPMI result of each matrix
is written to '<name>.ppmi'.
"""
__author__ = 'veselt12'

import argparse

from synonyms.in_out.utils import check_input_file_exists, save_mat, load_mat
from synonyms.ppmi import create_counts, transform, CONTEXT_TYPE
from synonyms.dictionary import Dictionary
import re
import os

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Computes counts and ppmi matrix for given corpus and dictionary")
    parser.add_argument('matrices', type=str, nargs='+', help='Matrices')
    args = parser.parse_args()

    dictionaries = []
    matrices = args.matrices
    # Validate every matrix/dictionary pair up front so a missing file fails
    # fast, before any expensive transform has started.
    for matrix in args.matrices:
        dictionary_file = matrix.replace('.counts.mat', '.dict')
        check_input_file_exists(matrix)
        check_input_file_exists(dictionary_file)
        dictionaries.append(dictionary_file)

    for matrix, dictionary_file in zip(matrices, dictionaries):
        print('Processing:' + matrix)
        new_filename = matrix.replace('.counts.mat', '.ppmi')
        mat = load_mat(matrix)
        # FIX: the original bound this object to the name `dict`, shadowing
        # the builtin `dict` for the rest of the loop body.
        word_dictionary = Dictionary(dictionary_file)
        ppmi_matrix = transform(mat, word_dictionary)
        save_mat(ppmi_matrix, new_filename)
# NOTE(review): truncated fragment, cut at both ends. It opens with a bare
# `return AA` (tail of an unseen function), defines filter(rating) — which
# shadows the builtin `filter` and strips the 'count' field from each rating
# entry in place — and then starts a main block that parses a JSON web log into
# rating/antonyms structures. The text ends mid-loop right after building
# `entry` for 'rating'/'antonyms' typed records; whatever accumulates entries
# and writes output_file is missing. Note `type` and `entry` are rebound inside
# the loop (`type` shadows the builtin, `entry` clobbers the loop variable) —
# candidates for renaming once the full script is recovered.
return AA def filter(rating): for word_id, data in rating.items(): for word, d in data['rating'].items(): del d['count'] if __name__ == '__main__': parser = argparse.ArgumentParser(description="Converts web log to tests") parser.add_argument('web_log', type=str, help='Web log in json') parser.add_argument('dictionary', type=str, help='Dictionary file') parser.add_argument('output_file', type=str, help='Name of the output file') args = parser.parse_args() check_input_file_exists(args.web_log) check_input_file_exists(args.dictionary) dictionary = Dictionary(args.dictionary) rating = {} antonyms = {} with open(args.web_log, encoding='utf-8') as file: web_log = json.load(file) for entry in web_log: word_d = entry['word'] words = entry['priority'] type = [x.strip() for x in entry['type'].split(':')] words = [x.strip() for x in words.split(':') if x not in [None, " ", ""]] entry = {} if type[0] in ['rating', 'antonyms']: words_sequence = {word: {'rel': rel, 'count': 1} for rel, word in zip(range(len(words), 0, -1), words)} entry = {'word': word_d, 'rating': words_sequence}
# NOTE(review): truncated fragment — a near-identical copy of the Python-2-era
# sweep script on L3, starting at the mmread import (earlier imports are out of
# view) and ending mid list literal in `for caron_p in [ 0.1, ...` — the loop
# body is missing entirely. Visible behavior: loads U/s via scipy.io.mmread,
# builds an SVDModel from them, and begins sweeping caron_p while writing to
# output_file under the header '# caron_p dimensions r_precision ndcg'.
# Consolidate with L3 once the complete script is recovered.
from scipy.io import mmread if __name__ == u'__main__': parser = argparse.ArgumentParser(description=u'TODO') parser.add_argument(u'u', type=unicode, help=u'File ') parser.add_argument(u's', type=unicode, help=u'File ') parser.add_argument(u'dictionary', type=unicode, help=u'File with saved dictionary') parser.add_argument(u'tests', type=unicode, help=u'File with saved tests') parser.add_argument(u'output_file', type=unicode, help=u'Name of the output file') parser.add_argument(u'--verbose', action=u'store_true', default=False) args = parser.parse_args() check_input_file_exists(args.s) check_input_file_exists(args.u) check_input_file_exists(args.dictionary) check_input_file_exists(args.tests) dictionary = Dictionary(filename=args.dictionary) tests = Test.load_tests(args.tests) with open(args.u) as uu, open(args.s) as ss: u = mmread(uu) s = mmread(ss) model = SVDModel(u, s, dictionary) with open(args.output_file, u'w') as file: file.write(u'# caron_p dimensions r_precision ndcg\n') for caron_p in [ 0.1, 0.15, 0.2, 0.25, 0.35, 0.5, 0.7, 1, 1.2, 1.5, 2
#!/usr/bin/env python3
"""Lemmatize a corpus and replace named entities via MorphoDiTa."""
from synonyms.lemmatizer.morphodita import lemmatize_and_replace_entities

__author__ = 'veselt12'

import argparse

from synonyms.in_out.utils import check_input_file_exists

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('corpus', type=str, help='File containing corpus')
    parser.add_argument('output_corpus', type=str, help='Name of the output file')
    args = parser.parse_args()

    check_input_file_exists(args.corpus)
    # The lemmatizer module does all the work; this script only wires up the
    # command-line interface.
    lemmatize_and_replace_entities(args.corpus, args.output_corpus)
"""Transform saved co-occurrence count matrices into PPMI matrices.

Duplicate of the batch-PPMI converter (see the shebang variant elsewhere in
this collection). For every '<name>.counts.mat' path on the command line a
matching '<name>.dict' file must exist; each PPMI result goes to '<name>.ppmi'.
"""
__author__ = 'veselt12'

import argparse

from synonyms.in_out.utils import check_input_file_exists, save_mat, load_mat
from synonyms.ppmi import create_counts, transform, CONTEXT_TYPE
from synonyms.dictionary import Dictionary
import re
import os

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=
        "Computes counts and ppmi matrix for given corpus and dictionary")
    parser.add_argument('matrices', type=str, nargs='+', help='Matrices')
    args = parser.parse_args()

    dictionaries = []
    matrices = args.matrices
    # Check all matrix/dictionary pairs before doing any work, so a missing
    # file aborts the run immediately.
    for matrix in args.matrices:
        dictionary_file = matrix.replace('.counts.mat', '.dict')
        check_input_file_exists(matrix)
        check_input_file_exists(dictionary_file)
        dictionaries.append(dictionary_file)

    for matrix, dictionary_file in zip(matrices, dictionaries):
        print('Processing:' + matrix)
        new_filename = matrix.replace('.counts.mat', '.ppmi')
        mat = load_mat(matrix)
        # FIX: the original assigned this to the name `dict`, shadowing the
        # builtin `dict` within the loop.
        word_dictionary = Dictionary(dictionary_file)
        ppmi_matrix = transform(mat, word_dictionary)
        save_mat(ppmi_matrix, new_filename)