예제 #1
0
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('matrix',
                        type=str,
                        help='File containing U and S matrix in npz format')
    parser.add_argument('dictionary',
                        type=str,
                        help='File with saved dictionary')
    parser.add_argument(
        'target_words',
        type=str,
        help='File containing targets word, each word on one line')
    parser.add_argument('output_file',
                        type=str,
                        help='Name of the output file')
    args = parser.parse_args()
    check_input_file_exists(args.matrix)
    check_input_file_exists(args.dictionary)
    check_input_file_exists(args.target_words)

    dictionary = Dictionary(filename=args.dictionary)

    words = []
    with open(args.target_words) as f:
        for line in f:
            words.append(line.strip())

    data = load_mat(args.matrix)
    model = SVDModel(data['U'],
                     data['s'],
                     dictionary,
                     caron_p=0.15,
예제 #2
0
#!/usr/bin/env python3
from synonyms.dictionary import Dictionary
from synonyms.in_out.readers import open_gz

__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists


if __name__ == '__main__':
    # Filter a plain-text corpus down to the words present in a saved dictionary.
    parser = argparse.ArgumentParser(description="TODO")
    for arg_name, arg_help in (
            ('input_file', 'Input file with corpus in plain text'),
            ('dictionary', 'Input file with dictionary'),
            ('output_file', 'Output file where filtered version of corpus will be stored')):
        parser.add_argument(arg_name, type=str, help=arg_help)
    args = parser.parse_args()
    for required_path in (args.input_file, args.dictionary):
        check_input_file_exists(required_path)
    dictionary = Dictionary(filename=args.dictionary)
    # Stream line by line so arbitrarily large corpora fit in memory.
    with open_gz(args.output_file, 'w+', encoding='utf-8') as writer, \
            open_gz(args.input_file, encoding='utf-8') as reader:
        for raw_line in reader:
            kept_words = [word for word in raw_line.lower().split() if word in dictionary]
            writer.write(' '.join(kept_words) + '\n')
예제 #3
0
파일: gen.py 프로젝트: brmson/synonyms-pccp
from synonyms.dictionary import Dictionary
from synonyms.evaluation.test import Test
from synonyms.synonyms import SVDModel
from io import open
from scipy.io import mmread

if __name__ == u'__main__':
    parser = argparse.ArgumentParser(description=u'TODO')
    parser.add_argument(u'u', type=unicode, help=u'File ')
    parser.add_argument(u's', type=unicode, help=u'File ')
    parser.add_argument(u'dictionary', type=unicode, help=u'File with saved dictionary')
    parser.add_argument(u'tests', type=unicode, help=u'File with saved tests')
    parser.add_argument(u'output_file', type=unicode, help=u'Name of the output file')
    parser.add_argument(u'--verbose', action=u'store_true', default=False)
    args = parser.parse_args()
    check_input_file_exists(args.s)
    check_input_file_exists(args.u)
    check_input_file_exists(args.dictionary)
    check_input_file_exists(args.tests)

    dictionary = Dictionary(filename=args.dictionary)
    tests = Test.load_tests(args.tests)
    
    with open(args.u) as uu, open(args.s) as ss:
        u = mmread(uu)
        s = mmread(ss)
        model = SVDModel(u, s, dictionary)
        with open(args.output_file, u'w') as file:
            file.write(u'# caron_p dimensions r_precision ndcg\n')
            for caron_p in [0.1, 0.15, 0.2, 0.25, 0.35, 0.5, 0.7, 1, 1.2, 1.5, 2]:
                model.caron_p = caron_p
예제 #4
0
     description="Converts syn_v2 corpus in xml format to plain txt format")
 parser.add_argument('input_file',
                     type=str,
                     help='Input file with syn_v2 corpus in xml format')
 parser.add_argument(
     'output_file',
     type=str,
     help='Output file where plain version of corpus will be stored')
 parser.add_argument('-k',
                     '--keep-punctuation',
                     dest='keep',
                     action='store_true',
                     default=False,
                     help='Use this switch if you want to keep punctuation')
 parser.add_argument('-t',
                     '--keep-tags',
                     dest='keep_tags',
                     action='store_true',
                     default=False,
                     help='Use this switch if you want to keep tags')
 parser.add_argument(
     '-r',
     '--raw',
     dest='raw',
     action='store_true',
     default=False,
     help='Use this switch if you want to keep output raw text')
 args = parser.parse_args()
 check_input_file_exists(args.input_file)
 syn2_to_plain(args.input_file, args.output_file, args.keep, args.keep_tags,
               args.raw)
예제 #5
0
import sys

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('matrix',
                        type=str,
                        help='File containing U and S matrix in npz format')
    parser.add_argument('dictionary',
                        type=str,
                        help='File with saved dictionary')
    parser.add_argument('output_file',
                        type=str,
                        help='Name of the output file')
    parser.add_argument('--verbose', action='store_true', default=False)
    args = parser.parse_args()
    check_input_file_exists(args.matrix)
    check_input_file_exists(args.dictionary)
    dictionary = Dictionary(filename=args.dictionary)
    # with open(args.u) as uu, open(args.s) as ss:
    # u = mmread(args.u)
    # s = mmread(args.s)

    data = load_mat(args.matrix)
    model = SVDModel(data['U'], data['s'], dictionary)
    with open(args.output_file, 'w') as file:
        model.caron_p = 0.25
        model.dimensions = 400
        count = 0
        for word in dictionary.all_words():
            print("\r %d" % count, end='')
            synonyms, score = model.get_synonyms(word,
예제 #6
0
    return synonyms_map_cutoff


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('synonyms', type=str, help='File ')
    parser.add_argument('stop_list', type=str, help='File ')
    parser.add_argument('dictionary',
                        type=str,
                        help='File with saved dictionary')
    parser.add_argument('output_file',
                        type=str,
                        help='Name of the output file')
    parser.add_argument('--verbose', action='store_true', default=False)
    args = parser.parse_args()
    check_input_file_exists(args.synonyms)
    check_input_file_exists(args.stop_list)
    check_input_file_exists(args.dictionary)
    dictionary = Dictionary(filename=args.dictionary)
    dictionary.size()
    dictionary.word2count = {
        strip_word(word): count
        for word, count in dictionary.word2count.items()
    }
    synonyms_map = {}
    with open(args.synonyms) as file:
        for line in file:
            # print(line.split(' : '))
            target, synonyms = line.strip().split(' : ', 1)
            synonyms = synonyms.strip().split(' ')
            synonyms_map[target] = synonyms
예제 #7
0
import argparse
from synonyms.in_out.utils import check_input_file_exists, load_mat
from synonyms.dictionary import Dictionary
from synonyms.evaluation.test import Test
from synonyms.synonyms import SVDModel, SVDToOneModel

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('matrix', type=str, help='File containing U and S matrix in npz format')
    parser.add_argument('dictionary', type=str, help='File with saved dictionary')
    parser.add_argument('tests', type=str, help='File with saved tests')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    parser.add_argument('model', choices=['Normal', 'ToOne'], help='Context type')
    parser.add_argument('--verbose', action='store_true', default=False)
    args = parser.parse_args()
    check_input_file_exists(args.matrix)
    check_input_file_exists(args.dictionary)
    check_input_file_exists(args.tests)

    dictionary = Dictionary(filename=args.dictionary)
    tests = Test.load_tests(args.tests)
    data = load_mat(args.matrix)
    if args.model == 'ToOne':
        model = SVDToOneModel(data['U'], data['s'], dictionary)
    else:
        model = SVDModel(data['U'], data['s'], dictionary)
    with open(args.output_file, 'w') as file:
        if args.model == 'ToOne':
            file.write('# to_one dimensions r_precision ndcg\n')
            for to_one in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
                model.to_one = to_one
예제 #8
0
#!/usr/bin/env python3
from synonyms.in_out.readers import open_gz

__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes counts and ppmi matrix for given corpus and dictionary")
    parser.add_argument('corpus', type=str, help='Corpus')
    parser.add_argument('word_count', type=int, help='Word count')
    parser.add_argument('postfix_length', type=int)
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()
    check_input_file_exists(args.corpus)

    max_count = args.word_count
    with open_gz(args.corpus) as input:
        word_count = 0
        file_count = 0
        line_1 = None
        end_of_file = False
        output = None
        for line in input:
            line = line.strip()
            if not output:
                output = open_gz(args.output_file + ('.%0' + str(args.postfix_length) + 'd') % file_count + '.gz', 'w')
            if line == '\n':
                output.write('\n')
                continue
예제 #9
0
            if synonym not in synonyms_cutoff and synonym != target:
                synonyms_cutoff.append(synonym)
        if len(synonyms_cutoff) != 0:
            synonyms_map_cutoff[target] = synonyms_cutoff
    return synonyms_map_cutoff


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('synonyms', type=str, help='File ')
    parser.add_argument('stop_list', type=str, help='File ')
    parser.add_argument('dictionary', type=str, help='File with saved dictionary')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    parser.add_argument('--verbose', action='store_true', default=False)
    args = parser.parse_args()
    check_input_file_exists(args.synonyms)
    check_input_file_exists(args.stop_list)
    check_input_file_exists(args.dictionary)
    dictionary = Dictionary(filename=args.dictionary)
    dictionary.size()
    dictionary.word2count = {strip_word(word): count for word, count in dictionary.word2count.items()}
    synonyms_map = {}
    with open(args.synonyms) as file:
        for line in file:
            # print(line.split(' : '))
            target, synonyms = line.strip().split(' : ', 1)
            synonyms = synonyms.strip().split(' ')
            synonyms_map[target] = synonyms
    stop_list = set()
    with open(args.stop_list) as file:
        for line in file:
                        choices=range(1, 4),
                        help='Context size')
    parser.add_argument(
        'context_type',
        choices=[CONTEXT_TYPE.L, CONTEXT_TYPE.R, CONTEXT_TYPE.LR],
        help='Context type')
    parser.add_argument('output_file',
                        type=str,
                        help='Name of the output file')
    parser.add_argument('corpuses',
                        type=str,
                        nargs='+',
                        help='Corpus in plain format')
    args = parser.parse_args()
    for corpus in args.corpuses:
        check_input_file_exists(corpus)
    check_input_file_exists(args.dictionary)

    dictionary = Dictionary(filename=args.dictionary)
    regex = re.compile(".*\.([0-9]*)\..*")
    for corpus in args.corpuses:
        number = regex.findall(corpus)[0]
        print('Processing:' + corpus)
        new_filename = args.output_file + '.' + number
        new_dict = dictionary.create_reseted_copy(corpus)

        counts_matrix = create_counts(corpus, new_dict, args.context_size,
                                      args.context_type)
        print(counts_matrix.shape)
        new_dict.save(new_filename + '.dict')
        save_mat(counts_matrix, new_filename + '.counts')
예제 #11
0
#!/usr/bin/env python3
__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists
from synonyms.corpus.utils import syn2_to_plain


if __name__ == '__main__':
    # CLI wrapper around syn2_to_plain: syn_v2 XML corpus -> plain text.
    parser = argparse.ArgumentParser(description="Converts syn_v2 corpus in xml format to plain txt format")
    parser.add_argument('input_file', type=str, help='Input file with syn_v2 corpus in xml format')
    parser.add_argument('output_file', type=str, help='Output file where plain version of corpus will be stored')
    # All three switches default to off; each preserves one facet of the input.
    boolean_switches = (
        ('-k', '--keep-punctuation', 'keep', 'Use this switch if you want to keep punctuation'),
        ('-t', '--keep-tags', 'keep_tags', 'Use this switch if you want to keep tags'),
        ('-r', '--raw', 'raw', 'Use this switch if you want to keep output raw text'),
    )
    for short_opt, long_opt, dest_name, help_text in boolean_switches:
        parser.add_argument(short_opt, long_opt, dest=dest_name,
                            action='store_true', default=False, help=help_text)
    args = parser.parse_args()
    check_input_file_exists(args.input_file)
    syn2_to_plain(args.input_file, args.output_file, args.keep, args.keep_tags, args.raw)
예제 #12
0
#!/usr/bin/env python3
from synonyms.part_of_speech.model import POS

__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists

if __name__ == '__main__':
    # Build a part-of-speech model from the given data file and persist it.
    arg_parser = argparse.ArgumentParser(description="Creates pos model for data")
    arg_parser.add_argument('data', type=str, help='Data')
    arg_parser.add_argument('output_file', type=str, help='Name of the output file')
    cli_args = arg_parser.parse_args()
    check_input_file_exists(cli_args.data)

    # Train on the input data, then write the model straight to disk.
    POS.create(cli_args.data).save(cli_args.output_file)
예제 #13
0
#!/usr/bin/env python3
__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists, load_mat
from synonyms.dictionary import Dictionary
from synonyms.evaluation.test import Test
from synonyms.synonyms import SVDModel

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('matrix', type=str, help='File containing U and S matrix in npz format')
    parser.add_argument('dictionary', type=str, help='File with saved dictionary')
    parser.add_argument('target_words', type=str, help='File containing targets word, each word on one line')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()
    check_input_file_exists(args.matrix)
    check_input_file_exists(args.dictionary)
    check_input_file_exists(args.target_words)

    dictionary = Dictionary(filename=args.dictionary)

    words = []
    with open(args.target_words) as f:
        for line in f:
            words.append(line.strip())

    data = load_mat(args.matrix)
    model = SVDModel(data['U'], data['s'], dictionary, caron_p=0.15, dimensions=2500)
    i = 1
    with open(args.output_file, 'w') as file:
        for target_word in words:
예제 #14
0
#!/usr/bin/env python3

__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists, save_mat
from synonyms.ppmi import create_counts, transform, CONTEXT_TYPE
from synonyms.dictionary import Dictionary


if __name__ == '__main__':
    # Compute a co-occurrence counts matrix for a corpus, then derive its
    # PPMI transform; both are saved under the given output prefix.
    parser = argparse.ArgumentParser(description="Computes counts and ppmi matrix for given corpus and dictionary")
    parser.add_argument('corpus', type=str, help='Corpus in plain format')
    parser.add_argument('dictionary', type=str, help='File with saved dictionary')
    parser.add_argument('context_size', type=int, choices=range(1, 4), help='Context size')
    parser.add_argument('context_type', choices=[CONTEXT_TYPE.L, CONTEXT_TYPE.R, CONTEXT_TYPE.LR], help='Context type')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()
    for input_path in (args.corpus, args.dictionary):
        check_input_file_exists(input_path)

    dictionary = Dictionary(filename=args.dictionary)
    # Raw counts first; the PPMI matrix is computed from them afterwards.
    counts = create_counts(args.corpus, dictionary, args.context_size, args.context_type)
    save_mat(counts, args.output_file + '.counts')
    save_mat(transform(counts, dictionary), args.output_file + '.ppmi')
예제 #15
0
#!/usr/bin/env python3
__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists
from synonyms.evaluation.test import Test

if __name__ == '__main__':
    # Intersect two saved test sets and write the shared tests out.
    parser = argparse.ArgumentParser(description='TODO')
    parser.add_argument('tests1', type=str, help='File with saved tests1')
    parser.add_argument('tests2', type=str, help='File with saved tests2')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()
    for test_path in (args.tests1, args.tests2):
        check_input_file_exists(test_path)

    first, second = (Test.load_tests(path) for path in (args.tests1, args.tests2))
    shared = Test.intersect(first, second)
    Test.save_tests(shared, args.output_file)
예제 #16
0
    parser.add_argument('matrix_U',
                        type=str,
                        help='File containing U matrix in npy format')
    parser.add_argument('matrix_s',
                        type=str,
                        help='File containing S matrix in npy format')
    parser.add_argument('dictionary',
                        type=str,
                        help='File with saved dictionary')
    parser.add_argument('tests', type=str, help='File with saved tests')
    parser.add_argument('output_file',
                        type=str,
                        help='Name of the output file')
    parser.add_argument('--verbose', action='store_true', default=False)
    args = parser.parse_args()
    check_input_file_exists(args.matrix_U)
    check_input_file_exists(args.matrix_s)
    check_input_file_exists(args.dictionary)
    check_input_file_exists(args.tests)

    dictionary = Dictionary(filename=args.dictionary)
    tests = Test.load_tests(args.tests)
    U = np.load(args.matrix_U, mmap_mode='r')
    s = np.load(args.matrix_s, mmap_mode='r')
    model = SVDModelMemMap(U, s, dictionary)
    print('model loaded')
    #with open(args.output_file, 'w') as file:
    # file.write('# caron_p dimensions r_precision ndcg\n')
    # for caron_p in [0.15, 0.25, 0.35, 0.5, 0.7, 1, 1.2, 1.5, 2]:
    #     model.caron_p = caron_p
    #     for dimensions in [2000, 1000, 500, 200, 100]: #[10000, 8000, 6000, 4000, 2500, 2000, 1000, 500, 200, 100]:
예제 #17
0
#!/usr/bin/env python3

__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists, save_mat, load_mat
from synonyms.ppmi import create_counts, transform, CONTEXT_TYPE
from synonyms.dictionary import Dictionary
import re
import os

if __name__ == '__main__':
    # For each '<name>.counts.mat' matrix given on the command line, load its
    # sibling '<name>.dict' dictionary, apply the PPMI transform to the counts
    # and save the result as '<name>.ppmi'.
    parser = argparse.ArgumentParser(description="Computes counts and ppmi matrix for given corpus and dictionary")
    parser.add_argument('matrices', type=str, nargs='+', help='Matrices')
    args = parser.parse_args()
    dictionaries = []
    for matrix in args.matrices:
        # Dictionary path is derived from the matrix path by naming convention.
        dictionary = matrix.replace('.counts.mat', '.dict')
        check_input_file_exists(matrix)
        check_input_file_exists(dictionary)
        dictionaries.append(dictionary)

    for matrix, dictionary in zip(args.matrices, dictionaries):
        print('Processing:' + matrix)
        new_filename = matrix.replace('.counts.mat', '.ppmi')
        mat = load_mat(matrix)
        # Renamed from 'dict' so the builtin is no longer shadowed.
        word_dict = Dictionary(dictionary)
        ppmi_matrix = transform(mat, word_dict)
        save_mat(ppmi_matrix, new_filename)
예제 #18
0
    return AA


def filter(rating):
    """Strip the transient 'count' field from every per-word rating entry, in place.

    NOTE(review): intentionally shadows the builtin ``filter``; the name is
    kept unchanged for caller compatibility.
    """
    for entry in rating.values():
        for word_rating in entry['rating'].values():
            word_rating.pop('count')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Converts web log to tests")
    parser.add_argument('web_log', type=str, help='Web log in json')
    parser.add_argument('dictionary', type=str, help='Dictionary file')
    parser.add_argument('output_file', type=str, help='Name of the output file')
    args = parser.parse_args()
    check_input_file_exists(args.web_log)
    check_input_file_exists(args.dictionary)
    dictionary = Dictionary(args.dictionary)
    rating = {}
    antonyms = {}
    with open(args.web_log, encoding='utf-8') as file:
        web_log = json.load(file)
        for entry in web_log:
            word_d = entry['word']
            words = entry['priority']
            type = [x.strip() for x in entry['type'].split(':')]
            words = [x.strip() for x in words.split(':') if x not in [None, " ", ""]]
            entry = {}
            if type[0] in ['rating', 'antonyms']:
                words_sequence = {word: {'rel': rel, 'count': 1} for rel, word in zip(range(len(words), 0, -1), words)}
                entry = {'word': word_d, 'rating': words_sequence}
예제 #19
0
from scipy.io import mmread

if __name__ == u'__main__':
    parser = argparse.ArgumentParser(description=u'TODO')
    parser.add_argument(u'u', type=unicode, help=u'File ')
    parser.add_argument(u's', type=unicode, help=u'File ')
    parser.add_argument(u'dictionary',
                        type=unicode,
                        help=u'File with saved dictionary')
    parser.add_argument(u'tests', type=unicode, help=u'File with saved tests')
    parser.add_argument(u'output_file',
                        type=unicode,
                        help=u'Name of the output file')
    parser.add_argument(u'--verbose', action=u'store_true', default=False)
    args = parser.parse_args()
    check_input_file_exists(args.s)
    check_input_file_exists(args.u)
    check_input_file_exists(args.dictionary)
    check_input_file_exists(args.tests)

    dictionary = Dictionary(filename=args.dictionary)
    tests = Test.load_tests(args.tests)

    with open(args.u) as uu, open(args.s) as ss:
        u = mmread(uu)
        s = mmread(ss)
        model = SVDModel(u, s, dictionary)
        with open(args.output_file, u'w') as file:
            file.write(u'# caron_p dimensions r_precision ndcg\n')
            for caron_p in [
                    0.1, 0.15, 0.2, 0.25, 0.35, 0.5, 0.7, 1, 1.2, 1.5, 2
예제 #20
0
#!/usr/bin/env python3
from synonyms.lemmatizer.morphodita import lemmatize_and_replace_entities

__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists

if __name__ == '__main__':
    # Lemmatize a corpus and replace named entities, writing a new corpus file.
    cli = argparse.ArgumentParser(description='TODO')
    cli.add_argument('corpus', type=str, help='File containing corpus')
    cli.add_argument('output_corpus', type=str, help='Name of the output file')
    options = cli.parse_args()
    check_input_file_exists(options.corpus)
    lemmatize_and_replace_entities(options.corpus, options.output_corpus)
예제 #21
0
__author__ = 'veselt12'
import argparse
from synonyms.in_out.utils import check_input_file_exists, save_mat, load_mat
from synonyms.ppmi import create_counts, transform, CONTEXT_TYPE
from synonyms.dictionary import Dictionary
import re
import os

if __name__ == '__main__':
    # For each '<name>.counts.mat' matrix given on the command line, load its
    # sibling '<name>.dict' dictionary, apply the PPMI transform to the counts
    # and save the result as '<name>.ppmi'.
    parser = argparse.ArgumentParser(
        description=
        "Computes counts and ppmi matrix for given corpus and dictionary")
    parser.add_argument('matrices', type=str, nargs='+', help='Matrices')
    args = parser.parse_args()
    dictionaries = []
    for matrix in args.matrices:
        # Dictionary path is derived from the matrix path by naming convention.
        dictionary = matrix.replace('.counts.mat', '.dict')
        check_input_file_exists(matrix)
        check_input_file_exists(dictionary)
        dictionaries.append(dictionary)

    for matrix, dictionary in zip(args.matrices, dictionaries):
        print('Processing:' + matrix)
        new_filename = matrix.replace('.counts.mat', '.ppmi')
        mat = load_mat(matrix)
        # Renamed from 'dict' so the builtin is no longer shadowed.
        word_dict = Dictionary(dictionary)
        ppmi_matrix = transform(mat, word_dict)
        save_mat(ppmi_matrix, new_filename)