Exemplo n.º 1
0
    def dumpDictToPickle(self):
        """Convert the ``mueller4.txt`` dictionary file into a pickled ``redict``.

        Reads ``PATH_TO_RES + DICT + 'mueller4.txt'`` decoded as windows-1251.
        Lines starting with ``'_'`` are ignored. Every other line is split on
        its first double space into ``word`` and its translation; lines
        without that separator (e.g. blank lines) are skipped instead of
        raising ``ValueError``. The first ``[...]`` span found in the
        translation is removed (every occurrence of that exact text).

        The mapping ``word -> {'rus': translation, 'eng': word}`` is pickled
        to ``PATH_TO_RES + RESULTING_DICT``.
        """
        source_path = PATH_TO_RES + DICT + 'mueller4.txt'

        results_dict = redict({})
        # ``with`` guarantees the file is closed even if parsing fails.
        with codecs.open(source_path, 'r', 'windows-1251') as source:
            for line in source:
                if line.startswith('_'):
                    continue

                # partition() splits on the first separator only, so a
                # translation that itself contains a double space is kept
                # whole instead of breaking the two-value unpacking.
                word, separator, trans = line.strip().partition('  ')
                if not separator:
                    continue

                # Only cut when a '[' is followed by a ']'. Slicing with the
                # raw find() results misbehaves when '[' is missing (-1):
                # the slice can then select an unrelated trailing character.
                start = trans.find('[')
                end = trans.find(']', start + 1) if start != -1 else -1
                if end != -1:
                    trans = trans.replace(trans[start:end + 1], '')

                results_dict[word] = {'rus': trans, 'eng': word}

        # Binary mode keeps the pickle stream free of newline translation.
        with open(PATH_TO_RES + RESULTING_DICT, 'wb') as dump:
            pickle.dump(results_dict, dump)
Exemplo n.º 2
0
    def parseTatoebaExamples(self):
        """Build and pickle a ``redict`` of English/Russian sentence pairs.

        Two tab-separated files are read:

        * ``PATH_TO_RES + LINKS``: column 0 is a sentence id, column 1 the id
          of a sentence linked to it.
        * ``PATH_TO_RES + EXAMPLES``: column 0 is a sentence id, column 1 a
          language code, column 2 the sentence text.

        Only ``'eng'`` and ``'rus'`` sentences that have at least one link
        are kept. For every English sentence, each linked Russian sentence
        produces an entry keyed by the English text with the value
        ``{'eng': english_text, 'rus': russian_text}``; a later pair for the
        same English text overwrites an earlier one. The result is pickled to
        ``PATH_TO_RES + RESULTING_DICTIONARY``.

        Rows with too few columns are skipped.
        """
        # sentence id -> list of ids of the sentences linked to it
        links = {}
        with open(PATH_TO_RES + LINKS) as links_file:
            for link in csv.reader(links_file, delimiter='\t'):
                if len(link) < 2:
                    continue  # blank or truncated row
                links.setdefault(link[0], []).append(link[1])

        examples = {}
        with open(PATH_TO_RES + EXAMPLES) as examples_file:
            for example in csv.reader(examples_file, delimiter='\t'):
                if len(example) < 3 or example[1] not in ('eng', 'rus'):
                    continue
                translation_ids = links.get(example[0])
                if translation_ids is None:
                    continue  # sentence has no linked translations
                examples[example[0]] = {
                    'lang': example[1],
                    'sentence': example[2],
                    'translation': translation_ids
                }

        examples_dictionary = redict({})
        for sentence_id in examples:
            entry = examples[sentence_id]
            if entry['lang'] != 'eng':
                continue
            for translation_id in entry['translation']:
                translated = examples.get(translation_id)
                if translated is None or translated['lang'] != 'rus':
                    continue
                try:
                    examples_dictionary[entry['sentence']] = {
                        'eng': entry['sentence'],
                        'rus': translated['sentence']
                    }
                except Exception:
                    # NOTE(review): ``redict`` is defined outside this block
                    # and may reject some keys; the original code silently
                    # skipped any failure here, so stay best-effort. Unlike a
                    # bare ``except`` this lets KeyboardInterrupt/SystemExit
                    # through.
                    continue

        # Drop the large intermediates before pickling to lower peak memory.
        del links
        del examples

        # Binary mode keeps the pickle stream free of newline translation.
        with open(PATH_TO_RES + RESULTING_DICTIONARY, 'wb') as dump:
            pickle.dump(examples_dictionary, dump)

        print('well, well, well')
Exemplo n.º 3
0
 def parseTatoebaExamples(self):
     """Build and pickle a ``redict`` of English/Russian sentence pairs.

     Two tab-separated files are read:

     * ``PATH_TO_RES + LINKS``: column 0 is a sentence id, column 1 the id
       of a sentence linked to it.
     * ``PATH_TO_RES + EXAMPLES``: column 0 is a sentence id, column 1 a
       language code, column 2 the sentence text.

     Only ``'eng'`` and ``'rus'`` sentences that have at least one link are
     kept. For every English sentence, each linked Russian sentence produces
     an entry keyed by the English text with the value
     ``{'eng': english_text, 'rus': russian_text}``; a later pair for the
     same English text overwrites an earlier one. The result is pickled to
     ``PATH_TO_RES + RESULTING_DICTIONARY``.

     Rows with too few columns are skipped.
     """
     # sentence id -> list of ids of the sentences linked to it
     links = {}
     with open(PATH_TO_RES + LINKS) as links_file:
         for link in csv.reader(links_file, delimiter='\t'):
             if len(link) < 2:
                 continue  # blank or truncated row
             links.setdefault(link[0], []).append(link[1])

     examples = {}
     with open(PATH_TO_RES + EXAMPLES) as examples_file:
         for example in csv.reader(examples_file, delimiter='\t'):
             if len(example) < 3 or example[1] not in ('eng', 'rus'):
                 continue
             translation_ids = links.get(example[0])
             if translation_ids is None:
                 continue  # sentence has no linked translations
             examples[example[0]] = {'lang' : example[1], 'sentence' : example[2], 'translation' : translation_ids }

     examples_dictionary = redict({})
     for sentence_id in examples:
         entry = examples[sentence_id]
         if entry['lang'] != 'eng':
             continue
         for translation_id in entry['translation']:
             translated = examples.get(translation_id)
             if translated is None or translated['lang'] != 'rus':
                 continue
             try:
                 examples_dictionary[entry['sentence']] = { 'eng' : entry['sentence'], 'rus' : translated['sentence'] }
             except Exception:
                 # NOTE(review): ``redict`` is defined outside this block and
                 # may reject some keys; the original code silently skipped
                 # any failure here, so stay best-effort. Unlike a bare
                 # ``except`` this lets KeyboardInterrupt/SystemExit through.
                 continue

     # Drop the large intermediates before pickling to lower peak memory.
     del links
     del examples

     # Binary mode keeps the pickle stream free of newline translation.
     with open(PATH_TO_RES + RESULTING_DICTIONARY, 'wb') as dump:
         pickle.dump(examples_dictionary, dump)

     print('well, well, well')
Exemplo n.º 4
0
    def dumpDictToPickle(self):
        """Convert the ``mueller4.txt`` dictionary file into a pickled ``redict``.

        Reads ``PATH_TO_RES + DICT + 'mueller4.txt'`` decoded as windows-1251.
        Lines starting with ``'_'`` are ignored. Every other line is split on
        its first double space into ``word`` and its translation; lines
        without that separator (e.g. blank lines) are skipped instead of
        raising ``ValueError``. The first ``[...]`` span found in the
        translation is removed (every occurrence of that exact text).

        The mapping ``word -> {'rus': translation, 'eng': word}`` is pickled
        to ``PATH_TO_RES + RESULTING_DICT``.
        """
        source_path = PATH_TO_RES + DICT + 'mueller4.txt'

        results_dict = redict({})
        # ``with`` guarantees the file is closed even if parsing fails.
        with codecs.open(source_path, 'r', 'windows-1251') as source:
            for line in source:
                if line.startswith('_'):
                    continue

                # partition() splits on the first separator only, so a
                # translation that itself contains a double space is kept
                # whole instead of breaking the two-value unpacking.
                word, separator, trans = line.strip().partition('  ')
                if not separator:
                    continue

                # Only cut when a '[' is followed by a ']'. Slicing with the
                # raw find() results misbehaves when '[' is missing (-1):
                # the slice can then select an unrelated trailing character.
                start = trans.find('[')
                end = trans.find(']', start + 1) if start != -1 else -1
                if end != -1:
                    trans = trans.replace(trans[start:end + 1], '')

                results_dict[word] = { 'rus' : trans, 'eng' : word }

        # Binary mode keeps the pickle stream free of newline translation.
        with open(PATH_TO_RES + RESULTING_DICT, 'wb') as dump:
            pickle.dump(results_dict, dump)
Exemplo n.º 5
0
 def __init__(self):
     """Create ``self.dictionary`` by wrapping a new empty dict in ``redict``."""
     initial_entries = {}
     self.dictionary = redict(initial_entries)
Exemplo n.º 6
0
 def __init__(self):
     """Create ``self.dictionary`` by wrapping a new empty dict in ``redict``."""
     initial_entries = {}
     self.dictionary = redict(initial_entries)