Пример #1
0
    def __init__(self, methodName='runTest'):
        """
        Prepare the dictionaries shared by the tests, then initialise the
        base test case.

        The ``dict1.txt`` file that sits next to this module is handed to
        ``PyDicCreator.generate`` together with a freshly created temporary
        directory. Two ``PyDic`` objects are then created: ``self.dict1``
        from that temporary directory and ``self.dict1m`` straight from the
        ``'dict1.txt'`` path.

        :param methodName: name of the test method, forwarded to the parent
            constructor
        """
        module_path = os.path.realpath(__file__)
        self.current_dir = os.path.dirname(module_path)
        self.temp_dict1_path = tempfile.mkdtemp()

        source_path = os.path.join(self.current_dir, 'dict1.txt')
        self.dict1_file = open(source_path)

        creator = PyDicCreator()
        creator.generate(
            self.dict1_file, self.temp_dict1_path, 'dict1', verbose=False)

        self.dict1 = PyDic(self.temp_dict1_path)
        self.dict1m = PyDic('dict1.txt')

        super(TestPyDicBase, self).__init__(methodName)
Пример #2
0
    def run(self):
        """
        Runs as a command line tool.

        Parses the command line, loads the dictionary given with
        ``-f/--dictionary-file`` and builds its index with
        ``self.load_index``. The input (the optional ``FILE`` argument, or
        stdin when it is omitted) is then read line by line and decoded as
        UTF-8:

        * empty lines and lines starting with ``#`` are echoed (stripped);
        * every other line is passed to ``self.process`` and the returned
          items are joined with the ``-d/--delimiter`` string.

        Results are written, UTF-8 encoded, to the ``-t/--output`` file, or
        to stdout when it is omitted. Files opened by this method are closed
        before it returns, including when an error occurs; stdin and stdout
        are left open.
        """
        parser = argparse.ArgumentParser(
            description='Makes inflection of a flat text file with words.')

        parser.add_argument('-d', '--delimiter', default=u',')
        parser.add_argument('-f',
                            '--dictionary-file',
                            help="path to file with text dictionary",
                            required=True)
        parser.add_argument('-t', '--output', help="output file name")

        parser.add_argument('-b',
                            '--base-forms',
                            action="store_true",
                            help="only base forms")
        parser.add_argument('-v',
                            '--verbose',
                            action="store_true",
                            help="debug verbose mode")

        parser.add_argument('input',
                            metavar='FILE',
                            help="filename to process",
                            nargs='?')
        args = parser.parse_args()

        # Only streams opened here are tracked, so the finally clause never
        # closes the interpreter's own stdin/stdout.
        opened_streams = []
        try:
            input_stream = sys.stdin
            if args.input:
                input_stream = open(args.input)
                opened_streams.append(input_stream)

            output_stream = sys.stdout
            if args.output:
                output_stream = open(args.output, 'w')
                opened_streams.append(output_stream)

            self.dictionary = PyDic(args.dictionary_file)
            self.index = self.load_index(self.dictionary)

            # NOTE: Python 2 code - lines are byte strings that are decoded
            # on input and re-encoded on output, and ``print >> stream`` is
            # the Python 2 print-to-file statement.
            for line in input_stream:
                line = line.decode('utf-8').strip()
                if line and not line.startswith('#'):
                    forms = self.process(self.dictionary,
                                         self.index,
                                         line,
                                         debug=args.verbose)
                    print >> output_stream, args.delimiter.join(
                        forms).encode('utf-8')
                else:
                    # Blank lines and '#' comment lines pass through as-is.
                    print >> output_stream, line.encode('utf-8')
        finally:
            for stream in opened_streams:
                stream.close()
Пример #3
0
 def setUp(self):
     """Create the stemmer, open ``dict1.txt`` and build the index from it."""
     self.stemmer = PydicStemmer()
     self.dictionary = source = PyDic('dict1.txt')
     # The index is produced by the stemmer from the dictionary just opened.
     self.index = self.stemmer.build_index(source)
Пример #4
0
 def load_dictionary(self, path):
     """
     Open the dictionary found at ``path`` and register it.

     The new ``PyDic`` object is stored in ``self.dictionaries`` under the
     key given by its own ``name`` attribute; an entry already stored under
     that key is replaced.
     """
     loaded = PyDic(path)
     self.dictionaries[loaded.name] = loaded
### LEMMATISING ###

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from pattern3.fr import parse as frparse
from pattern3.nl import parse as nlparse
from pattern3.de import parse as deparse
from pattern3.it import parse as itparse
from pydic import PyDic
from pymystem3 import Mystem

if __name__ == "__main__":
    # Initialise the Polish, Russian and English lemmatisers, logging progress.
    # Each status message ends with '\r' instead of a newline, so the next
    # message is written over it on the same console line; the messages are
    # space-padded to a common width so nothing from a previous message is
    # left visible.
    #
    # NOTE(review): pl_dict, ru_lemmatiser and en_lemmatiser are bound only
    # when this file is executed as a script, yet pl_lemmatise() below reads
    # pl_dict as a global - importing this module and calling it would raise
    # NameError unless the name is bound elsewhere. Confirm this is intended.
    print("Initialising lemmatiser for Polish...  ", end='\r')
    # Relative path - presumably resolved against the current working
    # directory; TODO confirm.
    pl_dict = PyDic('pydic/odm.txt')
    print("Initialising lemmatiser for Russian... ", end='\r')
    ru_lemmatiser = Mystem()
    print("Initialising lemmatiser for English... ", end='\r')
    en_lemmatiser = WordNetLemmatizer()
    # Final message ends with a normal newline, leaving the cursor on a fresh line.
    print("Done initialising lemmatisers.         ")


def pl_lemmatise(word):
    """
    Lemmatiser for Polish
    :param word: string
    :return: string
    """
    word_forms = pl_dict.word_base(word)
    if word_forms: