Example #1
    def __init__(self):
        self.default_conjugator = mlconjug.Conjugator(language='en')
        self.lemmatizer = WordNetLemmatizer()

        # self.lemma_exceptions = {'am': 'be', 'are': 'be', 'is': 'be'}
        self.lemma_exceptions = {}
        self.prons_to_flip = {
            'your': 'my',
            'my': 'your',
            'yours': 'mine',
            'mine': 'yours',
            'there': 'here',
            'here': 'there'
        }
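A minimal sketch of how such a flip map might be applied; the flip_pronouns helper below is an illustration, not part of the original class:

def flip_pronouns(tokens, prons_to_flip):
    """Swap perspective words token by token (hypothetical helper)."""
    return [prons_to_flip.get(t.lower(), t) for t in tokens]

# flip_pronouns("is there my book".split(), prons_to_flip)
# -> ['is', 'here', 'your', 'book']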
Example #2
def test_get_verb_conjug(mocker):
    verb = "be"
    # Conjugate once for real so the mocked conjugate() has a realistic return value.
    conjug = mlconjug.Conjugator(language="en")
    conjug_verb = conjug.conjugate(verb)
    mocker.patch.object(mlconjug.Conjugator,
                        "conjugate",
                        return_value=conjug_verb)
    mocker.patch.object(mlconjug.PyVerbiste.VerbEn,
                        'iterate',
                        return_value=[(verb, verb)])
    list_conjugs = action_extractor.get_verb_conjug([verb])

    assert mlconjug.Conjugator.conjugate.call_count == 1
    assert mlconjug.PyVerbiste.VerbEn.iterate.call_count == 1
    assert list_conjugs == [verb]
Example #3
import itertools

import mlconjug


def get_verb_conjug(verb_list):
    """
    Conjugates the list of verbs it receives as input.
    :param verb_list: list
        list of verbs that we want to conjugate
    :return: list that contains all conjugations of the input verbs
    """
    # Build the conjugator once rather than once per verb.
    default_conjugator = mlconjug.Conjugator(language='en')
    verbs = []
    for verb in verb_list:
        test_verb = default_conjugator.conjugate(verb)
        all_conjugated_forms = test_verb.iterate()
        # Keep only the surface form (the last tuple element), deduplicated.
        verbs.append(list({verb_tuple[-1] for verb_tuple in all_conjugated_forms}))
    return list(itertools.chain.from_iterable(verbs))
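A quick usage sketch; the order of the returned forms is not guaranteed because of the set deduplication:

conjugations = get_verb_conjug(["sit"])
# a flat list of surface forms, e.g. ['sit', 'sits', 'sitting', 'sat', ...]
print(conjugations)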
Example #4
    def __init__(self):
        # Module-level globals, initialized once here.
        global conjugator
        global tenseKeyDict
        global subjectKeysDict
        conjugator = mlconjug.Conjugator(language='es')
        tenseKeyDict = {
            'preterite':
            ['Indicativo', 'Indicativo pretérito perfecto simple'],
            'present': ['Indicativo', 'Indicativo presente'],
            'imperfect': ['Indicativo', 'Indicativo pretérito imperfecto'],
            'subjunctive': ['Subjuntivo', 'Subjuntivo presente']
        }
        subjectKeysDict = {
            'yo': '1s',
            'tu': '2s',
            'el/ella': '3s',
            'nosotros': '1p',
            'ellos/ellas': '3p'
        }
Example #5
    def try_fix_form(word_pos, syn_pos):
        word = word_pos[0]
        syn = syn_pos[0]
        pos_tag_word = word_pos[1]
        pos_tag_syn = syn_pos[1]

        if pos_tag_syn != pos_tag_word:
            # The tags differ only by a trailing 'S':
            # JJ -> JJS is a superlative, NN -> NNS is a plural.
            if pos_tag_word == pos_tag_syn + 'S':
                if pos_tag_syn.startswith('J'):
                    return en.superlative(syn)
                elif pos_tag_syn.startswith('N'):
                    return en.pluralize(syn)
            return None if pos_tag_syn[:2] != pos_tag_word[:2] else syn
        else:
            if not pos_tag_syn.startswith('V'):
                return syn
            # The tags match and the word is a verb: conjugate the synonym
            # into the same form as the original word.
            default_conjugator = mlconjug.Conjugator(language='en')

            if pos_tag_word == 'VB':
                return default_conjugator.conjugate(
                    syn).conjug_info['indicative']['indicative present']['1s']
            elif pos_tag_word == 'VBG':
                return default_conjugator.conjugate(syn).conjug_info[
                    'indicative']['indicative present continuous']['1s 1s']
            elif pos_tag_word == 'VBN':
                return default_conjugator.conjugate(syn).conjug_info[
                    'indicative']['indicative present perfect']['1p']
            elif pos_tag_word == 'VBP':
                return default_conjugator.conjugate(
                    syn).conjug_info['indicative']['indicative present']['1s']
            elif pos_tag_word == 'VBZ':
                return default_conjugator.conjugate(
                    syn).conjug_info['indicative']['indicative present']['3s']
            elif pos_tag_word == 'VBD':
                return default_conjugator.conjugate(syn).conjug_info[
                    'indicative']['indicative past tense']['1s']
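Illustrative calls; the exact outputs depend on pattern.en (the assumed source of en) and the mlconjug model:

# try_fix_form(('cats', 'NNS'), ('dog', 'NN'))      -> 'dogs'     (pluralized)
# try_fix_form(('biggest', 'JJS'), ('large', 'JJ')) -> 'largest'  (superlative)
# try_fix_form(('runs', 'VBZ'), ('sprint', 'VBZ'))  -> 'sprints'  (matching 3s present)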
Example #6
import mlconjug

# Set up the conjugator for the target language:
conjugator = mlconjug.Conjugator(language="pt")
# Get all the possible conjugations, organized in a nested dictionary
# laid out as modo verbal > tempo verbal > pessoa verbal
verbo_ver_information = conjugator.conjugate("ver").conjug_info

# Print every conjugated form
for modo_verbal, tempos_verbais in verbo_ver_information.items():
    print(str(modo_verbal).upper(), ':', sep='')

    for tempo_verbal, pessoas_verbais in tempos_verbais.items():
        print(tempo_verbal, ':', sep='')

        for pessoa_verbal in pessoas_verbais.items():
            print(pessoa_verbal)
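To get a flat list instead of the nested dictionary, the Verb.iterate() method shown in Example #9 also works here; the conjugated form is always the last element of each tuple:

todas_as_formas = conjugator.conjugate("ver").iterate()
formas = [t[-1] for t in todas_as_formas]
print(formas[:10])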
Example #7
        new_alias = "%s>%s" % (p.plural(w1), p.plural(w2))
        myvocabulary["NNS"].append(new_alias)
    else:
        myvocabulary["NNS"].append(p.plural(word))

### conjugation

## silencing some scipy warnings
import warnings

warnings.filterwarnings("ignore")

### (VBG)
import mlconjug

conj = mlconjug.Conjugator(language='en')


def get_vbg(baseform):
    return conj.conjugate(baseform).conjug_info['indicative'][
        'indicative present continuous']['1s 1s']


for word in myvocabulary["VB"]:
    if ">" in word:
        w1, w2 = word.split(">")
        new_alias = "%s>%s" % (get_vbg(w1), get_vbg(w2))
        myvocabulary["VBG"].append(new_alias)
    else:
        myvocabulary["VBG"].append(get_vbg(word))
Example #8
def run(inputFilename, input_main_dir_path, outputPath,
        search_by_dictionary_var, search_by_keyword_var, keyword,
        lemmatization):

    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        "Started running the file search script at", True)

    if input_main_dir_path == '' and inputFilename != '':
        inputDir = os.path.dirname(inputFilename)
        files = [inputFilename]
    elif input_main_dir_path != '':
        inputDir = input_main_dir_path
        files = IO_files_util.getFileList(inputFilename, inputDir, 'txt')
    else:
        # Neither a file nor a directory was provided.
        files = []
    if len(files) == 0:
        return

    #print("files",files)
    for file in files:
        #print("file",file)
        if search_by_dictionary_var:
            break
        if search_by_keyword_var:
            output_dir_path = inputDir + os.sep + "search_result_csv"
            if not os.path.exists(output_dir_path):
                os.mkdir(output_dir_path)
            if file[-4:] != '.txt':
                continue

        kwtokens = word_tokenize(keyword)
        # list of lists: the conjugated forms of each token in the keyword phrase
        kwlist = []
        default_conjugator = mlconjug.Conjugator(language='en')

        for token in kwtokens:
            conjus = default_conjugator.conjugate(token)
            formlist = conjus.iterate()
            forms = []
            for form in formlist:
                forms.append(form[-1])  # the conjugated form is the last tuple element
            kwlist.append(list(dict.fromkeys(forms)))  # remove duplicates, keep order
        csvtitle = outputPath + '/' + os.path.split(
            os.path.split(outputPath)[0])[1] + "_" + keyword + '.csv'
        if lemmatization:
            csvtitle = outputPath + '/' + os.path.split(
                os.path.split(outputPath)[0])[1] + "_" + keyword + '_lemma.csv'
        csvExist = os.path.exists(csvtitle)
        with open(csvtitle, "a", newline="", encoding='utf-8',
                  errors='ignore') as csvFile:
            writer = csv.writer(csvFile)

            if not csvExist:
                writer.writerow([
                    "Document ID", "Document", "Sentence ID", "SENTENCE",
                    "SEARCH_WORD", "LEMMATIZED",
                    "Sentence ID of FIRST_OCCURRENCE", "RELATIVE_POSITION",
                    "FREQUENCY of OCCURRENCE"
                ])
                docIndex = 1
            else:
                df = pd.read_csv(csvtitle, encoding="ISO-8859-1")
                if len(df) == 0:
                    docIndex = 1
                else:
                    docIndex = df.iloc[-1][0] + 1
            first_occurrence_index = 0
            frequency = 0
            contents = []
            head, docname = os.path.split(inputFilename)
            title = docname.partition('.')[0]
            with open(file, "r", encoding='utf-8', errors='ignore') as f:
                docText = f.read()
            sentences_ = sent_tokenize(docText)  # the list of sentences in the corpus
            sentence_index = 1

            for sent in sentences_:
                tokens_ = word_tokenize(sent)
                kwindex = 0
                kw = False
                form = ''
                for token in tokens_:
                    t = token.lower()
                    if kwindex == len(kwlist):
                        break
                    if t == kwtokens[kwindex] or (
                            lemmatization and
                        (t in kwlist[kwindex]
                         or kwtokens[kwindex] == wordnet.morphy(t))
                    ):  # two ways to recognize the keyword:
                        # (1) the form in the corpus matches an item in the conjugation list (verbs)
                        # (2) the lemmatized form in the corpus matches the keyword token (nouns or adjectives)
                        kw = True
                        kwindex += 1
                        form += t + " "
                    else:
                        kw = False
                        kwindex = 0
                        form = ''
                if len(form) > 0:
                    form = form[:-1]
                if kw:  # the keyword was detected in this sentence
                    frequency += 1
                    if frequency == 1:
                        first_occurrence_index = sentence_index
                    if lemmatization:
                        writer.writerow([
                            docIndex, file, sentence_index, sent, keyword,
                            form, first_occurrence_index,
                            sentence_index / len(sentences_), frequency
                        ])
                    else:
                        writer.writerow([
                            docIndex, file, sentence_index, sent, keyword, '',
                            first_occurrence_index,
                            sentence_index / len(sentences_), frequency
                        ])
                    # writer.writerow([docIndex, inputFilename, presubfile, keyword, sent, first_occurrence_index, sentence_index / len(sentences_), frequency])
                else:
                    writer.writerow([
                        docIndex, file, sentence_index, sent, '', '', '', '',
                        ''
                    ])
                sentence_index += 1

    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, "Analysis end",
        "Finished running the file search script at", True)
Example #9
import mlconjug

# # To use mlconjug with the default parameters and a pre-trained conjugation model.
# default_conjugator = mlconjug.Conjugator(language='fr')

# # Verify that the model works
# test1 = default_conjugator.conjugate("manger").conjug_info['Indicatif']['Passé Simple']['1p']
# test2 = default_conjugator.conjugate("partir").conjug_info['Indicatif']['Passé Simple']['1p']
# test3 = default_conjugator.conjugate("facebooker").conjug_info['Indicatif']['Passé Simple']['1p']
# test4 = default_conjugator.conjugate("astigratir").conjug_info['Indicatif']['Passé Simple']['1p']
# test5 = default_conjugator.conjugate("mythoner").conjug_info['Indicatif']['Passé Simple']['1p']
# print(test1)
# print(test2)
# print(test3)
# print(test4)
# print(test5)

# You can now iterate over all conjugated forms of a verb by using the newly added Verb.iterate() method.
default_conjugator = mlconjug.Conjugator(language='en')
test_verb = default_conjugator.conjugate("sit")
all_conjugated_forms = test_verb.iterate()
print(all_conjugated_forms)
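The printed list is a sequence of tuples carrying the mood, the tense, the person where one applies, and the conjugated form last, which is why the other examples read verb_tuple[-1]. A sketch for collecting just the unique surface forms:

forms = sorted({t[-1] for t in all_conjugated_forms})
print(forms)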
Example #10
class Conjug:
    # class-level attributes shared by all instances
    __conjugator = mlconjug.Conjugator(language='es')
    __tense = "Indicativo pretérito perfecto simple"
    __record = {}  # cache: verb -> (person, conjugated form)

    # an enum wasn't working as expected, so plain string constants are used
    first_singular = "1s"
    second_singular = "2s"
    third_singular = "3s"
    first_plural = "1p"
    second_plural = "2p"
    third_plural = "3p"

    abbrev = {
        first_singular: "me ",
        second_singular: "te ",
        third_singular: "se ",
        first_plural: "nos ",
        second_plural: "os ",
        third_plural: "se "
    }

    def __init__(self):
        pass

    def get_all(self, infinitive: str):
        return {
            Conjug.first_singular: self.get(Conjug.first_singular, infinitive),
            Conjug.second_singular: self.get(Conjug.second_singular,
                                             infinitive),
            Conjug.third_singular: self.get(Conjug.third_singular, infinitive),
            Conjug.first_plural: self.get(Conjug.first_plural, infinitive),
            Conjug.second_plural: self.get(Conjug.second_plural, infinitive),
            Conjug.third_plural: self.get(Conjug.third_plural, infinitive)
        }

    def get(self, part_of_speech: str, infinitive: str) -> str:
        key = infinitive.lower()
        if key in Conjug.__record:
            if Conjug.__record[key][0] == part_of_speech.lower():
                return Conjug.__record[key][1]

        og_verb = infinitive

        # Reflexive infinitives end in "se"; strip the suffix before conjugating.
        contains = infinitive.endswith("se")

        if contains:
            infinitive = infinitive[:-2]

        if infinitive == "pasar":
            if part_of_speech == Conjug.first_singular:
                Conjug.__record.update({
                    og_verb.lower(): (part_of_speech.lower(),
                                      ("me " if contains else "") + "pasé")
                })
                return ("me " if contains else "") + "pasé"
            elif part_of_speech == Conjug.second_singular:
                Conjug.__record.update({
                    og_verb.lower(): (part_of_speech.lower(),
                                      ("te " if contains else "") + "pasaste")
                })
                return ("te " if contains else "") + "pasaste"
            elif part_of_speech == Conjug.third_singular:
                Conjug.__record.update({
                    og_verb.lower(): (part_of_speech.lower(),
                                      ("se " if contains else "") + "pasó")
                })
                return ("se " if contains else "") + "pasó"
            elif part_of_speech == Conjug.first_plural:
                Conjug.__record.update({
                    og_verb.lower(): (part_of_speech.lower(),
                                      ("nos " if contains else "") + "pasamos")
                })
                return ("nos " if contains else "") + "pasamos"
            elif part_of_speech == Conjug.second_plural:
                Conjug.__record.update({
                    og_verb.lower():
                    (part_of_speech.lower(),
                     ("os " if contains else "") + "pasasteis")
                })
                return ("os " if contains else "") + "pasasteis"
            elif part_of_speech == Conjug.third_plural:
                Conjug.__record.update({
                    og_verb.lower(): (part_of_speech.lower(),
                                      ("se " if contains else "") + "pasaron")
                })
                return ("se " if contains else "") + "pasaron"

        try:
            iterable = Conjug.__conjugator.conjugate(infinitive).iterate()
        except Exception:
            return ''

        for conjugation in iterable:
            # Match the target tense (index 1) and person (index 2);
            # the conjugated form is at index 3.
            if (conjugation[1] == Conjug.__tense
                    and conjugation[2] == part_of_speech):
                correct = (Conjug.abbrev[part_of_speech]
                           if contains else "") + conjugation[3]
                Conjug.__record.update({
                    og_verb.lower(): (part_of_speech.lower(), correct.lower())
                })
                return correct
        return ''
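A usage sketch; for a regular verb the preterite forms should come out as expected, and reflexives gain the matching pronoun prefix:

conj = Conjug()
print(conj.get(Conjug.first_singular, 'hablar'))  # expected: 'hablé'
print(conj.get_all('lavarse'))  # e.g. {'1s': 'me lavé', '2s': 'te lavaste', ...}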
Example #11
def run(inputFilename,
        outputPath,
        keyword,
        first_occurrence,
        lemmatization=True):
    title_keyword = keyword
    # strip characters that are not allowed in file names
    for letter in '<>:"/\\|?*':
        title_keyword = title_keyword.replace(letter, "")
    kwtokens = word_tokenize(keyword.lower())
    # list of lists: the conjugated forms of each token in the keyword phrase
    kwlist = []
    default_conjugator = mlconjug.Conjugator(language='en')
    if first_occurrence:
        outputPathone = outputPath + "/subfile_1"
        outputPathtwo = outputPath + "/subfile_2"
        if not os.path.exists(outputPathone):
            os.mkdir(outputPathone)
        if not os.path.exists(outputPathtwo):
            os.mkdir(outputPathtwo)

    for token in kwtokens:
        if token.isalpha():
            conjus = default_conjugator.conjugate(token.lower())
            formlist = conjus.iterate()
            forms = []
            for form in formlist:
                forms.append(form[-1])  # the conjugated form is the last tuple element
            kwlist.append(list(dict.fromkeys(forms)))  # remove duplicates, keep order
        else:  # mlconjug can't conjugate punctuation (if it is part of the keyword)
            kwlist.append([token])
    csvtitle = outputPath + '/' + os.path.split(inputFilename)[1].split(
        ".")[0] + "_" + title_keyword + '.csv'
    csvExist = os.path.exists(csvtitle)
    with open(csvtitle, "a", newline="", encoding='utf-8',
              errors='ignore') as csvFile:
        writer = csv.writer(csvFile)
        if not csvExist:
            writer.writerow([
                "Document ID", "Document", 'SPLIT_Document', "SEARCH_WORD",
                "SENTENCE", "Sentence ID of FIRST_OCCURRENCE",
                "RELATIVE_POSITION", "FREQUENCY of OCCURRENCE"
            ])
            docIndex = 1
        else:
            df = pd.read_csv(csvtitle, encoding="ISO-8859-1")
            if len(df) == 0:
                docIndex = 1
            else:
                docIndex = df.iloc[-1][0] + 1
        first_occurrence_index = 0
        frequency = 0
        contents = []
        head, docname = os.path.split(inputFilename)
        title = docname.partition('.')[0]
        with open(inputFilename, "r", encoding='utf-8', errors='ignore') as f:
            docText = f.read()
        sentences_ = sent_tokenize(docText)  # the list of sentences in the corpus
        subfileindex = 1
        subfilePath = outputPath + os.sep + title + "_" + str(
            subfileindex) + '.txt'
        if first_occurrence == True:
            subfilePath = outputPathone + os.sep + title + "_" + str(
                subfileindex) + '.txt'

        subfile = open(subfilePath, 'w', encoding='utf-8', errors='ignore')
        sentence_index = 1

        for sent in sentences_:
            tokens_ = word_tokenize(sent)
            kwindex = 0
            kw = False
            for token in tokens_:
                t = token.lower()
                if kwindex == len(kwlist):
                    break
                if t == kwtokens[kwindex] or (
                        lemmatization and
                    (t in kwlist[kwindex]
                     or kwtokens[kwindex] == wordnet.morphy(t))
                ):  # two ways to recognize the keyword:
                    # (1) the form in the corpus matches an item in the conjugation list (verbs)
                    # (2) the lemmatized form in the corpus matches the keyword token (nouns or adjectives)
                    kw = True
                    kwindex += 1
                else:
                    kw = False
                    kwindex = 0
            if kw:  # the keyword was detected: start the next subfile
                frequency += 1
                presubfile = subfilePath
                if frequency == 1:
                    first_occurrence_index = sentence_index
                if not first_occurrence or frequency <= 1:
                    subfileindex += 1
                    subfilePath = outputPath + os.sep + title + "_" + str(
                        subfileindex) + '.txt'
                    if first_occurrence and subfileindex == 1:
                        subfilePath = outputPathone + os.sep + title + "_" + str(
                            subfileindex) + '.txt'
                    if first_occurrence and subfileindex == 2:
                        subfilePath = outputPathtwo + os.sep + title + "_" + str(
                            subfileindex) + '.txt'
                    # close the previous subfile before opening the next one
                    subfile.close()
                    subfile = open(subfilePath,
                                   'w',
                                   encoding='utf-8',
                                   errors='ignore')

                contents.append([
                    docIndex, inputFilename, presubfile, keyword, sent,
                    first_occurrence_index, sentence_index / len(sentences_),
                    frequency
                ])
                # writer.writerow([docIndex, inputFilename, presubfile, keyword, sent, first_occurrence_index, sentence_index / len(sentences_), frequency])
            subfile.write(sent + " ")
            sentence_index += 1
        # print(contents)
        l = len(contents)
        # print("length:",l)
        if l != 0 and first_occurrence:
            f = contents[-1][-1]
            if f > 1: f -= 1
            # print(f)
            subpath = contents[-1][2]
            for i in reversed(range(l)):
                # print(contents[i][2])
                if contents[i][2] == subpath:
                    contents[i][-1] = f
                # elif l > 1:
                #     f = contents[i][-1] - 1
                #     contents[i][-1] = f
                #     subpath = contents[i][2]
        writer.writerows(contents)
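A hypothetical call; the paths are illustrative:

# run(inputFilename='corpus/doc1.txt', outputPath='output',
#     keyword='sit down', first_occurrence=True)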