Python TreeTagger примеры, treetaggerwrapper.TreeTagger Python примеры использования

Пример #1

0

Показать файл

def main(argv):
    """Profile the authors of the documents found in ``options.input``.

    Parses ``argv`` with the parser returned by ``get_parser()``, detects the
    language of the input, tags it with the TreeTagger for that language,
    loads the stored model(s) and hands everything to
    ``FinalClassicationClass.test_model``.

    Leaves through ``parser.error`` when the input or output option is
    missing, and through ``sys.exit(1)`` when the detected language is not
    one of en, es, nl.
    """
    parser = get_parser()
    (options, args) = parser.parse_args(argv)

    if not (options.input and options.output):
        parser.error("Required arguments not provided")
        # Only reached if parser.error() has been customised not to exit.
        return

    lang = detect_language(options.input)
    lang_code = lang.lower()
    if lang_code not in ('en', 'es', 'nl'):
        sys.stderr.write('Language other than en, es, nl\n')
        sys.exit(1)

    # Print forms below produce the same output under Python 2 and Python 3.
    print("")
    print("Current Language:  %s" % lang)

    final_model_path = change_path_to_windows_style(options.model)
    final_output_path = change_path_to_windows_style(options.output)

    # Only the tagger for the detected language is started, and only once the
    # arguments are known to be valid.
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang_code, TAGDIR="C:\\TreeTagger")
    preprocessor = PreprocessingClass()
    main_classifier = FinalClassicationClass(lang_code, options.input, final_output_path, None, 2)

    dataset_input = preprocessor.read_all_files(options.input, "Test Set", lang_code)
    X, y_author = preprocessor.split_lists_dev(dataset_input)
    X_list_pos_tags, X_list_lemma = preprocessor.stem_and_pos(X, tagger)
    pipelined_dictionary = preprocessor.create_pipeline_dict(X, X_list_lemma, X_list_pos_tags, y_author)
    main_classifier.dataset_statistics_dev(X, y_author)

    # load models
    if lang_code == "nl":
        # For nl a single (gender) model is loaded and no age model is passed on.
        gender_model = load_model(final_model_path, lang_code)
        main_classifier.test_model(gender_model, pipelined_dictionary, lang, y_author, final_output_path)
    else:
        gender_model, age_model = load_model(final_model_path, lang_code)
        main_classifier.test_model(gender_model, pipelined_dictionary, lang, y_author, final_output_path, age_model)

Пример #2

0

Показать файл

 def __init__(self, index_path="/INDEX"):
     """Create empty indexes, the POS filters, both taggers and the file paths.

     index_path: folder under which the index files are located.
     """
     # In-memory indexes, filled elsewhere.
     self.index = {}
     self.index_document = {}

     # On-disk layout, all derived from index_path.
     self.save_folder = index_path
     self.keep_path = index_path + "/documentsIndex"
     self.index_name = index_path + "/index.json"
     self.index_document_name = index_path + "/index_document.json"

     # POS-tag filters, one per language.
     self.plain_word_fr = re.compile("ABR|ADJ|NAM|NOM|VER")
     self.plain_word_en = re.compile("JJ|NP|NN|VB")

     # One tagger per language.
     self.fr_tagger = treetaggerwrapper.TreeTagger(TAGLANG="fr")
     self.en_tagger = treetaggerwrapper.TreeTagger(TAGLANG="en")

Пример #3

0

Показать файл

Файл: general_functions.py Проект: elespdn/collation_spelling

def tag_poslemma(dirName):
    """Tag every ``.txt`` witness of ``data/<dirName>`` and write two CSV files.

    ``dictionaries/taggedAll_<dirName>.csv`` gets one row per token, built
    from the token and POS given by the 'froBfm' tagger and the lemma given
    by the 'stein' tagger, formatted as ``token<TAB>pos_lemma``.

    ``dictionaries/taggedDistinct_<dirName>.csv`` gets a header row followed
    by the first occurrence of every distinct row of the first file.
    """
    all_path = 'dictionaries/taggedAll_' + dirName + '.csv'
    distinct_path = 'dictionaries/taggedDistinct_' + dirName + '.csv'

    # Clean-up rules for the raw 'stein' output, compiled once.
    # A rule (r'\|.*', '') is deliberately absent so that the different
    # output possibilities are saved.
    stein_cleanup = [
        (re.compile(r'_.*'), ''),
        (re.compile(r'\d.*'), ''),
        # Encoding problem that does not seem to depend on the wrapper nor on
        # this script (maybe on the lexicon?). Not a real solution, but works.
        (re.compile('�'), 'ö'),
        (re.compile('<nolem>'), 'UNKNOWN'),
    ]

    # The taggers are reusable, so they are started once instead of per file.
    taggerFro = treetaggerwrapper.TreeTagger(TAGLANG='froBfm')
    taggerStein = treetaggerwrapper.TreeTagger(TAGLANG='stein')

    # TAGGED-ALL
    # The file is closed (and therefore flushed) before it is read back below.
    with open(all_path, 'w') as all_file:
        writer = csv.writer(all_file)

        for witness in os.listdir('data/' + dirName):
            if not witness.endswith(".txt"):
                continue
            with open('data/' + dirName + '/' + witness) as wit:
                witText = wit.read()

            tagsFro = taggerFro.tag_text(witText)  # list of tagged words (FRO)

            # The raw STEIN output carries too much information and symbols:
            # join it, clean it as one string, then split it back into lines.
            tagsSteinStr = '\n'.join(taggerStein.tag_text(witText))
            for pattern, replacement in stein_cleanup:
                tagsSteinStr = pattern.sub(replacement, tagsSteinStr)
            tagsStein = tagsSteinStr.split('\n')  # list of tagged words (STEIN)

            for itemFro, itemStein in zip(tagsFro, tagsStein):
                fro_fields = itemFro.split('\t')
                token = fro_fields[0]
                pos = fro_fields[1]
                lemma = itemStein.split('\t')[2]
                # One item per row, made of token, pos and lemma.
                writer.writerow([token + '\t' + pos + '_' + lemma])

    # TAGGED-DISTINCT
    with open(all_path, 'r') as all_file, open(distinct_path, 'w') as distinct_file:
        reader = csv.reader(all_file, delimiter='\t')
        writer = csv.writer(distinct_file, delimiter=',')
        entries = set()
        writer.writerow(['Original', 'Normalised'])
        for row in reader:
            key = (row[0], row[1])
            if key not in entries:
                writer.writerow(row)
                entries.add(key)

Пример #4

0

Показать файл

Файл: semantic_analysis.py Проект: mglorion/semiotweet

def tokenize_and_lemmatize_tweets(listTweets):
    """Tokenize & lemmatize a list of texts.

    Each tweet dict gets two new keys, "tokenArray" and "lemmaArray", built
    from its "text" once mentions are removed and the text is lower-cased.
    Entries shorter than two characters or present in ``french_stop_words``
    are dropped. The same list is returned.
    """
    # Setting up TreeTagger
    fr_tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr', TAGDIR=LOCALTAGDIR)

    def is_noise(candidate):
        return len(candidate) < 2 or candidate in french_stop_words

    for tweet in listTweets:
        cleaned = mention_regex.sub("", tweet["text"]).lower()
        kept_tokens, kept_lemmas = [], []

        for entry in treetaggerwrapper.make_tags(fr_tagger.tag_text(cleaned)):
            # Regular tags expose word/lemma.
            if hasattr(entry, 'word'):
                if not is_noise(entry.lemma):
                    kept_tokens.append(entry.word)
                    kept_lemmas.append(entry.lemma)
                continue

            # Everything else only carries the raw text in `.what`.
            raw = entry.what
            if is_noise(raw):
                continue
            if raw.startswith(("<repurl", "<repdns")):
                # Keep what stands between the first and last double quote;
                # such entries are tokens only, not lemmas.
                raw = raw[raw.find('"') + 1:raw.rfind('"')]
            else:
                kept_lemmas.append(raw)
            kept_tokens.append(raw)

        tweet["tokenArray"] = kept_tokens
        tweet["lemmaArray"] = kept_lemmas

    return listTweets

Пример #5

0

Показать файл

def postag_directory(input_directory, output_directory):
    """
    POS-tag a directory full of documents (opinions, tweets, comments).

    Args:
        input_directory: Directory whose ``*.txt`` files are read as UTF-8
            and tagged with the English TreeTagger.

        output_directory: Directory receiving one ``list<i>.txt`` per input
            file, ``i`` being the position of the input file in sorted order.
            Each output line is one line of tagger output. The directory is
            not created here.

    """
    import io

    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    filenames = sorted(glob.glob(os.path.join(input_directory, '*.txt')))

    # Each file is tagged and written straight away, so only one document's
    # tags are held in memory at a time.
    for i, filename in enumerate(filenames):
        with codecs.open(filename, encoding='utf-8') as f:
            content = f.read()

        tags = tagger.tag_text(content)

        path = os.path.join(output_directory, "list%d.txt" % i)
        # Written as UTF-8 like the input, instead of relying on the platform
        # default encoding, which fails on characters it cannot represent.
        with io.open(path, "w", encoding='utf-8') as f:
            for item in tags:
                f.write(item + u"\n")

Пример #6

0

Показать файл

def postag_pandas(input_file, output_file):
    """POS-tag the 'content' column of a '|'-separated file and save the result.

    The first line of ``input_file`` supplies the column names; every line is
    split on its first '|' only. A 'POS-tagged_content' column is added,
    leading double quotes are stripped from 'content', the column names are
    printed, and the frame is written to ``output_file`` with '|' as the
    separator. Returns whatever ``DataFrame.to_csv`` returns.
    """
    def postag_string(s):
        '''Returns tagged text from string s'''
        text = s.decode('UTF-8') if isinstance(s, basestring) else s
        return tagger.tag_text(text)

    def fix_format(x):
        '''x - a list or an array; each entry becomes a tuple of its tab-separated fields'''
        # Encoded variant; the non-encoded one would be tuple(i.split('\t')).
        return [tuple(i.encode().split('\t')) for i in x]

    # Reading in the file
    with open(input_file) as f:
        all_lines = [line.strip().split('|', 1) for line in f]

    records = all_lines[1:]
    header = all_lines[0]
    df = pd.DataFrame(records, columns=header)

    # postag_string picks this tagger up through its closure.
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')

    tagged = df['content'].apply(postag_string)
    df['POS-tagged_content'] = tagged
    # Format fix
    df['POS-tagged_content'] = df['POS-tagged_content'].apply(fix_format)
    df['content'] = df['content'].map(lambda x: x.lstrip('"""' ''))

    print(list(df.columns.values))
    return df.to_csv(output_file, sep='|', index=False)

Пример #7

0

Показать файл

Файл: topic_model.py Проект: izzbizz/crisis-reporting

	def __init__(self, input, output):
		"""Store the input/output prefixes, start the tagger and load the stop words.

		input: prefix to which 'news_stops' is appended to find the stop word file.
		output: stored as-is in ``self.outp``.
		"""
		self.inp = input
		self.outp = output
		self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')

		# Stop words: whitespace-separated entries of <input>news_stops.
		with open(self.inp + 'news_stops') as stops_file:
			self.stopwords = stops_file.read().strip().split()

		# POS tags to exclude.
		self.excltags = [
			'CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'RP',
			'SYM', 'TO', 'WDT', 'WP', 'WP$', 'WRB',
			'VB', 'VBD', 'VBG', 'VBN', 'VBZ', 'VBP',
			'VD', 'VDD', 'VDG', 'VDN', 'VDZ', 'VDP',
			'VH', 'VHD', 'VHG', 'VHN', 'VHZ', 'VHP',
			'RB', 'RBR', 'RBS', 'JJ', 'JJR', 'JJS',
		]

Пример #8

0

Показать файл

Файл: TreeTagerParser.py Проект: Garad1/Projet_Extraction

def Tagger(dataset):
    """Write the words of each comment whose POS tag is in ``ListeTags``.

    Every line of ``dataset`` has its URLs replaced by "website " and every
    character other than letters, digits, space, apostrophe and hyphen
    replaced by a space. The line is then tagged with the English TreeTagger,
    and the kept words are written, each followed by a space, as one line of
    ``DataTAG.txt``.
    """
    # ------------ Tags to take into account ------------------------------#
    ListeTags = frozenset([
        "JJ", "JJR", "JJS", "VV", "VVD", "VVG", "VVN", "RBR", "RBS", "UH", "RB"
    ])

    # Patterns compiled once instead of on every line.
    url_pattern = re.compile(r'http:\/\/[0-9a-zA-Z-_\.]*(\.[a-z]{0,9}\/?)?')
    special_chars = re.compile("[^a-zA-Z0-9 '-]")

    # ------------ Wrapper configuration -----------------------------------#
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',
                                          TAGDIR='./TreeTagger',
                                          TAGINENC='utf-8',
                                          TAGOUTENC='utf-8')

    # ------------ Analysis ------------------------------------------------#
    # Both files are closed (and the output flushed) when the block ends.
    with open(dataset, 'r') as D, open("DataTAG.txt", 'w') as Tag:
        for line in D:
            line = url_pattern.sub("website ", line)  # remove URLs
            line = special_chars.sub(" ", line)  # remove special characters
            if isinstance(line, bytes):
                # Python 2 reads byte strings; Python 3 already yields text.
                line = line.decode("utf-8")
            tags = tagger.TagText(line)  # tag every word of the comment
            for words in tags:
                w = words.split("\t")
                # Tagger lines without a POS column cannot be matched.
                if len(w) > 1 and w[1] in ListeTags:
                    Tag.write(w[0] + " ")
            Tag.write("\n")

Пример #9

0

Показать файл

Файл: morphy.py Проект: kaharjan/pawn

 def _set_treetagger(self, language):
     """Make TreeTagger the backend used by ``self.morphy`` for ``language``.

     Raises ImportError when the tagger cannot be created. The exception
     type is unchanged for callers; it now carries the underlying message.
     """
     import treetaggerwrapper as ttw
     try:
         tagger = ttw.TreeTagger(TAGLANG=language)
     except ttw.TreeTaggerError as err:
         raise ImportError(
             "TreeTagger is not usable for language %r: %s" % (language, err))
     self._tagger = tagger
     self.morphy = self._treetagger_morphy

Пример #10

0

Показать файл

Файл: PoolOfThreadedTreeTaggers2.py Проект: jkpubsrc/python-module-jk-treetaggerwrapper

	def _useTagger(self, langID:str):
		"""
		Generator that hands out one tagger for ``langID`` and takes it back.

		An idle tagger of that language is reused when one exists; otherwise a
		new one is created and the "tagger created" event is fired. After the
		single ``yield`` the tagger is appended to the idle instances again.

		NOTE(review): presumably wrapped by ``contextlib.contextmanager`` on a
		line above this block -- confirm.
		"""
		assert isinstance(langID, str)

		with self.__mainLock:
			langIDCache = self.__unused.get(langID, None)
			if langIDCache is None:
				langIDCache = PoolOfThreadedTreeTaggers._LangSpecificCache(langID)
				self.__unused[langID] = langIDCache

		langIDCache.touch()

		# The emptiness test and the removal happen under the same lock
		# acquisition. Testing outside the lock let another thread take the
		# last idle instance in between, making the removal raise IndexError.
		tagger = None
		with langIDCache.langLock:
			if langIDCache.idleInstances:
				tagger = langIDCache.idleInstances.pop()
				langIDCache.countUsedInstances += 1

		if tagger is None:
			# Created outside the lock so other threads are not blocked
			# while the tagger starts up.
			tagger = ttpw.TreeTagger(
				TAGLANG=langID,
				TAGOPT="-prob -threshold 0.7 -token -lemma -sgml -quiet",
				TAGDIR=self.__treeTaggerInstallationPath)
			self.__onTaggerCreated.fire(self, langID)
			with langIDCache.langLock:
				langIDCache.countUsedInstances += 1

		try:
			yield tagger
		finally:
			with langIDCache.langLock:
				langIDCache.countUsedInstances -= 1
				langIDCache.idleInstances.append(tagger)

Пример #11

0

Показать файл

Файл: make_word_level_graph.py Проект: mnishz/make_word_level_graph

def get_level(file_name):
    """Count the distinct words of ``<file_name>.txt`` per word level.

    The file is tagged with the English TreeTagger. Lemmas starting with a
    word character are lower-cased and split on '-', and each distinct piece
    is looked up with ``word_level.get_level``. The counts per level and the
    words of level 0 are printed.

    Returns the list of 13 counters, indexed by level.
    """
    import re
    import word_level

    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=os.getcwd())
    tags2 = ttw.make_tags(tagger.tag_file(file_name + '.txt'))

    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    starts_with_word_char = re.compile(r'^\w')

    words = set()
    for tag in tags2:
        if starts_with_word_char.search(tag.lemma):
            words.update(tag.lemma.lower().split('-'))

    sentence_level = [0] * 13
    unknown_words = []

    for word in words:
        level = word_level.get_level(word)
        sentence_level[level] += 1
        if level == 0:
            unknown_words.append(word)

    print(sentence_level)
    print(unknown_words)

    return sentence_level

Пример #12

0

Показать файл

 def __init__(self, text):
     """Split ``text`` into sentences, tag each one and build the joined form.

     ``self.tagged`` holds the tagger output per sentence; ``self.parsed``
     holds, per sentence, that output joined with spaces and with every tab
     replaced by an underscore.
     """
     self.text = text
     # Cannot be picked up when lower-cased, so the text is used as given.
     self.sentences = sent_tokenize(self.text)

     en_tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',TAGDIR='/home/lr/hayashi/ra_web_app')
     self.tagged = []
     for sentence in self.sentences:
         self.tagged.append(en_tagger.TagText(sentence))

     self.parsed = []
     for tagged_sentence in self.tagged:
         joined = ' '.join(tagged_sentence)
         self.parsed.append(joined.replace('\t', '_'))

Пример #13

0

Показать файл

Файл: train_en.py Проект: takotakot/unsupervised-pos-tagging

def build_corpus(filename):
    """Build an ``ihmm.corpus`` from a UTF-8 file read line by line.

    Each line is stripped and tagged with the English TreeTagger. For every
    line with a non-empty tagging result, the third field of each three-field
    tagger entry (or the first field of any other entry) is collected and the
    resulting word list is added to the corpus.

    Returns the filled corpus.
    """
    corpus = ihmm.corpus()
    tagger = treetaggerwrapper.TreeTagger(TAGLANG="en")

    # The file is read once; the earlier extra pass only filled a list that
    # was never used.
    with codecs.open(filename, "r", "utf-8") as f:
        for i, sentence_str in enumerate(f):
            sentence_str = sentence_str.strip()
            if (i + 1) % 10 == 0:
                printr("データを準備しています ... {}".format(i + 1))
            result = tagger.tag_text(sentence_str)
            if len(result) == 0:
                continue
            # The training data is built while tagging.
            # English is normally space-delimited, so this may look
            # unnecessary, but TreeTagger splits e.g. "$600" into "$ 600".
            # When evaluating in plot_en.py the number of words of a sentence
            # could then differ between space splitting and TreeTagger, which
            # would make a correct evaluation impossible. Word segmentation
            # is therefore always the one done by TreeTagger.
            words = []
            for metadata in result:
                metadata = metadata.split("\t")
                if len(metadata) == 3:
                    word, true_tag, lowercase = metadata
                    # NOTE(review): the returned value is not used here; the
                    # call is kept in case collapse_true_tag has side effects.
                    collapse_true_tag(true_tag, lowercase)
                else:
                    lowercase = metadata[0]
                words.append(lowercase)
            # Add the data
            corpus.add_words(words)

    return corpus

Пример #14

0

Показать файл

Файл: CookBot.py Проект: ThibaultLasou/CookBot

def postTreatment(TW):
def preTreatment(TW):
    """Turn numbers written out in letters into their numeric value.

    For every word tagged "NUM" whose lemma is not "@card@", the value is
    taken from ``numbers``. When the word is unknown there, the user is asked
    to type it in digits, the answer is tagged, and the number found in it is
    stored in ``numbers`` under the original spelling.

    Not implemented: merging several tokens into one ("pomme de terre" as a
    single token, for example).

    Returns ``TW``, modified in place.
    """
    for w in TW:
        if w.posTag == "NUM" and w.lemma != "@card@":
            # Remember the spelling: w.word is overwritten below and must not
            # be used as the dictionary key afterwards.
            spelled = w.word
            try:
                w.word = numbers[spelled]
            except KeyError:
                ans = input("Je ne comprends pas bien ce nombre : " + spelled + ". Pourriez-vous l'écrire en chiffres ?\n")
                taggedAns = formatTTG(tagger.TagText(ans))
                for wAns in taggedAns:
                    if wAns.posTag == "NUM" and wAns.lemma == "@card@":
                        numbers[spelled] = int(wAns.word)
                        w.word = numbers[spelled]
    return TW
    
# Interactive entry point: ask for a request, tag it, and print the resulting
# Request when an eating intention is recognised.
tagger = treetaggerwrapper.TreeTagger(
    TAGLANG='fr',
    TAGDIR='./Ressources',
    TAGINENC='utf-8',
    TAGOUTENC='utf-8',
)
demande = input("Bonjour ! Que puis-je pour vous aujourd'hui ?\n")
taggedWords = postTreatment(formatTTG(tagger.TagText(demande)))

if eatingIntention(taggedWords):
    r = Request()
    r.fillRequest(taggedWords)
    r.printRequest()

Пример #15

0

Показать файл

Файл: foncStats.py Проект: AlexandrWeber/Projet_Alexandre_Valentin

def tok(tex):
    """
    Tag the text and return it in an easy-to-read column format.

    Input: text
    Output: tagged text, one tagger entry per line, every field followed by
        a tab; the empty string when the tagger yields nothing.
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    tags2 = treetaggerwrapper.make_tags(tagger.tag_text(tex))
    # Single dump of the parsed tags. The extra print of the growing list on
    # every iteration was debug output and has been dropped.
    pprint.pprint(tags2)

    grammar = []
    for element in tags2:
        for field in element:
            grammar.append(field)
            grammar.append("\t")
        grammar.append("\n")

    # Joined once after the loop, so the result is defined even with no tags.
    return "".join(grammar)

Пример #16

0

Показать файл

Файл: tagging_features.py Проект: AutoDataPlatform/python

def pos_tag(inp: str, out, tagdir: str = '/usr/local/tree-tagger'):
    """Generate the POS representation of the data and pickle it into ``out``.

    The texts, genders and ages come from ``read_data(inp)``. Every text is
    replaced by the space-joined second fields of the tagger lines that split
    into exactly two whitespace-separated parts. Progress is logged when the
    document index hits one of the percentage thresholds.
    """
    texts, genders, ages = read_data(inp)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',
                                          TAGDIR=tagdir,
                                          TAGOPT='-token -sgml')
    logging.info('POS tagging data')

    # Document index -> percentage label; for equal indexes the later
    # percentage wins.
    total = len(texts)
    d_infothresholds = {}
    for pct in range(0, 101):
        d_infothresholds[int((pct / 100.0 * total))] = "%i%%" % (pct)

    pos_texts = []
    for idx, text in enumerate(texts):
        tags = []
        for el in tagger.tag_text(text):
            if len(el.split()) == 2:
                tags.append(el.split('\t')[1])
        pos_texts.append(' '.join(tags))

        if idx in d_infothresholds:
            reached = d_infothresholds[idx]
            logging.info('{} of documents processed'.format(reached))

    logging.info('Pickling results to {}'.format(out.name))
    pickle.dump((pos_texts, genders, ages), out)

Пример #17

0

Показать файл

Файл: degender.py Проект: dcleung/w2v-gender

def process_files(source_path,
                  target_path,
                  languages=('en',),
                  exception_file=None):
    """Process every ``<language>/**/*.gz`` file under ``source_path``.

    For each language a TreeTagger is created. Every matching gzip file is
    read as text, its kept lines are passed through ``process`` and the
    space-joined result is written, one line per input line, to the same
    relative path under ``target_path``. Target folders are created as needed.

    Lines starting with a newline, tab, space or '<' are skipped.

    Args:
        source_path: root folder of the input files.
        target_path: root folder of the output files.
        languages: iterable of language codes, also used as sub-folder names.
        exception_file: optional JSON file. Its entry for the language is
            passed to ``process``; a dict entry is flattened into one list
            of terms first.
    """
    source_path = Path(source_path)
    target_path = Path(target_path)
    skipped_prefixes = ('\n', '\t', ' ', '<')

    for l in languages:
        tagger = treetaggerwrapper.TreeTagger(TAGLANG=l)
        paths = list(source_path.glob(str(Path(l) / '**/*.gz')))

        exceptions = None
        if exception_file:
            with open(exception_file) as f:
                exceptions = json.load(f)[l]
            if isinstance(exceptions, dict):
                exceptions = [
                    term for terms in exceptions.values() for term in terms
                ]

        for path in tqdm(paths, desc=l, total=len(paths)):
            target_file = target_path / path.relative_to(source_path)
            os.makedirs(target_file.parent, exist_ok=True)

            with gzip.open(path, 'rt') as source, \
                    gzip.open(target_file, 'wt') as target:
                for line in source:
                    if line.startswith(skipped_prefixes):
                        continue
                    p = ' '.join(process(line, l, tagger, exceptions))
                    target.write(p + '\n')

Пример #18

0

Показать файл

Файл: corpus_preparation.py Проект: TeresaKa/lyrik_prosa_stilometrie

def lemma(inpath, outpath, charFilter):
    ''' Lemmatise the ``.txt`` files found in the folder ``inpath``.

    Each file is tagged with the German TreeTagger and its lemmas, each
    followed by a space, are concatenated. That string is passed through
    ``replace(<lemmas>, charFilter + ' ')`` and written to
    ``outpath + <2nd>_<3rd>.txt``, where <2nd> and <3rd> are the second and
    third '_'-separated parts of the input file name. Existing output is
    overwritten.
    '''
    # One tagger for all files instead of a new one per file.
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

    for text in os.listdir(inpath):
        if not text.endswith('.txt'):
            continue

        with open(inpath + '/' + text, 'r') as source:
            content = source.read()

        tags2 = treetaggerwrapper.make_tags(tagger.tag_text(content))
        print("text", text)

        # Entries without a lemma are skipped explicitly instead of
        # swallowing every exception.
        result = ''.join(
            tag.lemma + ' ' for tag in tags2 if hasattr(tag, 'lemma'))

        name_parts = text.split('_')
        target = outpath + name_parts[1] + '_' + name_parts[2] + '.txt'
        # 'w' both creates and truncates, which is what the earlier
        # "empty the file if it exists, then append" sequence amounted to.
        with open(target, 'w') as txtFile:
            txtFile.write(replace(result, charFilter + ' '))
    return

Пример #19

0

Показать файл

Файл: indexerDocuments.py Проект: AlexandrWeber/Projet_RI

def taggerTexte(texte):
    """
    Normalise the text and return its lemmas.

    Arguments:
        texte: text to process; typographic apostrophes are replaced by
            straight ones before language detection and tagging.

    Returns:
        list of the relevant lemmas, lower-cased: those whose POS tag
        contains one of the kept French or English markers and whose POS and
        lemma contain no "@".

    Raises:
        ValueError: the detected language is neither "fr" nor "en".
    """
    tex = texte.replace("’", "'")

    # Detected once: the result selects the tagger, and an unsupported
    # language is reported instead of leaving the variable unbound.
    langdet = detect(tex)
    if langdet not in ('fr', 'en'):
        raise ValueError("Unsupported language detected: %r" % (langdet,))

    tagger = treetaggerwrapper.TreeTagger(TAGLANG=langdet)
    tags2 = treetaggerwrapper.make_tags(tagger.tag_text(tex))

    # POS markers kept: French tagset first, then English tagset.
    kept_markers = ("VER", "NOM", "ABR", "ADJ",
                    "VV", "NN", "NP", "JJ", "VH", "VB", "MD")

    lemmes = []
    for tag in tags2:
        # Entries with fewer than three fields carry no POS/lemma pair.
        if len(tag) < 3:
            continue
        pos, lemme = tag[1], tag[2]
        # Placeholders such as "@card@" are not real lemmas.
        if "@" in pos or "@" in lemme:
            continue
        if any(marker in pos for marker in kept_markers):
            lemmes.append(lemme.lower())
    return lemmes

Пример #20

0

Показать файл

    def fit_transform(self):
        """Add parsing, surface and sentiment feature columns to ``df``.

        The text column ``self.text_column`` of ``self.df`` is tagged with the
        French TreeTagger. The columns 'tags', 'lemma', 'text_lemma', 'pos',
        'number_verbs', 'number_proper_nouns', 'number_imperative_verb',
        'has_intensifier' and 'avg_polarity' are then written to ``df``,
        which is returned.

        NOTE(review): ``df`` is never assigned in this method, so it resolves
        to a name outside it. Presumably ``self.df`` (or a copy of it) was
        intended -- confirm.
        """

        # Parsing (lemmatisation and pos-tagging)
        tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
        df['tags'] = self.df[self.text_column].apply(
            lambda x: treetaggerwrapper.make_tags(tagger.tag_text(x)))
        # Entries that are not treetaggerwrapper.Tag instances contribute ''.
        df['lemma'] = df.tags.apply(lambda x: [(t.lemma).lower() if isinstance(
            t, treetaggerwrapper.Tag) else '' for t in x])
        df['text_lemma'] = df.apply(lambda row: " ".join(row.lemma), axis=1)
        df['pos'] = df.tags.apply(
            lambda x:
            [t.pos if isinstance(t, treetaggerwrapper.Tag) else '' for t in x])

        # surface based features
        df['number_verbs'] = df.pos.apply(lambda x: self.number_verbs(x))
        df['number_proper_nouns'] = df.pos.apply(
            lambda x: self.number_proper_nouns(x))
        df['number_imperative_verb'] = df.pos.apply(
            lambda x: self.number_imperative_verb(x))

        # sentiment features
        lex_dict = self.lex_df.to_dict('index')
        # Per row: the lexicon entries for which has_expression() is True for
        # the row's lemmas, None for the others.
        list_lex_exp = df.lemma.apply(lambda x: [
            lex_dict[key] if self.has_expression(exp=lex_dict[key]['lemma'],
                                                 string_ls=x,
                                                 group=lex_dict[key]['group'])
            == True else None for key in lex_dict.keys()
        ])
        # Drop the None placeholders.
        list_lex_exp = list_lex_exp.apply(lambda x: list(filter(None, x)))
        has_int = self.has_intensifier(list_lex_exp)
        avg_pol = self.avg_polarity(list_lex_exp)

        df['has_intensifier'] = has_int
        df['avg_polarity'] = avg_pol
        return df

Пример #21

0

Показать файл

def analyze_ft_others(outPath, name):
    """
    Run the readability analysis on the fulltexts of one journal.

    :param outPath: Raw fulltext directory; 'pmid/' and 'metrics/' are
        appended to it directly, without adding a separator
    :param name: Name of journal to be processed
    :return: None; ``rf.analyze`` receives outPath/metrics/<name>_metrics.json
        as its save path

    """

    from functions import readabilityFunctions as rf
    import treetaggerwrapper

    print("Processing fulltexts of journal " + name)

    source_json = outPath + 'pmid/' + name + '.json'
    metrics_json = outPath + 'metrics/' + name + '_metrics.json'

    en_tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    columns = {
        'year', 'pmid', 'doi', 'strippedText',
        'wordLength', 'wordCount', 'sentenceCount', 'sylCount',
        'flesch', 'NDC', 'PercDiffWord', 'DiffWord_lst',
    }

    rf.analyze(
        path=source_json,
        spath=metrics_json,
        tagger=en_tagger,
        textType='body',
        columnList=columns,
    )

Пример #22

0

Показать файл

Файл: text2features.py Проект: theasder/master_thesis_utilities

    def __init__(self, data):
        """Turn ``data`` into per-sentence token features and labels.

        Each element is deep-copied, its 'text' is replaced by its lemmas
        joined with spaces, and every sentence of that text goes through
        ``self.word2features``; tokens and labels are collected in
        ``self.processed_data`` and ``self.labeling``.

        Raises NormalizeError when, after all sentences of an element, one of
        its 'Name', 'Surname' or 'Location' entries is not an empty list.
        """
        self.processed_data = []
        self.labeling = []
        self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='ru')

        entity_keys = ['Name', 'Surname', 'Location']

        # 890 : 920 hangs, so that slice of the data is left out.
        usable = data[:890] + data[920:]

        for original in tqdm(usable):
            elem = deepcopy(original)
            elem['text'] = ' '.join(lemmatize_words(elem['text']))

            for sent in sent_tokenize(elem['text']):
                tokens, labels = self.word2features(sent, elem)
                self.processed_data.append(tokens)
                self.labeling.append(labels)

            for key in entity_keys:
                if elem[key] != []:
                    details = (elem[key], elem['text'],
                               self.pos_tagging(elem['text']))
                    raise NormalizeError(details)

Пример #23

0

Показать файл

def get_documents(docs, stopwords):
    """Extract the documents of the corpus.

    :param docs: [(source, datetime, text)]
    :param stopwords: not used in this function
    :return: list of dicts with the keys 'idx', 'source', 'time', 'raw' and
        'tags', one per kept document
    """
    # Drop the short lines of fewer than 4 words.
    corpus = []
    for doc in docs:
        if len(doc[2].split()) > 3:
            corpus.append((doc[0], doc[1], doc[2]))

    tagger = tagr.TreeTagger(
        TAGLANG='fr',
        TAGDIR='c:/Applications/TreeTagger',
        TAGPARFILE='C:/Applications/TreeTagger/lib/french-utf8.par')

    documents = []
    idx = 1
    start_time = time.time()
    for source, stamp, raw in corpus:
        parsed = tagr.make_tags(tagger.tag_text(clean_text(raw)))
        # Only genuine Tag entries are kept.
        tags = [entry for entry in parsed if type(entry) == tagr.Tag]
        # Each element of the result is a dictionary.
        record = {
            'idx': idx,
            'source': source,
            'time': stamp,
            'raw': raw,
            'tags': tags,
        }
        documents.append(record)
        # Print the progress percentage info; the helper returns the next idx.
        idx = progress_per(idx, len(corpus), start_time)
    print()

    return documents

Пример #24

0

Показать файл

    def __init__(self):
        """Start with empty language information and a German TreeTagger."""

        # Texts and the name of the drama being processed.
        self._plainText = ""
        self._filteredText = ""
        self._currentDramaName = ""

        # Tokens.
        self._tokens = []
        self._tokensAndPOS = []
        self._tokensWithoutStopwords = []

        # Lemmas.
        self._lemmas = []
        self._lemmasAndPOS = []
        self._lemmasWithoutStopwords = []
        self._lemmasWithLanguageInfo = []

        # Lookup tables.
        self._lemmaAndPOSDict = {}
        self._lemmasAndPOSAndTokensDict = {}

        # Stop words and the names of the available stop word lists.
        self._stopwords = []
        self._stopwords_lemmatized = []
        self._stopwordLists = ["standardList", "enhancedList", "enhancedFilteredList"]

        self._tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

Пример #25

0

Показать файл

Файл: verb-analysis.py Проект: maravanderploeg/ParticleDetector

def get_tagger(language: str):
    """
    Create a TreeTagger for one language.

    :param language: language code e.g. de,nl
    :return: tagger object
    """
    return treetaggerwrapper.TreeTagger(TAGLANG=language)

Пример #26

0

Показать файл

def build_tree_tagger(text, source_file, output_path):
    """Tag ``text`` and ``source_file`` with the French TreeTagger.

    ``source_file`` is tagged into ``output_path / 'tagger_result.txt'``;
    ``output_path`` is created with its parents when it does not exist.

    Returns a pair: a dict mapping each tagged word of ``text`` (NFD
    normalised, then encoded to ASCII ignoring what cannot be encoded) to its
    {"pos", "lemma"}, and the full list produced by ``make_tags``.
    """
    # build a TreeTagger wrapper
    tagger = treetaggerwrapper.TreeTagger(TAGDIR=dir_tree_tagger, TAGLANG="fr")
    # tag text
    raw_tags = tagger.tag_text(text)

    if not output_path.exists():
        output_path.mkdir(parents=True)
    result_file = output_path / 'tagger_result.txt'
    tagger.tag_file_to(str(source_file), str(result_file))

    tags2 = treetaggerwrapper.make_tags(raw_tags)

    tag_dict = {}
    for tag in tags2:
        if not hasattr(tag, 'pos'):
            continue
        info = {"pos": tag.pos, "lemma": tag.lemma}
        key = unicodedata.normalize('NFD', tag.word).encode('ascii', 'ignore')
        tag_dict[key] = info
    return tag_dict, tags2

Пример #27

0

Показать файл

Файл: lemmatize.py Проект: sirdigital/licencjat

def process_file(out_file_name):
    """Lemmatise ``<out_file_name>.txt`` into ``<out_file_name>_lemmatized.txt``.

    Every input line is tagged with the Polish TreeTagger under a 5 second
    timeout. The lemmas of the entries whose POS is not 'SENT' or 'interp'
    are written, space separated, as one output line. Lines that hit the
    timeout are skipped. The count of processed lines is printed every 100
    lines.
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='pl')
    wrong_pos = ('SENT', 'interp')
    i = 0

    # Both files are closed (and the output flushed) when the block ends.
    with open(out_file_name + '.txt', 'r') as f, \
            open(out_file_name + '_lemmatized.txt', 'w') as w:
        for line in f:
            try:
                with timeout(5, exception=RuntimeError):
                    tags2 = treetaggerwrapper.make_tags(tagger.tag_text(line))
                    # Entries without a POS are skipped rather than raising
                    # AttributeError.
                    tag_list = [
                        tag.lemma for tag in tags2
                        if hasattr(tag, 'pos') and tag.pos not in wrong_pos
                    ]
                    w.write(' '.join(tag_list) + '\n')
            except RuntimeError:
                continue

            i += 1
            # Progress on every 100th line; the earlier test printed on all
            # the other lines instead.
            if i % 100 == 0:
                print(i)

Пример #28

0

Показать файл

def run_treetagger(text, language):
    """
    Tags the text string with a TreeTagger created for the given language.
    Returns the result of TreeTagger.tag_text for that text.
    """
    return treetaggerwrapper.TreeTagger(TAGLANG=language).tag_text(text)

Пример #29

0

Показать файл

def lemmatize_input_files():
    """Tag every ``../texts/txt/*.txt`` file with the French TreeTagger.

    Returns a dict mapping each file's base name to the result of
    ``make_tags`` on its tagging, called with ``exclude_nottags=True``.
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    tagged_by_name = {}
    for path in glob.glob("../texts/txt/*.txt"):
        raw_tags = tagger.tag_file(path)
        tagged_by_name[os.path.basename(path)] = treetaggerwrapper.make_tags(
            raw_tags, exclude_nottags=True)
    return tagged_by_name

Пример #30

0

Показать файл

def split_words(path, doc_id=''):
    """Build a TaggedDocument from the file at ``path``.

    The file is tagged with the English TreeTagger. The document's words are
    the lemmas of the entries whose POS is in ``NN_list``; its tags are
    ``[doc_id]``.
    """
    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=settings.TTBin)
    parsed = ttw.make_tags(tagger.tag_file(path))

    lemmas = []
    for tag in parsed:
        if tag.pos in NN_list:
            lemmas.append(tag.lemma)

    return TaggedDocument(tags=[doc_id], words=lemmas)

Python TreeTagger примеры использования