Python TextCat 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: nltk.classify.textcat

메소드/함수: TextCat

hotexamples.com의 예제 수: 7

Python TextCat - 예제 7개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, Python nltk.classify.textcat.TextCat의 평점이 가장 높은 실제 사용 예제입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: languageIdentification_program.py 프로젝트: FatimaZahrae814/LanguageIdentifier

def detect_language(list):
    """
    Print the language TextCat guesses for each sentence in ``list``.

    A single ``textcat.TextCat`` classifier is created and reused for every
    sentence. The function only prints; it returns nothing.

    Note: the parameter name shadows the builtin ``list``; it is kept as-is
    because it is part of the signature callers may rely on.
    """
    classifier = textcat.TextCat()
    report_template = "the sentence:\n\t '{0}' => is written in {1}"

    # Report the guessed language of each sentence, one at a time.
    for text in list:
        guessed = classifier.guess_language(text)
        print(report_template.format(text, guessed))

예제 #2

파일 보기

파일: languages.py 프로젝트: travelLynz/homophily_satisfaction

def get_textcat_languages(tbl, col):
    """Add a ``textcat_langs`` column with TextCat's language guess per row.

    Each value of ``tbl[col]`` is passed to ``TextCat.guess_language``. When
    the classifier raises for a value, the placeholder ``'unk'`` is stored
    instead, so the new column always has one entry per row.

    Args:
        tbl: Table supporting ``tbl[col]`` iteration and column assignment
            (presumably a pandas DataFrame -- confirm against callers).
        col: Name of the column holding the text to classify.

    Returns:
        The same ``tbl`` object, modified in place with the new
        ``textcat_langs`` column.
    """
    classifier = tc.TextCat()
    # NOTE(review): presumably silences pandas' chained-assignment warning
    # before the column is added below -- confirm this is still needed.
    tbl.is_copy = False
    langs = []
    for text in tbl[col]:
        try:
            langs.append(classifier.guess_language(text))
        except Exception:
            # Best-effort: any classification failure becomes 'unk'.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate so a long run can still be aborted.
            langs.append('unk')
    tbl['textcat_langs'] = langs
    return tbl

예제 #3

파일 보기

파일: graph.py 프로젝트: jindrvo1/ami-summarization

    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now"
]

# Module-level English Snowball stemmer, created once at import time.
stemmer = snowball.EnglishStemmer()

# Module-level TextCat instance, created once at import time.
text_classifier = tc.TextCat()


def create_graph(sentences, words):
    """Build a graph whose nodes are sentences.

    The ``raw_text`` of every item in ``sentences`` is added as a node, then
    ``add_edges`` connects sentences that are similar. The graph returned by
    ``add_edges`` is handed back to the caller.
    """
    sentence_graph = net.Graph()
    for sentence in sentences:
        sentence_graph.add_node(sentence.raw_text)
    return add_edges(sentence_graph, sentences, words)


# create an edge in case the similarity of two sentences is above certain threshold
def add_edges(graph, sentences, words):
    for s1 in sentences:
        for s2 in sentences:

예제 #4

파일 보기

# Language detection tools
import fasttext
from langdetect import detect_langs
from polyglot.detect import Detector
from langid.langid import LanguageIdentifier, model
from nltk.classify import textcat
from utility import set_iso_639

# Load the fastText model from lib/lid.176.bin (path is relative to the
# working directory the script is started from)
ft_model = fasttext.load_model('lib/lid.176.bin')

# Instantiate a langid language identifier object
langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Instantiate a textcat language classifier
tc_cls = textcat.TextCat()


def detect_language(text, guarani=False):
    '''
    return ISO 639-1
    '''
    threshold_confidence = 0.70  # changed because gn,grn,gug is tricky, old 0.75
    lang_detected = defaultdict(int)

    if not text:
        raise Exception('Error!, text is empty.')

    # infer language using fasttext
    try:
        pred_fasttext = ft_model.predict(text, k=1)

예제 #5

파일 보기

파일: serialize_languages.py 프로젝트: SpeciesFileGroup/cs_492_fall

from grpc_client import *
from data_formats import *
from nltk.classify import textcat
import json, re

# Script entry point: for every title served by the BHL index, collect its
# pages into a Journal and guess the journal's language from page text.
if __name__ == "__main__":
    # NOTE(review): f_out and journalCollections are not used in the visible
    # part of the script -- presumably consumed further down; confirm.
    f_out = "data_with_languages.json"
    language_classifier = textcat.TextCat()
    journalCollections = dict()

    # NOTE(review): hard-coded service address over an insecure channel.
    host = "172.22.247.23:8888"
    with grpc.insecure_channel(host) as channel:
        stub = protob_pb2_grpc.BHLIndexStub(channel)
        for title in Titles(stub):
            text = ""
            # Countdown: only the first 10 pages of a title contribute text
            # to the language sample (every page is still added below).
            text_index = 10
            journal = Journal(title)
            for page in Pages(stub, withText=True, titles=[title.id]):
                if text_index > 0:
                    # page.text is handled as bytes: strip CR/LF, then decode.
                    txt = page.text
                    txt = txt.replace(b'\r', b'')
                    txt = txt.replace(b'\n', b'')
                    # "ignore" silently drops every non-ASCII byte, so
                    # accented characters never reach the classifier.
                    txt = txt.decode("ascii", "ignore")
                    text += re.sub(
                        r'[^\w]', ' ',
                        txt)  # Replaces each non-word character (not a letter, digit or underscore) with a space
                    text += " "
                journal.add_page(page)
                text_index -= 1
            # Classify the language from the lower-cased text sample
            journal.lang = language_classifier.guess_language(text.lower())

예제 #6

파일 보기

파일: imdb.py 프로젝트: JauharulF/sentiment

    sql = """ SELECT id, {}_data FROM sentiment.{} ORDER BY id""".format(
        lang, table)
    cursor.execute(sql)
    rs = cursor.fetchall()
    for row in rs:
        # print('%s (%s) %s' % (row['sentence'], crubadan.iso_to_crubadan(tct.guess_language(row['sentence'])), lid.classify(row['sentence'])))
        sql = """ UPDATE sentiment.{} SET {}_textcat=%s, {}_langid=%s WHERE id=%s """.format(
            table, lang, lang)
        cursor.execute(
            sql, (crubadan.iso_to_crubadan(
                tct.guess_language(row['{}_data'.format(lang)])),
                  lid.classify(row['{}_data'.format(lang)])[0], row['id']))
        conn.commit()
        print('.', end='', flush=True)


# Module-level classifiers: an NLTK TextCat instance and a langid identifier.
tct = textcat.TextCat()
lid = LanguageIdentifier.from_modelstring(model, norm_probs=True)
# lid.set_languages(['en', 'ms'])

# NOTE(review): database credentials are hard-coded in source; consider
# loading them from configuration or the environment instead.
# DictCursor is requested as the connection's cursor class.
dbcon = MySQLdb.connect(host='localhost',
                        user='******',
                        passwd='123456',
                        db='sentiment',
                        cursorclass=MySQLdb.cursors.DictCursor)
# cursor = dbcon.cursor()
# process_lid(dbcon, 'imdb_train', 'en', tct, lid)
# process_lid(dbcon, 'imdb_train', 'ms', tct, lid)
# process_lid(dbcon, 'imdb_test', 'en', tct, lid)
# process_lid(dbcon, 'imdb_test', 'ms', tct, lid)

예제 #7

파일 보기

파일: script.py 프로젝트: tusharma78/LD-AS

    'ko': 'Korean',
    'ar': 'Arabic',
    'zh': 'Chinese (Simplified)',
    'cnr': 'Montenegrin [2]',
    'zh-TW': 'Chinese (Traditional)',
    'ne': 'Nepali',
    'gu': 'Gujarati',
    'ta': 'Tamil',
    'he': 'Hebrew',
    'te': 'Telugu',
    'en': 'English'
}

# Read the text to classify from standard input.
Text = input(str("Enter the Text: "))

classifier = textcat.TextCat()

# NOTE(review): `distances` is not used in the visible part of the script.
distances = classifier.lang_dists(Text)
# #print(input_text)
ans = classifier.guess_language(Text)

# Goslate Language Detector

# gs = goslate.Goslate()
# lan_id = gs.detect(Text)

# lang_identifier is defined outside this excerpt.
language = lang_identifier(Text)

txt = '(ISO639-3) Code: ' + language
# Look up each whitespace-separated token of `txt` in `line`; tokens with no
# entry are kept unchanged.
# NOTE(review): `line` is presumably the code -> language-name dict that ends
# just above -- confirm.
res = " ".join(line.get(ele, ele) for ele in txt.split())
# print("Detected Language: ",gs.get_languages()[lan_id], res)