# Paths to the local checkouts of the Indic NLP library and its resources
INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"

import sys

# Make the local Indic NLP library importable before importing from it
sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))

from collections import defaultdict

from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer

import indicnlp
from indicnlp import common
from indicnlp import loader
from indicnlp import transliterate
from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

# Point the library at its resource files and load them
common.set_resources_path(INDIC_NLP_RESOURCES)
loader.load()


def postprocess(infname, outfname, input_size, lang, common_lang="hi", transliterate=False):
    ...  # body elided in the source
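# The body of postprocess() is elided above. A minimal, hedged sketch of what a
# postprocessing step with these imports typically looks like follows: detokenize
# model output (Moses for English, Indic NLP otherwise) and, if requested,
# transliterate from the common script back to the target language's script.
# The function name postprocess_sketch and the overall flow are illustrative
# assumptions, not the original implementation.
def postprocess_sketch(infname, outfname, lang, common_lang="hi", transliterate=False):
    en_detok = MosesDetokenizer(lang="en")
    with open(infname, "r", encoding="utf-8") as infile, \
         open(outfname, "w", encoding="utf-8") as outfile:
        for line in infile:
            tokens = line.strip().split(" ")
            if lang == "en":
                outline = en_detok.detokenize(tokens)
            else:
                detok = indic_detokenize.trivial_detokenize(" ".join(tokens), common_lang)
                if transliterate:
                    # map from the common script (e.g. Devanagari) back to the target script
                    detok = unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                        detok, common_lang, lang)
                outline = detok
            outfile.write(outline + "\n")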
input_array_ann1 = input_df[['ann1']].values.tolist()
input_array_ann2 = input_df[['ann2']].values.tolist()
print("nominal metric: %.3f" % krippendorff_alpha(
    [sum(input_array_ann1, []), sum(input_array_ann2, [])],
    nominal_metric,
    missing_items=missing,
    convert_items=str))
# print("interval metric: %.3f" % krippendorff_alpha(
#     input_array, interval_metric, missing_items=missing, convert_items=str))


if __name__ == '__main__':
    parser = SafeConfigParser()
    config_file = sys.argv[1]
    parser.read(config_file)
    common.set_resources_path(parser.get('indic_config', 'indic_resource_path'))
    NER_executor = Executor(config_file)
    NER_executor.findPOSTags()
    NER_executor.findMorphenes()
    NER_executor.find_suffix_features()
    NER_executor.mergeModuleOutputs(
        parser.get('pos_tagger', 'pos_tagger_output'),
        parser.get('morphessor', 'morpheme_output_file'),
        parser.get('ner_tag_data', 'ner_word_tags'),
        parser.get('suffix_files', 'suffix_output_file'),
        parser.get('crf_learner', 'crf_input_file'))
    # NER_executor.trainNER()
    # print(NER_executor.calculateF1Score("./final_crf_output_1"))
    # NER_executor.calculateKripendorffCoeeficient("./interannotation")
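# krippendorff_alpha() and nominal_metric are defined elsewhere (presumably an
# external implementation). For reference, a self-contained, hedged sketch of
# nominal Krippendorff's alpha (alpha = 1 - D_o / D_e) for aligned annotator
# label lists is given below; it is an illustrative re-derivation, not the
# function called above.
from collections import Counter

def krippendorff_alpha_nominal(annotations, missing=None):
    """annotations: list of per-annotator label lists, aligned by item."""
    # keep only pairable values per item (at least two non-missing labels)
    units = []
    for values in zip(*annotations):
        kept = [v for v in values if v != missing]
        if len(kept) >= 2:
            units.append(kept)
    n = sum(len(u) for u in units)
    if n == 0:
        return 1.0
    # observed disagreement: mismatched ordered pairs within each unit,
    # weighted by 1/(m-1) as in the coincidence-matrix formulation
    d_o = 0.0
    for u in units:
        m = len(u)
        mismatches = sum(1 for i in range(m) for j in range(m)
                         if i != j and u[i] != u[j])
        d_o += mismatches / (m - 1)
    d_o /= n
    # expected disagreement from the pooled label distribution
    totals = Counter(v for u in units for v in u)
    d_e = sum(totals[c] * totals[k]
              for c in totals for k in totals if c != k) / (n * (n - 1))
    return 1.0 if d_e == 0 else 1.0 - d_o / d_e

# example: two annotators, one disagreement out of three items
# krippendorff_alpha_nominal([['a', 'b', 'a'], ['a', 'b', 'b']])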
# -*- coding: utf-8 -*-

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME = "/Users/Avijit/Documents/nlp_lib"
# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES = "/Users/Avijit/Documents/nlp_res"

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()

from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

# U+0958 (DEVANAGARI LETTER QA) should normalize to the canonical decomposed
# form U+0915 U+093C (KA + nukta), so the output grows by one code point
input_text = u"\u0958 \u0915\u093c"
remove_nuktas = False
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi", remove_nuktas=remove_nuktas)
output_text = normalizer.normalize(input_text)

print(output_text)
print('Length before normalization: {}'.format(len(input_text)))
print('Length after normalization: {}'.format(len(output_text)))
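# To make the effect of normalization visible, the code points of the result
# can be inspected with the standard-library unicodedata module (an
# illustrative addition, not part of the original example):
import unicodedata

for ch in output_text:
    print('U+%04X %s' % (ord(ch), unicodedata.name(ch, '<unnamed>')))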
def get_split_algo(lang: str, split_algo: str) -> tp.Callable[[str], tp.Iterable[str]]:
    # resolve the default algorithm if requested
    if split_algo == "default":
        # pick the best algorithm for the language
        if lang in LANGS_MOSES:
            split_algo = "moses"
        elif lang in LANGS_INDIC:
            split_algo = "indic"
        elif lang in LANGS_GEEZ:
            split_algo = "geez"
        elif lang in LANGS_KHMER:
            split_algo = "khmer"
        elif lang in LANGS_BURMESE:
            split_algo = "burmese"
        else:
            # use Moses by default (which will likely fall back to English rules)
            split_algo = "moses"
        logger.info(f" - default algorithm for {lang} is {split_algo}")

    if split_algo == "none" or lang == "TODO":
        logger.info(" - no sentence splitting")
        return lambda line: [line]

    elif split_algo == "moses":
        if lang in LANGS_MOSES:
            lang = LANGS_MOSES[lang]
            logger.info(f" - Moses sentence splitter: using rules for '{lang}'")
        else:
            logger.info(f" - Moses sentence splitter for {lang}: falling back to English rules")
            lang = "en"
        splitter = SentenceSplitter(language=lang)
        # non_breaking_prefix_file=non_breaking_prefix_file
        return splitter.split

    elif split_algo == "indic":
        # initialize toolkit (apparently not needed for sentence segmentation)
        if INDIC_NLP_RESOURCES:
            logger.info(" - Initialize Indic NLP toolkit")
            indic_common.set_resources_path(INDIC_NLP_RESOURCES)
            indic_loader.load()
        if lang in LANGS_INDIC:
            lang = LANGS_INDIC[lang]
            logger.info(f" - Indic sentence splitter: using rules for '{lang}'")
        else:
            logger.info(f" - Indic sentence splitter for {lang}: falling back to Hindi rules")
            lang = "hi"
        # set up the normalizer
        factory = IndicNormalizerFactory()
        indic_normalizer = factory.get_normalizer(lang)

        def split_indic(line: str) -> tp.Iterable[str]:
            """Split Indic text into sentences using the Indic NLP library."""
            line = indic_normalizer.normalize(line)
            for sent in indic_sent_tok.sentence_split(line, lang=lang):
                yield sent

        return split_indic

    elif split_algo == "laonlp":
        logger.info(f" - LaoNLP sentence splitter applied to '{lang}'")
        return lao_sent_tok
    elif split_algo == "khmer":
        logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'")
        return khm_sent_tok
    elif split_algo == "bodnlp":
        logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'")
        return bod_sent_tok
    elif split_algo == "geez":
        logger.info(f" - Ge'ez rule-based sentence splitter applied to '{lang}'")
        return split_geez
    elif split_algo == "burmese":
        logger.info(f" - Burmese rule-based sentence splitter applied to '{lang}'")
        return split_burmese
    else:
        logger.error(f"Unknown splitting algorithm {split_algo}")
        return None
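# A hedged usage sketch: get_split_algo() returns a callable that maps one line
# of text to an iterable of sentences. The LANGS_* tables, logger, and splitter
# imports are defined elsewhere in this module, so the assumption below is that
# the two-letter code "hi" is present in LANGS_INDIC; the sample text is
# illustrative only.
split_fn = get_split_algo("hi", split_algo="default")
for sentence in split_fn("पहला वाक्य। दूसरा वाक्य।"):
    print(sentence)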
def __init__(self, lang='en'):
    self.lang = lang
    self.stopwords = None
    self.stemmer = None
    self.sentiment_analyzer = None
    self.text_processor = None
    INDIC_NLP_RESOURCES = r"../model/indic_nlp_resources/"
    common.set_resources_path(INDIC_NLP_RESOURCES)
    self.pos_tagger = None

    if lang == 'hi':
        self.ht = HindiTokenizer.Tokenizer()
        self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
        self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = None
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
        )
        loader.load()
        # train an NLTK TnT POS tagger on the Hindi section of the Indian corpus
        train_data = indian.tagged_sents('hindi.pos')
        self.tnt_pos_tagger = tnt.TnT()
        self.tnt_pos_tagger.train(train_data)

    if lang == 'en':
        self.sentiment_analyzer = VS()
        self.stopwords = nltk.corpus.stopwords.words("english")
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = PorterStemmer()
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",
            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer should take a string as input and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries for replacing tokens extracted from the text
            # with other expressions; more than one dictionary can be passed
            dicts=[emoticons, slang],
        )
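# A hedged usage sketch for the constructor above. The enclosing class name is
# not shown in the source, so it is assumed here to be called Preprocessor for
# illustration; the sample inputs are likewise illustrative.
proc = Preprocessor(lang='hi')
# ekphrasis pre-processing of a code-mixed tweet
tokens = proc.text_processor.pre_process_doc("ये मेरा पहला tweet है! #first")
# NLTK TnT tagger expects a list of tokens
tags = proc.tnt_pos_tagger.tag(['यह', 'अच्छा', 'है'])
print(tokens)
print(tags)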