Example #1
    # vowel errors
    n_vow_err = vds_err_df['count'].sum() + vi_err_df['count'].sum()
    # consonant errors
    n_cons_err = cds_err_df['count'].sum() + ci_err_df['count'].sum()

    # total errors: alignments where the reference and output characters differ
    n_tot_err = a_df[a_df.ref_char != a_df.out_char]['count'].sum()

    # other errors
    n_oth_err = n_tot_err - (n_vow_err + n_cons_err)

    return (n_vow_err, n_cons_err, n_oth_err, n_tot_err)
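
# Small illustration (assumption, not part of the original script): converting
# the raw counts returned above into fractions of all character errors.
def _error_fractions(n_vow_err, n_cons_err, n_oth_err, n_tot_err):
    if n_tot_err == 0:
        return (0.0, 0.0, 0.0)
    return (float(n_vow_err) / n_tot_err,
            float(n_cons_err) / n_tot_err,
            float(n_oth_err) / n_tot_err)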


if __name__ == '__main__':

    from indicnlp import loader
    loader.load()

    #reffname=sys.argv[1]
    #outfname=sys.argv[2]
    #tgtlang=sys.argv[3]
    #outdir=sys.argv[4]

    #if not os.path.exists(outdir):
    #    print outdir
    #    os.mkdir(outdir)

    #save_analysis_artifacts(reffname, outfname, tgtlang, outdir)

    #a_df=read_align_count_file('/home/development/anoop/experiments/multilingual_unsup_xlit/results/sup/news_2015_official/2_multilingual/onehot_shared/multi-conf/outputs/022_analysis_en-bn/alignment_count.csv')
    #print char_error_rate(a_df)
    #print vowel_error_rate(a_df,'bn')
Example #2

def create_moses_factored_run_params(conf_template_fname, conf_fname, workspace_dir,
                                     parallel_corpus, lm_file, factored_lm_dir,
                                     src_lang, tgt_lang):
    # fill the factored Moses run-configuration template and write it to conf_fname
    with codecs.open(conf_fname, 'w', 'utf-8') as conf_file:
        conf_template = ''.join(read_lines(conf_template_fname))
        conf = conf_template.format(
            workspace_dir=workspace_dir, parallel_corpus=parallel_corpus,
            lm_file=lm_file, factored_lm_dir=factored_lm_dir,
            src_lang=src_lang, tgt_lang=tgt_lang)
        conf_file.write(conf)

def create_moses_ini_params(ini_template_fname, ini_fname, numfeatures,
                            phrasetable, lmfname, lmorder):
    # fill the moses.ini template, initializing all feature weights to 0.2
    initfeatvalues = ' '.join(['0.2'] * numfeatures)
    with codecs.open(ini_fname, 'w', 'utf-8') as ini_file:
        ini_template = ''.join(read_lines(ini_template_fname))
        ini = ini_template.format(
            numfeatures=numfeatures, phrasetable=phrasetable,
            lmfname=lmfname, lmorder=lmorder, initfeatvalues=initfeatvalues)
        ini_file.write(ini)
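
# Standalone illustration (assumption, not part of the original script) of the
# templating mechanism used above: the template file holds named placeholders
# such as {lmorder} and {initfeatvalues}, which str.format fills in. The
# template text below is made up for demonstration.
def _demo_fill_ini_template():
    ini_template = ('[weight]\n'
                    'TranslationModel0= {initfeatvalues}\n'
                    'LM0 order={lmorder} path={lmfname}\n')
    return ini_template.format(initfeatvalues=' '.join(['0.2'] * 4),
                               lmorder=5,
                               lmfname='lm.bin')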

if __name__=='__main__': 
    ### INDIC_NLP_RESOURCES environment variable must be set
    loader.load()

    command = sys.argv[1]
    if command == 'create_synthetic_corpus_split':
        create_synthetic_corpus_split(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
                                      sys.argv[6], n_xlit=int(sys.argv[7]), n_tun=int(sys.argv[8]))
    elif command == 'create_synthetic_corpus_concatenated':
        create_synthetic_corpus_concatenated(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
                                             sys.argv[6], n_xlit=int(sys.argv[7]), n_tun=int(sys.argv[8]))
    elif command == 'create_moses_run_params':
        create_moses_run_params(*sys.argv[2:])
    elif command == 'create_moses_factored_run_params':
        create_moses_factored_run_params(*sys.argv[2:])
    elif command == 'create_moses_ini_params':
        create_moses_ini_params(sys.argv[2], sys.argv[3], int(sys.argv[4]),
                                sys.argv[5], sys.argv[6], int(sys.argv[7]))
    else:
        print("Unknown command")
Example #3
def get_split_algo(lang: str,
                   split_algo: str) -> tp.Callable[[str], tp.Iterable[str]]:
    # get default algorithm if requested
    if split_algo == "default":
        # use best algorithm in function of language
        if lang in LANGS_MOSES:
            split_algo = "moses"
        elif lang in LANGS_INDIC:
            split_algo = "indic"
        elif lang in LANGS_GEEZ:
            split_algo = "geez"
        elif lang in LANGS_KHMER:
            split_algo = "khmer"
        elif lang in LANGS_BURMESE:
            split_algo = "burmese"
        else:
            # use Moses by default (which likely will fall-back to English)
            split_algo = "moses"
        logger.info(f" - default algorithm for {lang} is {split_algo}")

    if split_algo == "none" or lang == "TODO":
        logger.info(" - no sentence splitting")
        return lambda line: [line]

    elif split_algo == "moses":
        if lang in LANGS_MOSES:
            lang = LANGS_MOSES[lang]
            logger.info(
                f" - Moses sentence splitter: using rules for '{lang}'")
        else:
            logger.info(
                f" - Moses sentence splitter for '{lang}': falling back to English rules"
            )
            lang = "en"
        splitter = SentenceSplitter(language=lang)
        # non_breaking_prefix_file=non_breaking_prefix_file
        return splitter.split

    elif split_algo == "indic":
        # initialize toolkit (apparently not needed for sentence segmentation)
        if INDIC_NLP_RESOURCES:
            logger.info(" - Initialize Indic NLP toolkit")
            indic_common.set_resources_path(INDIC_NLP_RESOURCES)
            indic_loader.load()
        if lang in LANGS_INDIC:
            lang = LANGS_INDIC[lang]
            logger.info(
                f" - Indic sentence splitter: using rules for '{lang}'")
        else:
            logger.info(
                f" - Indic sentence splitter for '{lang}': falling back to Hindi rules"
            )
            lang = "hi"

        # setup normalizer
        factory = IndicNormalizerFactory()
        indic_normalizer = factory.get_normalizer(lang)

        def split_indic(line: str) -> tp.Iterable[str]:
            """Split Indian text into sentences using Indic NLP tool."""
            line = indic_normalizer.normalize(line)
            for sent in indic_sent_tok.sentence_split(line, lang=lang):
                yield sent

        return split_indic

    elif split_algo == "laonlp":
        logger.info(f" - LaoNLP sentence splitter applied to '{lang}'")
        return lao_sent_tok

    elif split_algo == "khmer":
        logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'")
        return khm_sent_tok

    elif split_algo == "bodnlp":
        logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'")
        return bod_sent_tok

    elif split_algo == "geez":
        logger.info(
            f" - Ge'ez rule-based sentence splitter applied to '{lang}'")
        return split_geez

    elif split_algo == "burmese":
        logger.info(
            f" - Burmese rule-based sentence splitter applied to '{lang}'")
        return split_burmese

    else:
        logger.error(f"Unknown splitting algorithm {split_algo}")

    return None
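
# Usage sketch (assumption: run in this module, with the language tables and any
# required resources configured). get_split_algo returns a callable that maps one
# line of text to an iterable of sentences; materialize it with list().
def demo_split(paragraph: str, lang: str = "hi") -> tp.List[str]:
    splitter = get_split_algo(lang, split_algo="default")
    return list(splitter(paragraph))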
Example #4
def main(_):
    #### Load Indic NLP Library ###
    ## Note: Environment variable: INDIC_RESOURCES_PATH must be set
    loader.load()

    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    print('===========  PARAMETERS  ==============')
    print('Data Path: ' + FLAGS.data_path)
    print('Representation: ' + FLAGS.representation)
    print('Language: ' + str(FLAGS.lang))
    print('Corpus Size: ' + str(FLAGS.train_size))
    print('Config: ' + FLAGS.model)
    print('===========  PARAMETERS  ==============')

    raw_data = reader.ptb_raw_data(FLAGS.data_path, FLAGS.lang,
                                   FLAGS.train_size)
    train_data, valid_data, test_data, actual_vocab_size = raw_data
    print('Actual Vocab Size: ' + str(actual_vocab_size))

    ### set parameters
    config = get_config()
    config.vocab_size = actual_vocab_size
    config.lang = FLAGS.lang
    config.representation = FLAGS.representation

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    eval_config.vocab_size = actual_vocab_size
    eval_config.lang = FLAGS.lang
    eval_config.representation = FLAGS.representation

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config)
            mtest = PTBModel(is_training=False, config=eval_config)

        tf.global_variables_initializer().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay**max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session,
                                         m,
                                         train_data,
                                         m.train_op,
                                         verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" %
                  (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, mvalid, valid_data,
                                         tf.no_op())
            print("Epoch: %d Valid Perplexity: %.3f" %
                  (i + 1, valid_perplexity))

        test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
        print("Test Perplexity: %.3f" % test_perplexity)

Example #5

    def __init__(self, lang='en'):
        self.lang = lang
        self.stopwords = None
        self.stemmer = None
        self.sentiment_analyzer = None
        self.text_processor = None        
        INDIC_NLP_RESOURCES=r"../model/indic_nlp_resources/"        
        common.set_resources_path(INDIC_NLP_RESOURCES)
        self.pos_tagger = None



        if lang == 'hi':
            self.ht = HindiTokenizer.Tokenizer()
            self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
            self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]	
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = None
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                    'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens
            )
            loader.load()
            train_data = indian.tagged_sents('hindi.pos')
            self.tnt_pos_tagger = tnt.TnT()
            self.tnt_pos_tagger.train(train_data)

        if lang == 'en':
            self.sentiment_analyzer = VS()
            self.stopwords = nltk.corpus.stopwords.words("english")
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = PorterStemmer()
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                    'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens

                # corpus from which the word statistics are going to be used 
                # for word segmentation 
                segmenter="twitter", 

                # corpus from which the word statistics are going to be used 
                # for spell correction
                corrector="twitter", 

                unpack_hashtags=True,  # perform word segmentation on hashtags
                unpack_contractions=True,  # Unpack contractions (can't -> can not)
                spell_correct_elong=False,  # spell correction for elongated words

                # select a tokenizer. You can use SocialTokenizer, or pass your own
                # the tokenizer, should take as input a string and return a list of tokens
                tokenizer=SocialTokenizer(lowercase=True).tokenize,

                # list of dictionaries, for replacing tokens extracted from the text,
                # with other expressions. You can pass more than one dictionaries.
                dicts=[emoticons,slang]
            )
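
# Usage sketch (assumption, not part of the original class): given an instance of
# the preprocessing class whose constructor appears above, the ekphrasis
# TextPreProcessor tokenizes a raw post via pre_process_doc. The sample text is
# illustrative only.
def demo_preprocess(prep, text="CANT WAIT for the new season!!! #tv_series :D"):
    return prep.text_processor.pre_process_doc(text)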