Example #1
def main():
    results = []
    with open('../d2_data/all_sents_spellchecked.txt', 'r') as f:
        sents = f.read().split('\n')

    caller = pipeline_caller.PipelineCaller()
    tool_name = "morphanalyzer"
    api_token = "sQj6zxcVt7JzWXHNTdRu3QRzc6i8KZz7"

    start_i = 346
    i = 0
    for s in sents[start_i:]:
        try:
            print(s)
            curr_result = []

            # Morphologically analyze the sentence word by word
            for w in s.split():
                r = caller.call(tool_name, w, api_token)
                r = ' '.join(r.split('\n'))
                curr_result.append(r)

            curr_result = '\n'.join(curr_result)
            results.append(
                '<S> <S>+BSTag\n{0}\n</S> </S>+ESTag'.format(curr_result))
            i += 1
            print(curr_result)
        except ConnectionResetError:
            pass  # skip the sentence if the connection drops

        # Checkpoint every 1000 sentences and after the last one
        if i > 0 and (i % 1000 == 0 or i == len(sents[start_i:])):
            with open('parsed_sents_{0}.txt'.format(i), 'w') as f:
                f.write('\n'.join(results))
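A dropped connection above simply skips the sentence. A minimal retry sketch under the same caller.call signature; call_with_retry is a hypothetical helper, and the retry count and backoff are arbitrary:

import time

def call_with_retry(caller, tool_name, word, api_token, retries=3):
    # Try the pipeline call a few times before giving up
    for attempt in range(retries):
        try:
            return caller.call(tool_name, word, api_token)
        except ConnectionResetError:
            time.sleep(2 ** attempt)  # simple exponential backoff
    return ''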
Example #2
def main():
    caller = pipeline_caller.PipelineCaller()
    tool_name = "spellcheck"
    api_token = "sQj6zxcVt7JzWXHNTdRu3QRzc6i8KZz7"

    with open('../d2_data/all_verbs.txt', 'r') as f:
        text = f.read()
    result = caller.call(tool_name, text, api_token)

    with open('../d2_data/all_verbs_spellchecked.txt', 'w') as f:
        f.write(result)
Example #3
    def module_pipelineNoisy_whole_test(self):
        try:
            caller = pipeline_caller.PipelineCaller(
                'pipelineNoisy', KATANA, os.environ['pipeline_token'], 'whole')

            # Each analysis line starts with a token index followed by
            # at least seven tab-separated fields
            r = re.compile(r'(\d+)(\t.+?){7,}', re.MULTILINE)

            response = caller.call()

            print(response)
            assert len(re.findall(r, response)) == 33
        except Exception:
            self.fail('Exception thrown')
Example #4
    def module_Vowelizer_word_test(self):
        try:
            caller = pipeline_caller.PipelineCaller(
                'Vowelizer', KELIME, os.environ['pipeline_token'], 'word')

            # One output line per processed word
            r = re.compile(r'(.+?\n)', re.MULTILINE)

            response = caller.call()

            print(response)
            assert len(re.findall(r, response)) == 4
        except Exception:
            self.fail('Exception thrown')
Example #5
    def module_pipelineNoisy_sentence_test(self):
        try:
            caller = pipeline_caller.PipelineCaller(
                'pipelineNoisy', UCDORT, os.environ['pipeline_token'],
                'sentence')

            # Analysis lines for token indices 1 and 5, each with at
            # least seven tab-separated fields
            r1 = re.compile(r'(1)(\t.+?){7,}', re.MULTILINE)
            r2 = re.compile(r'(5)(\t.+?){7,}', re.MULTILINE)

            response = caller.call()

            print(response)
            assert (len(re.findall(r1, response)) == 2
                    and len(re.findall(r2, response)) == 1)
        except Exception:
            self.fail('Exception thrown')
Example #6
def main(argv):
    table_name = ''
    token = 'LQiWv0FTmQEJRVbun8Rqld6WZCIrGUyO'
    tool = 'normalize'

    try:
        opts, args = getopt.getopt(argv, "ut:", ["table="])
    except getopt.GetoptError:
        print('normalization.py -t <table_name>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-u':
            print('normalization.py -t <table_name>')
            sys.exit()
        elif opt in ("-t", "--table"):
            table_name = arg

    db = PathyDB()
    # Transliterate Turkish letters to ASCII, then lowercase the table name
    tr_map = str.maketrans('çÇğĞıİöÖşŞüÜ', 'ccggiioossuu')
    table_name = table_name.translate(tr_map).lower()
    tweets = db.get_all_not_normalized_tweets(table_name)

    # Normalize each tweet through the pipeline and write the result back
    for tweet in tweets:
        caller = pipeline_caller.PipelineCaller(
            tool, unidecode(tweet['tweet_text']), token)
        normalized_text = caller.call()
        print(normalized_text)
        tweet['normalized_text'] = normalized_text
        db.update_tweet(table_name, tweet)
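For reference, the transliteration in isolation; a minimal sketch whose mapping covers the same Turkish letters as the function above:

# Map Turkish letters to their ASCII counterparts, then lowercase
tr_map = str.maketrans('çÇğĞıİöÖşŞüÜ', 'ccggiioossuu')
print('Şırnak'.translate(tr_map).lower())  # prints: sirnak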
Example #7
def main():
    caller = pipeline_caller.PipelineCaller()
    tool_name = "spellcheck"
    api_token = "sQj6zxcVt7JzWXHNTdRu3QRzc6i8KZz7"
    result = ''

    sents = []
    indices = []
    with open('../d2_data/query_results_all_joined_sents.csv') as data:
        for r in csv.reader(data):
            sents.append(r[0])
            indices.append(r[2])

    with open('../d2_data/target_indices.txt', 'w') as f:
        f.write('\n'.join(indices))

    # Spellcheck the sentences in batches of 10,000 (8 batches total)
    for i in range(0, 8):
        text = '\n'.join(sents[i * 10000:(i + 1) * 10000])
        result += caller.call(tool_name, text, api_token)

    with open('../d2_data/all_sents_spellchecked.txt', 'w') as f:
        f.write(result)
Example #8
"""
Send list of word windows to ITU for spellchecking
Heikal Badrulhisham <*****@*****.**>, 2019
"""
import pipeline_caller
import csv
from collections import defaultdict
import pickle
import os

# For calling ITU pipeline
caller = pipeline_caller.PipelineCaller()
tool_name = "spellcheck"
api_token = "sQj6zxcVt7JzWXHNTdRu3QRzc6i8KZz7"

# Dictionary of past spellcheck results
if os.path.isfile('spellcheck_history.pkl'):
    with open('spellcheck_history.pkl', 'rb') as f:
        spellcheck_history = pickle.load(f)
else:
    spellcheck_history = defaultdict(str)


def spellcheck(word):
    if word in spellcheck_history:
        return spellcheck_history[word]
    else:
        sc = caller.call(tool_name, word, api_token).replace('\r\n', '')
        spellcheck_history[word] = sc
        return sc
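
The module loads spellcheck_history from disk, but this excerpt never writes it back. A minimal sketch for persisting the cache at the end of a run, assuming the same file name; save_history is a hypothetical helper:

def save_history(history, path='spellcheck_history.pkl'):
    # Write the spellcheck cache back so later runs can reuse it
    with open(path, 'wb') as f:
        pickle.dump(history, f)

save_history(spellcheck_history)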

Example #9
def nlpPipeline(text):
    # Tool name, text, and API access token are given to the constructor,
    # so call() takes no arguments (the token is left empty here)
    caller = pipeline_caller.PipelineCaller('normalize', text, '')
    result = caller.call()
    return result
Example #10
    def module_exception_test(self):
        try:
            caller = pipeline_caller.PipelineCaller()
            caller.call("pipelineNoisy", "test sentence", "random token")
        except Exception:
            self.fail("Exception thrown")
Example #11
def nlpPipeline(text):
    text = text.encode('utf8')  # send the text as UTF-8 bytes
    caller = pipeline_caller.PipelineCaller(
        'normalize', text, 'MKHVuqqLiARKHNFq7eEOuOJr54Mncxir', 'whole')
    result = caller.call()
    return result
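
Taken together, the examples show two ways of driving PipelineCaller: passing the tool name, text, and API token to call() (Examples #1, #2, #7, #8, #10), or passing them to the constructor along with a processing type and invoking call() with no arguments (Examples #6, #9, #11). A minimal side-by-side sketch; 'YOUR_API_TOKEN' is a placeholder:

import pipeline_caller

# Style 1: parameters go to call()
caller = pipeline_caller.PipelineCaller()
result = caller.call('normalize', 'some text', 'YOUR_API_TOKEN')

# Style 2: parameters go to the constructor; call() takes none
caller = pipeline_caller.PipelineCaller(
    'normalize', 'some text', 'YOUR_API_TOKEN', 'whole')
result = caller.call()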