Example No. 1
  def __init__(self, datapath, tree):
    # decision tree obtained from the learner
    self.DT = tree
    
    if datapath is not None:
      test_file = datapath
    elif len(sys.argv) <= 1:
      test_file = "../data/origin/test.csv"
    else:
      test_file = sys.argv[1]

    f = open(test_file, "r")
    lines = f.readlines()

    f2 = open("btest_output.csv", "w")

    # get the list of attributes
    readInput  = Datareader(lines)
    attrNames  = readInput.attrNames
    records    = readInput.records
    resultAttr = readInput.resultAttr
    typeList   = readInput.typeList

    helper.process(records, attrNames[:len(attrNames)-1], typeList) 
    self.predict(records, tree, resultAttr)
    for record in records:
      for attr in attrNames:
        val = record[attr]
        if attr == resultAttr:
          val = int(val) 
        f2.write(str(val) + ',')
      f2.write('\n')
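The output loop above builds each CSV row by hand and leaves a trailing comma on every line. For comparison, here is a minimal self-contained sketch of the same idea using the standard csv module; the attribute names, result attribute, and records below are invented for illustration and are not part of the example's data.

import csv

attrNames = ["height", "weight", "winner"]   # hypothetical attribute names
resultAttr = "winner"                        # hypothetical result attribute
records = [
    {"height": 1.8, "weight": 80.0, "winner": 1.0},
    {"height": 1.6, "weight": 55.0, "winner": 0.0},
]

with open("btest_output.csv", "w", newline="") as f2:
    writer = csv.DictWriter(f2, fieldnames=attrNames)
    writer.writeheader()
    for record in records:
        row = dict(record)
        row[resultAttr] = int(row[resultAttr])  # write the predicted label as an integer
        writer.writerow(row)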
Example No. 2
    def __init__(self, datapath, tree):
        # decision tree obtained from the learner
        self.DT = tree

        if datapath != "None":
            validate_file = datapath
        elif len(sys.argv) <= 1:
            validate_file = "bvalidate.csv"
        else:
            validate_file = sys.argv[1]

        f = open(validate_file, "r")
        lines = f.readlines()

        # get the list of attributes
        readInput = Datareader(lines)
        attrNames = readInput.attrNames
        records = readInput.records
        resultAttr = readInput.resultAttr
        typeList = readInput.typeList

        helper.process(records, attrNames, typeList)
        print
        print "---> Using the validation set, the prediction accuracy is : "
        print(self.validate(records, self.DT, resultAttr))
        print
Example No. 3
  def __init__(self, datapath, tree):
    # decision tree obtained from the learner
    self.DT = tree

    if datapath != "None":
      validate_file = datapath
    elif len(sys.argv) <= 1:
      validate_file = "bvalidate.csv"
    else:
      validate_file = sys.argv[1]


    f = open(validate_file, "r")
    lines = f.readlines()

    # get the list of attributes
    readInput  = Datareader(lines)
    attrNames  = readInput.attrNames
    records    = readInput.records
    resultAttr = readInput.resultAttr
    typeList   = readInput.typeList

    helper.process(records, attrNames, typeList) 
    print 
    print "---> Using the validation set, the prediction accuracy is : "
    print(self.validate(records, self.DT, resultAttr))
    print 
Example No. 4
    def __init__(self, datapath):

        if datapath != "None":
            train_file = datapath

        elif len(sys.argv) <= 1:
            train_file = "btrain.csv"
            #train_file = "../data/test.csv"
        else:
            train_file = sys.argv[1]

        f = open(train_file, "r")
        lines = f.readlines()
        lines = lines[:16000]

        # get the list of attributes
        readInput = Datareader(lines)
        attrNames = readInput.attrNames
        records = readInput.records
        resultAttr = readInput.resultAttr
        typeList = readInput.typeList
        print typeList

        helper.process(records, attrNames, typeList)

        print "read " + str(len(attrNames)) + " attributes"
        print "read " + str(len(records)) + " records"
        print

        self.tree = self.DTL(records, attrNames, resultAttr)

        print "-----> Printing DNF of the decision tree ..."
        helper.printDNF(self.tree)
        print
Example No. 5
 def process(self):
     # process has been changed to fetch the news: the process function in
     # helper.py is applied to each item, turning it into a format that
     # bikeso can accept
     for url in self.list:
         resultPage = fetch(url)
         result = loads(resultPage)[self.content]
         yield (result['title'], helper.process(result[self.desc]))
Example No. 6
def main():
    directory = '/Users/jtim/Dropbox/Academic/research/dissertation/research/output/word-length'
    if not os.path.exists(directory):
        os.makedirs(directory)
    by_author = {}
    for author, files in corpora.items():
        by_author[author] = read_files_into_string(files)

    # Transform the authors' corpora into lists of word tokens
    by_author_tokens = {}
    by_author_length_distributions = {}
    for author in by_author:
        tokens = by_author[author].split()

        # Filter out punctuation
        by_author_tokens[author] = ([process(token) for token in tokens
                                                if any(c.isalpha() for c in token)])

        # Get a distribution of token lengths
        token_lengths = [len(token) for token in by_author_tokens[author]]
        plt.ion()
        by_author_length_distributions[author] = nltk.FreqDist(token_lengths)
        by_author_length_distributions[author].plot(9,title=author,color='grey')
        plt.savefig('{}/word-length-{}.png'.format(directory, author))
        plt.ioff()
        plt.close("all")
Example No. 7
def main():
    pure_persian_works = []
    arabic_counter = Counter()
    arabic_vocabulary = set()

    for name in file_names:
        if 'ar.txt' in name:
            with open("{}{}".format(root_dir, name), 'r') as f:
                #### Error, use Counter instead of set? ####
                words = set(process(f.read()).split())
                arabic_counter.update(words)
                arabic_vocabulary.update(words)

    # remove infrequent words
    for word, count in arabic_counter.items():
        # drop one- and two-letter words; they have a high probability of being a homograph
        if len(word) < 3:
            arabic_vocabulary.remove(word)
        elif count <= 2:
            arabic_vocabulary.remove(word)

    with open("{}pure-persian.txt".format(directory), 'w') as out_file:
        for name in file_names:
            if 'fa.txt' in name:
                with open("{}{}".format(root_dir, name), 'r') as f:
                    #### Error, use Counter instead of set? ####
                    words = set(process(f.read()).split())
                    # a work counts as "pure Persian" when more than 70% of its words are not in the Arabic vocabulary
                    if len(words - arabic_vocabulary) > len(words) * .70:
                        out_file.write(name + "\n")
                        out_file.write("Number of words: {}\n".format(
                            len(words)))
                        out_file.write("Percent Persian words: {}\n".format(
                            len(words - arabic_vocabulary) / len(words)))
                        out_file.write(
                            "Words of possible Arabic origin: {}\n".format(
                                words.intersection(arabic_vocabulary)))
                        out_file.write("------------------------------\n\n")
                        pure_persian_works.append(name)

        out_file.write("Pure Persian Works:\n")
        for i in pure_persian_works:
            out_file.write("\t" + "-" + i + "\n")
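The filter above keeps a work when more than 70% of its distinct words fall outside the Arabic vocabulary. A small self-contained illustration of that set arithmetic; the word sets are invented for the example.

arabic_vocabulary = {"kitab", "nur", "amr"}            # toy Arabic vocabulary
words = {"kitab", "del", "jan", "rah", "yar"}          # distinct words of one candidate work

non_arabic = words - arabic_vocabulary                 # {'del', 'jan', 'rah', 'yar'}
print(len(non_arabic) / len(words))                    # 0.8
print(len(non_arabic) > len(words) * .70)              # True -> treated as pure Persian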
Example No. 8
def chi_squared(relative_corpus, authors=[]):
    by_author = {}
    for key, files in corpora.items():
        by_author[key] = read_files_into_string(files)

    # Transform the authors' corpora into lists of word tokens
    by_author_tokens = {}
    by_author_length_distributions = {}
    for author in by_author:
        tokens = by_author[author].split()

        # Filter out punctuation
        by_author_tokens[author] = ([
            process(token) for token in tokens
            if any(c.isalpha() for c in token)
        ])

    for author in authors:

        # First, build a joint corpus and identify the 500 most frequent words in it
        joint_corpus = (by_author_tokens[author] +
                        by_author_tokens[relative_corpus])
        joint_freq_dist = nltk.FreqDist(joint_corpus)
        most_common = list(joint_freq_dist.most_common(500))

        # What proportion of the joint corpus is made up of the candidate
        # author's tokens?
        author_share = (len(by_author_tokens[author]) / len(joint_corpus))

        # Now, let's look at the 500 most common words in the candidate
        # author's corpus and compare the number of times they can be observed
        # to what would be expected if the author's writings and the relative
        # corpus were both random samples from the same distribution.
        chisquared = 0
        for word, joint_count in most_common:

            # How often do we really see this common word?
            author_count = by_author_tokens[author].count(word)
            relative_count = by_author_tokens[relative_corpus].count(word)

            # How often should we see it?
            expected_author_count = joint_count * author_share
            expected_joint_count = joint_count * (1 - author_share)

            # Add the word's contribution to the chi-squared statistic
            chisquared += ((author_count - expected_author_count) *
                           (author_count - expected_author_count) /
                           expected_author_count)

            chisquared += ((relative_count - expected_joint_count) *
                           (relative_count - expected_joint_count) /
                           expected_joint_count)

        print("The Chi-squared statistic for", author, "compared to",
              relative_corpus, "is", chisquared)
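For each of the 500 most common words, the loop adds (observed - expected)^2 / expected once for the candidate author and once for the relative corpus, with the expected counts obtained by splitting the joint count according to the author's share. A self-contained numeric check of that arithmetic, with invented counts:

joint_count = 100        # occurrences of one word in the joint corpus
author_share = 0.6       # candidate author's share of the joint corpus
author_count = 70        # observed in the author's corpus
relative_count = 30      # observed in the relative corpus

expected_author_count = joint_count * author_share          # 60.0
expected_relative_count = joint_count * (1 - author_share)  # 40.0

contribution = ((author_count - expected_author_count) ** 2 / expected_author_count +
                (relative_count - expected_relative_count) ** 2 / expected_relative_count)
print(contribution)  # 100/60 + 100/40 = 4.1666...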
Example No. 9
def main():
    persian_words = ['خواهد', 'بايد', 'نزد', 'نماييد', 'اين', 'نموده', 'شما', 'اوست', 'بگو', 'شود', 'خود', 'هست', 'گشت', 'شويد', 'راه', 'بآن', 'امروز', 'نمايد', 'چون', 'شوند', 'دوستان', 'شده', 'بوده', 'آنكه', 'بود', 'آفتاب', 'اند', 'داده', 'فردا', 'شايد', 'چه', 'نيست', 'را', 'آنچه', 'شود', 'آنچه', 'مانده', 'بيني', 'جان', 'باز', 'اگر', 'است', 'آمد', 'كنيد', 'سرا', 'نما', 'ميشود', 'نمود', 'دار', 'نبوده', 'شوي', 'ميفرمايد', 'دوست']
    corrupt = []
    pure_persian_works = []
    arabic_counter = Counter()
    arabic_vocabulary = set()
    arabic_works_count = 0
    for name in file_names:
        if 'ar.txt' in name:
            corrupt_word_count = 0
            arabic_works_count += 1
            with open(name, 'r') as f:
                words = set(process(f.read()).split())
                arabic_counter.update(words)
                arabic_vocabulary.update(words)
                for word in words:
                    if word in persian_words:
                        corrupt_word_count += 1
                if corrupt_word_count > 0:
                    corrupt.append(name)
    print(corrupt)
    print("{} of {} are corrupted".format(len(corrupt), arabic_works_count))
Example No. 10
    shared_access_key_value='ak9L18tmI2FssJBIZLz3OCs8U55rcYZaSbwgAR6/B34=')



for email in emails:
	try:
		# Split and Parse emails
		# Get Contents
		# Save to db
		doc = {}
		doc['from'] = fx.getContact(email.sender)
		doc['to'] = fx.getContacts(email.to_recipients)
		if email.cc_recipients:
			doc['cc'] = fx.getContacts(email.cc_recipients) 
		doc['subject'] = email.subject.strip()
		msgs = fx.process(email.text_body)
		msgs = [[doc['subject']]] + msgs
		doc['emails'] = msgs
		doc['intent'] = fx.getIntent(msgs)
		#doc['intentions'] = fx.getIntentPerLine(msgs)
		doc['caseid'] = str(uuid.uuid4())
		doc['state'] = 'new'
		doc['handoff'] = False
		doc['botHasReplied'] = False
		#print docid
		
		#Send Message to Incoming Queue
		event = Message(json.dumps(doc))
		nttBus.send_queue_message('htn.incoming.emails', event)
		docid = sdmails.insert_one(doc).inserted_id
	except Exception,e:
Example No. 11
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 14:59:44 2020

@author: qinzhen
"""
import pickle
from helper import process

# read the words
en, vocab_en = process("corpus.en", True)
de, vocab_de = process("corpus.de")

# t stores the translation probabilities; t1 stores the set of all f that co-occur with each e
t = dict()
t1 = dict()
for e in vocab_en:
    t[e] = dict()
    # store the keys
    t1[e] = set()

# iterate over the sentence pairs
for i, sentence in enumerate(en):
    for e in sentence:
        t1[e] = t1[e].union(set(de[i]))

for e in t:
    ne = len(t1[e])
    for f in t1[e]:
        t[e][f] = 1.0 / ne
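The snippet initializes IBM Model 1 uniformly: every foreign word f that co-occurs with an English word e starts with t(f|e) = 1/n(e), where n(e) is the number of distinct co-occurring foreign words. A self-contained check on an invented two-sentence parallel corpus (helper.process is not needed here):

en = [["the", "house"], ["the", "book"]]
de = [["das", "haus"], ["das", "buch"]]

# collect, for each English word, the foreign words it co-occurs with
t1 = {e: set() for sentence in en for e in sentence}
for i, sentence in enumerate(en):
    for e in sentence:
        t1[e] = t1[e].union(set(de[i]))

# uniform initial probabilities
t = {e: {f: 1.0 / len(t1[e]) for f in t1[e]} for e in t1}
print(t["house"])  # {'das': 0.5, 'haus': 0.5}
print(t["the"])    # 'das', 'haus' and 'buch' each get 1/3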
Example No. 12
def main():
    directory = '/Users/jtim/Dropbox/Academic/research/dissertation/research/output/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    by_author = {}
    for author, files in corpora.items():
        by_author[author] = read_files_into_string(files)

    authors = ("Bahá'u'llah", "Bahá'u'llah Baghdad", "`Abdu'l-Bahá", "the Báb", "al-Shaykh Murtaḍá al-Ánsárí")

    # Transform the authors' corpora into lists of word tokens
    by_author_tokens = {}
    by_author_length_distributions = {}
    for author in by_author:
        tokens = by_author[author].split()

        # Filter out punctuation
        by_author_tokens[author] = ([process(token) for token in tokens
                                                if any(c.isalpha() for c in token)])



        # Get a distribution of token lengths
        token_lengths = [len(token) for token in by_author_tokens[author]]
        plt.ion()
        by_author_length_distributions[author] = nltk.FreqDist(token_lengths)
        by_author_length_distributions[author].plot(15,title=author,color='grey')
        plt.savefig('{}figures/word-length-{}.png'.format(directory, author))
        plt.ioff()
        plt.close("all")

    ### Chi-squared tests ###

    # Test 1: Compare the distance of Bahá'u'lláh's full corpus to His Baghdad writings
    # and the distance of Shaykh Murtaḍá's writings to Bahá'u'lláh's Baghdad writings
    authors_one = ("Bahá'u'llah", "al-Shaykh Murtaḍá al-Ánsárí")

    for author in authors_one:

        # First, build a joint corpus and identify the 500 most frequent words in it
        joint_corpus = (by_author_tokens[author] + by_author_tokens["Bahá'u'llah Baghdad"])
        joint_freq_dist = nltk.FreqDist(joint_corpus)
        most_common = list(joint_freq_dist.most_common(500))

        # What proportion of the joint corpus is made up of the candidate
        # author's (Bahá'u'llah and Shaykh Murtaḍá) tokens?
        author_share = (len(by_author_tokens[author])
                        / len(joint_corpus))

        # Now, let's look at the 500 most common words in the candidate
        # author's (Bahá'u'llah and Shaykh Murtaḍá) corpus and compare
        # the number of times they can be observed to what would be expected
        # if the author's writings and Bahá'u'llah's Baghdad writings were
        # both random samples from the same distribution.
        chisquared = 0
        for word, joint_count in most_common:

            # How often do we really see this common word?
            author_count = by_author_tokens[author].count(word)
            baghdad_count = by_author_tokens["Bahá'u'llah Baghdad"].count(word)

            # How often should we see it?
            expected_author_count = joint_count * author_share
            expected_joint_count = joint_count * (1-author_share)

            # Add the word's contribution to the chi-squared statistic
            chisquared += ((author_count-expected_author_count) *
                           (author_count-expected_author_count) /
                           expected_author_count)

            chisquared += ((baghdad_count-expected_joint_count) *
                           (baghdad_count-expected_joint_count)
                           / expected_joint_count)

        print("The Chi-squared statistic for", author, "compared to Bahá'u'llah's Baghdad writngs is", chisquared)

        # Test 2: Now consider the relative distance between the writings of `Abdu'l-Bahá`
        # and the writings of Bahá'u'lláh compared to the writings of Shaykh Murtaḍá.
        authors_two = ("`Abdu'l-Bahá", "al-Shaykh Murtaḍá al-Ánsárí")

        for author in authors_two:

            # First, build a joint corpus and identify the 500 most frequent words in it
            joint_corpus = (by_author_tokens[author] + by_author_tokens["Bahá'u'llah"])
            joint_freq_dist = nltk.FreqDist(joint_corpus)
            most_common = list(joint_freq_dist.most_common(500))

            # What proportion of the joint corpus is made up of the candidate
            # author's (`Abdu'l-Bahá` and Shaykh Murtaḍá) tokens?
            author_share = (len(by_author_tokens[author])
                            / len(joint_corpus))

            # Now, let's look at the 500 most common words in the candidate
            # author's (`Abdu'l-Bahá` and Shaykh Murtaḍá) corpus and compare
            # the number of times they can be observed to what would be expected
            # if the author's writings and Bahá'u'llah's writings were both
            # random samples from the same distribution.
            chisquared = 0
            for word, joint_count in most_common:

                # How often do we really see this common word?
                author_count = by_author_tokens[author].count(word)
                baghdad_count = by_author_tokens["Bahá'u'llah"].count(word)

                # How often should we see it?
                expected_author_count = joint_count * author_share
                expected_joint_count = joint_count * (1-author_share)

                # Add the word's contribution to the chi-squared statistic
                chisquared += ((author_count-expected_author_count) *
                               (author_count-expected_author_count) /
                               expected_author_count)

                chisquared += ((baghdad_count-expected_joint_count) *
                               (baghdad_count-expected_joint_count)
                               / expected_joint_count)

            print("The Chi-squared statistic for", author, "compared to Bahá'u'llah's writngs is", chisquared)
Example No. 13
def read_files_into_string(filenames):
    strings = []
    for filename in filenames:
        with open(filename) as f:
            strings.append(process(f.read()))
    return '\n'.join(strings)
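A minimal self-contained usage sketch of the same pattern; str.strip stands in for the process helper (which is not shown in these examples), and the temporary files exist only for the demonstration.

import os
import tempfile

def read_files_into_string(filenames, process=str.strip):
    strings = []
    for filename in filenames:
        with open(filename) as f:
            strings.append(process(f.read()))
    return '\n'.join(strings)

paths = []
for text in ("first file \n", " second file"):
    fd, path = tempfile.mkstemp(suffix=".txt")
    with os.fdopen(fd, "w") as f:
        f.write(text)
    paths.append(path)

print(read_files_into_string(paths))  # "first file" and "second file" on separate lines
for path in paths:
    os.remove(path)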
Example No. 14
def main():
    directory = '/Users/jtim/Dropbox/Academic/sources/corpora/bahai-corpus/output/islamicate-texts/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Islamicate variables
    arabic_islamicate_counter = Counter()
    persian_islamicate_counter = Counter()
    arabic_islamicate_vocabulary = set()
    persian_islamicate_vocabulary = set()
    arabic_islamicate_word_count = 0
    persian_islamicate_word_count = 0
    arabic_islamicate_vocabularies = []
    persian_islamicate_vocabularies = []

    # Baha'i variables
    arabic_bahai_counter = Counter()
    persian_bahai_counter = Counter()
    arabic_bahai_vocabulary = set()
    persian_bahai_vocabulary = set()
    arabic_bahai_word_count = 0
    persian_bahai_word_count = 0
    arabic_bahai_vocabularies = []
    persian_bahai_vocabularies = []

    # Process Islamicate files
    for a_file in arabic_islamicate_files:
        with open(a_file, 'r') as af:
            words = Counter(process(af.read()).split())
            arabic_islamicate_word_count += len(words)
            arabic_islamicate_vocabulary.update(words)
            arabic_islamicate_counter.update(words)

    for p_file in persian_islamicate_files:
        with open(p_file, 'r') as pf:
            words = Counter(process(pf.read()).split())
            persian_islamicate_word_count += len(words)
            persian_islamicate_vocabulary.update(words)
            persian_islamicate_counter.update(words)

    # Process Baha'i files
    for a_file in arabic_bahai_files:
        with open(a_file, 'r') as af:
            words = Counter(process(af.read()).split())
            arabic_bahai_word_count += len(words)
            arabic_bahai_vocabulary.update(words)
            arabic_bahai_counter.update(words)

    for p_file in persian_bahai_files:
        with open(p_file, 'r') as pf:
            words = Counter(process(pf.read()).split())
            persian_bahai_word_count += len(words)
            persian_bahai_vocabulary.update(words)
            persian_bahai_counter.update(words)

    # Common variables
    minimum_threshold = 5
    for i in range(1, minimum_threshold):
        # remove minimum_threshold number of words
        for item in arabic_islamicate_counter.items():
            if item[1] <= i:
                if item[0] in arabic_islamicate_vocabulary:
                    arabic_islamicate_vocabulary.remove(item[0])

        for item in persian_islamicate_counter.items():
            if item[1] <= i:
                if item[0] in persian_islamicate_vocabulary:
                    persian_islamicate_vocabulary.remove(item[0])

        for item in arabic_bahai_counter.items():
            if item[1] <= i:
                if item[0] in arabic_bahai_vocabulary:
                    arabic_bahai_vocabulary.remove(item[0])

        for item in persian_bahai_counter.items():
            if item[1] <= i:
                if item[0] in persian_bahai_vocabulary:
                    persian_bahai_vocabulary.remove(item[0])

    combined_bahai_counter = arabic_bahai_counter + persian_bahai_counter
    bahai_intersection = persian_bahai_vocabulary.intersection(
        arabic_bahai_vocabulary)
    islamicate_intersection = persian_islamicate_vocabulary.intersection(
        arabic_islamicate_vocabulary)

    # Print Islamicate stats
    print("Islamicate language statistics:")
    print("Arabic word count: {}".format(arabic_islamicate_word_count))
    print("Persian word count: {}".format(persian_islamicate_word_count))
    print("Arabic vocabulary: {}".format(len(arabic_islamicate_vocabulary)))
    print("Persian vocabulary: {}".format(len(persian_islamicate_vocabulary)))
    print("intersection: {} ".format(len(islamicate_intersection)))
    print('\n')
    with open('{}islamicate_intersection.txt'.format(directory),
              'w') as out_file:
        for w in islamicate_intersection:
            out_file.write(w + '\n')

    with open('{}islamicate_intersection_sample.txt'.format(directory),
              'w') as out_file:
        sample = random.sample(list(islamicate_intersection),
                               round((len(islamicate_intersection) / 100)) * 2)
        for w in sample:
            out_file.write(w + '\n')

    # Print Baha'i stats
    print("Bahá'í language statistics:")
    print("Arabic word count: {}".format(arabic_bahai_word_count))
    print("Persian word count: {}".format(persian_bahai_word_count))
    print("Arabic vocabulary: {}".format(len(arabic_bahai_vocabulary)))
    print("Persian vocabulary: {}".format(len(persian_bahai_vocabulary)))
    print("intersection: {} ".format(len(bahai_intersection)))
    print('\n')

    with open('{}bahai_intersection.txt'.format(directory), 'w') as out_file:
        for w in bahai_intersection:
            out_file.write(w + '\n')

    with open('{}bahai_intersection_sample.txt'.format(directory),
              'w') as out_file:
        sample = random.sample(list(bahai_intersection),
                               round((len(bahai_intersection) / 100)) * 2)
        for w in sample:
            out_file.write(w + '\n')

    # Plot table
    data = [[4242, 15, 1927, 2], [30000, 2000, 20000, 22000]]

    columns = ("`Abdu'l-Bahá", "Báb", "Bahá'u'lláh", "Shoghi Effendi")
    rows = ["Sample", "Known works"]

    # values = np.arange(0, 2500, 500)
    # value_increment = 1000

    colors = plt.cm.BuPu(np.linspace(0, 0.5, len(rows)))
    n_rows = len(data)

    index = np.arange(len(columns)) + 0.3
    bar_width = 0.4

    # running vertical offset so each row of data stacks on top of the previous one
    y_offset = np.zeros(len(columns))
    for row in range(n_rows):
        plt.bar(index,
                data[row],
                bar_width,
                bottom=y_offset,
                color=colors[row])
        y_offset = y_offset + data[row]
    # draw a table of the raw counts below the stacked bars; plt.table requires cellText
    table = plt.table(cellText=[[str(v) for v in row] for row in data],
                      cellLoc='right',
                      rowLabels=rows,
                      rowLoc='left',
                      colLabels=columns,
                      colLoc='center',
                      loc='bottom',
                      edges='closed')
    plt.show()
Example No. 15
arabic_counter = Counter()

for name in corpus:
    if 'mmha1' in name:
        mmha1.append(name)

for file in mmha1:
    # open each file by its own name; append so earlier entries are kept
    with open("{}{}".format(dir, file),
              'r') as f, open('/Users/jtim/Desktop/out.txt', 'a') as out:
        out.write(file)
        out.write('\n')
        out.write("--------------------------------------")
        out.write('\n')
        out.write(f.read())

for file in mmha1:
    with open("{}{}".format(dir, file),
              'r') as f, open('/Users/jtim/Desktop/out.txt', 'a') as out_c:
        # read the file once; a second f.read() would return an empty string
        text = f.read()
        words = Counter(process(text).split())
        arabic_counter.update(words)
        out_c.write(text)
        out_c.write('\n')
        out_c.write("--------------------------------------")
        out_c.write('\n')
        out_c.write(str(words))

# print(len(mmha1))
# print(len(arabic_counter))
# print(arabic_counter.most_common(10000))
Example No. 16
            L = loads(L)[data][list]
        else:
            # f is a function used to generate the data; content and desc name
            # the fields used to extract the news payload and its description
            L = loads(L)[data]
        self.list = map(f, L)  # use map to enumerate
        self.content = content  # save state
        self.desc = desc  # save state

    def process(self):
        # process has been changed to fetch the news: the process function in
        # helper.py is applied to each item, turning it into a format that
        # bikeso can accept
        for url in self.list:
            resultPage = fetch(url)
            result = loads(resultPage)[self.content]
            yield (result['title'], helper.process(result[self.desc]))


#for test only
if __name__ == '__main__':
    url = 'http://www.imxingzhe.com/api/v4/get_competitions?page=0&limit=500'

    def f(id):
        return 'http://www.imxingzhe.com/api/v4/competition_detail?competition_id=' + str(
            id['id'])

    tmp = AdvanceWeb(url, 'data', f, 'data', 'description')
    result = tmp.process()
    for i in result:
        print(process(BeautifulSoup(i, 'html.parser')))
Example No. 17
    data = json.loads(msg,strict=False)
    print (data)
    print('Incoming Email Loaded')
except Exception as e:
    print ('Error while loading state....Exit-1 : ' + str(e))
    sys.exit(1)



#Fix String data Type
text_body = data['text_body'].encode('ascii',errors = 'ignore').decode()
subject = data['subject'].encode('ascii',errors = 'ignore').decode()
frm = data['from']['name']

#Get Email Data structure and intent
emails = fx.process(text_body,frm,subject)
intent = fx.getIntent(emails)

#Prepare the doc
doc = {}
doc['emails'] = emails
doc['intent'] = intent
if (intent.get('intent')):
    doc['request'] = req[intent['intent']]


#update the collection
result = collection.update_one({'caseid':data['caseid']}, {"$set":doc}, upsert=False)

#send Message to reply queue
url = "https://sd-ui.azurewebsites.net/task/"+data['caseid']
Example No. 18
import helper as app

if __name__ == '__main__':
    app.init()

    app.process()
    
    app.end()
Example No. 19
def file_to_string(filenames):
    strings = []
    for filename in filenames:
        with open(filename) as f:
            strings.append(process(process_kitab(f.read())))
    return '\n'.join(strings)
Example No. 20
    parser.add_argument('--test_path', default='data/eval.iob')
    parser.add_argument('--patience', type=int, default=10)
    parser.add_argument('--number_normalized', type=bool, default=True)
    parser.add_argument('--use_crf', type=bool, default=True)

    args = parser.parse_args()
    use_gpu = torch.cuda.is_available()
    print('use_crf:', args.use_crf)

    if not os.path.exists(args.savedir):
        os.makedirs(args.savedir)

    test_path = '/content/test.iob'
    text = input("Enter the sentence :")
    print(text)
    process(text, test_path)

    eval_path = "evaluation"
    eval_temp = os.path.join(eval_path, "temp")
    if not os.path.exists(eval_temp):
        os.makedirs(eval_temp)
    test_pred_file = eval_temp + '/test_pred.txt'

    # Loading the vocabulary
    model_name = args.savedir + '/' + args.feature_extractor + str(
        args.use_char) + str(args.use_crf)
    word_vocab = WordVocabulary(args.train_path, args.dev_path, args.test_path,
                                args.number_normalized)
    label_vocab = LabelVocabulary(args.train_path)
    alphabet = Alphabet(args.train_path, args.dev_path, args.test_path)