def nlp_term_extraction2(input_dict):
    '''
    Term extraction from totrtale annotations.
    '''
    ws_url = webservice_def_ex_url + "/call"
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    stop_list_checkbox = input_dict["stop_list"] == "true"
    user_stop_words = []

    if input_dict['stop_words_file'] != "":
        user_stop_words = safeOpen(input_dict['stop_words_file']).read()
        try:
            user_stop_words.decode("utf-8")
        except Exception:
            raise Exception("Please make sure that your stop words list is encoded in UTF-8.")
        user_stop_words = [word.strip() for word in user_stop_words.split("\n")]

    # ToTrTaLe may hand over TEI; the term extraction service expects the tab format.
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    if lang == "sl":
        reference_corpus = input_dict["slovene_reference_corpus"]
    elif lang == "en":
        reference_corpus = input_dict["english_reference_corpus"]
    else:
        # previously fell through here with reference_corpus unbound (NameError)
        raise Exception("Unsupported language: " + lang)

    params = {"corpus": annotations,
              "lang": lang,
              "reference_corpus": reference_corpus}
    response = post(ws_url, data=params)
    resp = json.loads(response.content)[u'callResponse'][u'callResult']

    stop_list = []
    if stop_list_checkbox:
        stop_list = get_default_stop_word_list(lang)
    stop_list = set(stop_list + user_stop_words)

    # Drop every candidate term that contains a stop word.
    if len(stop_list) > 0:
        resp = resp.split("\n")
        i = 0
        while i < len(resp):
            increase = True
            line = resp[i]
            if len(line) > 0:
                term = line.split("\t")[1][1:-1]
                for word in term.split(" "):
                    if word.lower() in stop_list:
                        increase = False
                        resp.pop(i)
                        break
            if increase:
                i += 1
        resp = "\n".join(resp)
    return {'candidates': resp}
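# A minimal usage sketch for nlp_term_extraction2 (illustrative only: the
# reference corpus name below is hypothetical, and the call succeeds only if
# the definition-extraction service behind webservice_def_ex_url is reachable).
def _example_term_extraction(annotations):
    input_dict = {
        "annotations": annotations,           # ToTrTaLe output, TEI or tab format
        "lang": "en",                         # "sl" or "en"
        "stop_list": "true",                  # filter with the default stop word list
        "stop_words_file": "",                # optional path to a UTF-8 stop word file
        "english_reference_corpus": "bnc",    # hypothetical corpus identifier
    }
    return nlp_term_extraction2(input_dict)["candidates"]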
def load_corpus2(input_dict):
    '''
    Parses an input file and encodes it in base 64.
    '''
    if input_dict[u"text"] == "":
        f = safeOpen(input_dict['file'])
        fname = os.path.basename(input_dict['file'])
        data = base64.b64encode(f.read())
    else:
        fname = "input_string.txt"
        data = base64.b64encode(input_dict[u"text"].strip())

    # define web service
    webservice_url = webservices_url + "/parseFile"
    params = {"filename": fname, "text": data}  # set params

    # call web service
    resp = requests.post(webservice_url, params=params)
    content = json.loads(resp.content)[u'parseFileResponse'][u'parseFileResult']
    if content[u"error"] != "":
        raise Exception(content[u"error"])
    else:
        return {'corpus': content[u"resp"]}
def load_corpus2(input_dict):
    """
    Parses an input file and encodes it in base 64.
    """
    use_text = input_dict["use_text"] == "true"
    if use_text:  # checkbox is checked
        fname = "input_string.txt"
        text = input_dict[u"text"].strip()
        if len(text) == 0:
            raise Exception("Please input text or uncheck the Use text checkbox.")
        data = base64.b64encode(text)
    else:  # checkbox is not checked
        f = safeOpen(input_dict["file"])
        fname = os.path.basename(input_dict["file"])
        data = base64.b64encode(f.read())

    # define web service
    webservice_url = webservices_totrtale_url + "/parseFile"
    params = {"filename": fname, "text": data}  # set params

    # call web service
    resp = post(webservice_url, data=params)
    content = json.loads(resp.content)[u"parseFileResponse"][u"parseFileResult"]
    # Error handling is currently disabled:
    # if content[u"error"] != "":
    #     raise Exception(content[u"error"])
    return {"corpus": content[u"resp"]}
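# A usage sketch for the checkbox variant of load_corpus2 above (illustrative
# only): with "use_text" set the widget encodes the pasted text, otherwise it
# reads and encodes the uploaded file; either way the totrtale service does
# the actual parsing.
def _example_load_corpus2():
    input_dict = {
        "use_text": "true",
        "text": u"This is a small test corpus.",
        "file": "",  # ignored while use_text is "true"
    }
    return load_corpus2(input_dict)["corpus"]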
def load_tagged_corpus(input_dict):
    """
    Loads a file in TEI or XML format.
    """
    data = ""
    if input_dict["input_format"] == "tab_format":
        try:
            # widget indices are 1-based; shift them to 0-based
            word_index = int(input_dict["word_index"]) - 1
            lemma_index = int(input_dict["lemma_index"]) - 1
            token_index = int(input_dict["token_index"]) - 1
            pos_index = int(input_dict["pos_index"]) - 1
        except ValueError:
            raise Exception("Please specify a number in index fields.")

        start_tag = input_dict["start_tag"]
        end_tag = input_dict["end_tag"]
        separator = input_dict["separator"]

        if len(start_tag) < 1 or len(end_tag) < 1 or len(separator) < 1:
            raise Exception("Please review start, end tag and separator parameters.")

        if (word_index + 1 == 1 and token_index + 1 == 2 and lemma_index + 1 == 3
                and pos_index + 1 == 4 and start_tag == u"<S>" and end_tag == "</S>"):
            # the file already matches the expected layout; read it verbatim
            f = safeOpen(input_dict["file"])
            data = f.read()
        else:
            if len(set([word_index, lemma_index, token_index, pos_index])) != 4:
                raise Exception("Field indices should be distinct.")
            data = parse_tab_separated(input_dict["file"],
                                       word_index=word_index,
                                       token_index=token_index,
                                       lemma_index=lemma_index,
                                       pos_index=pos_index,
                                       start_tag=start_tag,
                                       end_tag=end_tag,
                                       separator=separator)
    else:
        lemma_name = input_dict["lemma_name"]
        pos_name = input_dict["pos_name"]
        sentence_tag = input_dict["sentence_tag"]
        word_tag = input_dict["word_tag"]

        if len(lemma_name) < 1 or len(pos_name) < 1 or len(sentence_tag) < 1 or len(word_tag) < 1:
            raise Exception("Please review parameters for TEI format.")

        data = parse_tei(input_dict["file"],
                         lemma_name=lemma_name,
                         pos_name=pos_name,
                         word_tag=word_tag,
                         sentence_tag=sentence_tag)
    return {"annotations": data}
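# A usage sketch for the tab-separated branch above (hypothetical path). With
# exactly these defaults the function skips parse_tab_separated and returns
# the file contents verbatim:
#
#   load_tagged_corpus({
#       "input_format": "tab_format",
#       "file": "corpus.txt",
#       "word_index": "1", "token_index": "2",
#       "lemma_index": "3", "pos_index": "4",
#       "start_tag": u"<S>", "end_tag": "</S>",
#       "separator": "\t",
#   })["annotations"]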
def load_tagged_corpus(input_dict):
    """
    Loads a TEI file, which is the output of totrtale.
    """
    f = safeOpen(input_dict['file'])
    # fname = os.path.basename(input_dict['file'])
    # subprocess.call(["java -jar jing.jar tei_imp.rng " + fname + " >" + "out.txt"], shell=True)
    data = f.read()
    return {'annotations': data}
def file_to_string(input_dict):
    """
    Reads a file and outputs its textual contents
    """
    from workflows.security import safeOpen
    f = safeOpen(input_dict['file'])
    output_dict = {}
    output_dict['string'] = f.read()
    return output_dict
def load_to_string(input_dict):
    '''
    Opens the file and reads its contents into a string.
    '''
    from workflows.security import safeOpen
    f = safeOpen(input_dict['file'])
    output_dict = {}
    output_dict['string'] = f.read()
    return output_dict
def load_corpus(input_dict):
    '''
    Parses an input file and encodes it in base 64.
    '''
    f = safeOpen(input_dict['file'])
    fname = os.path.basename(input_dict['file'])
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')
    data = base64.b64encode(f.read())
    ws = WebService(wsdl, 60000)
    response = ws.client.parseFile(fileName=fname, inFile=data)
    return {'corpus': response['parsedFile']}
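# A usage sketch for load_corpus (illustrative only; the totale SOAP service at
# the default wsdl address must be reachable, and WebService comes from the
# surrounding module):
#
#   corpus = load_corpus({'file': '/path/to/input.txt'})['corpus']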
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index,
                        start_tag, end_tag, separator):
    """
    Helper function for load_tagged_corpus; parses the tab-separated format.
    """
    fname = os.path.basename(path)
    f = safeOpen(path)

    data = []
    head = "<TEXT title=" + fname + ">\t\n"
    foot = "</TEXT>\t\n"
    data.append(head)

    sentence_counter = 0
    for line in f:
        splitted_line = re.split(separator, line.strip())
        if len(splitted_line) >= 4:
            # token line: emit word, token, lemma and POS in a fixed order
            new_line = (splitted_line[word_index] + "\t" +
                        splitted_line[token_index] + "\t" +
                        splitted_line[lemma_index] + "\t" +
                        splitted_line[pos_index] + "\t\n")
            data.append(new_line)
        else:
            # shorter lines may carry the sentence boundary tags
            added = False
            for el in splitted_line:
                if re.match(start_tag, el.strip()):
                    data.append('\t<S id="0_' + str(sentence_counter) + '">\t\n')
                    added = True
                    break
                elif re.match(end_tag, el.strip()):
                    data.append("\t</S>\t\n")
                    sentence_counter += 1
                    added = True
                    break
            if not added:
                data.append("\t".join(splitted_line + ["\t\n"]))
    data.append(foot)
    return "".join(data)
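# A worked example of the format parse_tab_separated consumes and emits
# (hypothetical data; <TAB> stands for a tab character). An input file
# "example.txt" containing
#
#   <S>
#   Hello<TAB>Hello<TAB>hello<TAB>NN
#   world<TAB>world<TAB>world<TAB>NN
#   </S>
#
# parsed with word_index=0, token_index=1, lemma_index=2, pos_index=3,
# start_tag=u"<S>", end_tag="</S>", separator="\t" produces
#
#   <TEXT title=example.txt>
#       <S id="0_0">
#   Hello<TAB>Hello<TAB>hello<TAB>NN
#   world<TAB>world<TAB>world<TAB>NN
#       </S>
#   </TEXT>
#
# (each emitted line additionally ends with a trailing tab).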
def streaming_simulate_stream_from_text_file(input_dict, widget, stream=None):
    import datetime
    csvfile = safeOpen(input_dict['file'])
    tweet_data = csvfile.read().strip()
    tweets = tweet_data.split("\n")
    ltw = []
    i = 1
    for tw in tweets:
        tweet = {}
        tweet['id'] = i
        tweet['created_at'] = datetime.datetime.now()
        tweet['text'] = tw
        tweet['user'] = "******"
        tweet['lang'] = "bg"
        i = i + 1
        ltw.append(tweet)
    output_dict = {}
    output_dict['ltw'] = ltw
    return output_dict
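# Usage sketch (hypothetical file name): every line of the input file becomes
# one simulated tweet dict with the keys id / created_at / text / user / lang,
# so downstream stream widgets can be tested without a live Twitter connection:
#
#   ltw = streaming_simulate_stream_from_text_file({'file': 'tweets.txt'},
#                                                  widget=None)['ltw']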
def streaming_simulate_stream_from_csv(input_dict, widget, stream=None):
    from streams.models import StreamWidgetData
    import datetime
    import csv
    csvfile = safeOpen(input_dict['csv'])
    csvreader = csv.reader(csvfile, delimiter=";", quotechar='"')
    rows = []
    ltw = []
    i = 0
    counter = 0
    started = False
    last_id = "not-started-yet"
    if stream is not None:
        # resume from the last tweet id recorded for this stream widget
        try:
            swd = StreamWidgetData.objects.get(stream=stream, widget=widget)
            last_id = swd.value
        except:
            started = True
    else:
        started = True
    for row in csvreader:
        rows.append(row)
        if i != 0:  # skip the header row
            rows[i][1] = datetime.datetime.strptime(rows[i][1], "%m/%d/%Y %I:%M:%S %p")
            tweet = {}
            tweet['id'] = rows[i][0]
            tweet['created_at'] = rows[i][1]
            tweet['text'] = rows[i][3].encode('utf-8')
            tweet['user'] = rows[i][5].encode('utf-8')
            tweet['lang'] = rows[i][11]
            if started:
                counter = counter + 1
                ltw.append(tweet)
            if counter == 50 and started:
                # a full batch of 50 was emitted; remember where we stopped
                started = False
                if stream is not None:
                    try:
                        swd = StreamWidgetData.objects.get(stream=stream, widget=widget)
                        swd.value = tweet['id']
                        swd.save()
                    except:
                        swd = StreamWidgetData()
                        swd.stream = stream
                        swd.widget = widget
                        data = tweet['id']
                        swd.value = data
                        swd.save()
            if tweet['id'] == last_id:
                started = True
        i = i + 1
    if counter < 51 and stream is not None and started:
        # fewer than a full batch remained: mark the stream as finished
        try:
            swd = StreamWidgetData.objects.get(stream=stream, widget=widget)
            swd.value = "done"
            swd.save()
        except:
            swd = StreamWidgetData()
            swd.stream = stream
            swd.widget = widget
            data = "done"
            swd.value = data
            swd.save()
    output_dict = {}
    output_dict['ltw'] = ltw
    return output_dict
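# Column layout the CSV simulator above assumes (reconstructed from the
# indices it reads; the first row is treated as a header and skipped):
#
#   column 0  -> tweet id
#   column 1  -> creation time in "%m/%d/%Y %I:%M:%S %p" format
#   column 3  -> tweet text
#   column 5  -> user name
#   column 11 -> language code
#
# Rows are ";"-delimited and '"'-quoted; tweets are released in batches of 50,
# with the last emitted id persisted in StreamWidgetData so the next
# invocation resumes where the previous one stopped.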