Example #1
def nlp_term_extraction2(input_dict):
    '''
    Term extraction from totrtale annotations.
    '''
    ws_url = webservice_def_ex_url + "/call"
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    stop_list_checkbox = input_dict["stop_list"] == "true"
    user_stop_words = []

    if input_dict['stop_words_file'] != "":
        user_stop_words = safeOpen(input_dict['stop_words_file']).read()
        try:
            user_stop_words.decode("utf-8")
        except Exception:
            raise Exception(
                "Please make sure that your stop words list is encoded in UTF-8."
            )
        user_stop_words = [
            word.strip() for word in user_stop_words.split("\n")
        ]

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    if lang == "sl":
        reference_corpus = input_dict["slovene_reference_corpus"]
    elif lang == "en":
        reference_corpus = input_dict["english_reference_corpus"]
    else:
        raise Exception("Unsupported language: " + lang)

    params = {
        "corpus": annotations,
        "lang": lang,
        "reference_corpus": reference_corpus
    }
    response = post(ws_url, data=params)
    resp = json.loads(response.content)[u'callResponse'][u'callResult']

    stop_list = []
    if stop_list_checkbox:
        stop_list = get_default_stop_word_list(lang)
    stop_list = set(stop_list + user_stop_words)

    if len(stop_list) > 0:
        resp = resp.split("\n")
        i = 0
        while i < len(resp):
            increase = True
            line = resp[i]
            if len(line) > 0:
                term = line.split("\t")[1][1:-1]
                for word in term.split(" "):
                    if word.lower() in stop_list:
                        increase = False
                        resp.pop(i)
                        break
            if increase:
                i += 1
        resp = "\n".join(resp)
    return {'candidates': resp}
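
A minimal usage sketch for nlp_term_extraction2, assuming the module-level helpers it relies on (webservice_def_ex_url, post, json, safeOpen, TEItoTab, get_default_stop_word_list) are in scope; the annotation path and the reference-corpus identifier below are hypothetical placeholders:

# Hedged usage sketch; 'annotations.tab' and 'bnc' are hypothetical.
input_dict = {
    'annotations': safeOpen('annotations.tab').read(),
    'lang': 'en',
    'stop_list': 'true',        # also apply the default stop word list
    'stop_words_file': '',      # no user-supplied stop words
    'english_reference_corpus': 'bnc',
}
candidates = nlp_term_extraction2(input_dict)['candidates']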
Example #2
def load_corpus2(input_dict):
    '''
    Encodes an input file (or input text) in base64 and sends it to the
    parseFile web service.
    '''

    if input_dict[u"text"] == "":
        f = safeOpen(input_dict['file'])
        fname = os.path.basename(input_dict['file'])
        data = base64.b64encode(f.read())
    else:
        fname = "input_string.txt"
        data = base64.b64encode(input_dict[u"text"].strip())
    
    #define web service
    webservice_url = webservices_url + "/parseFile"
    params = {"filename": fname, "text": data} #set params
    
    #call web service
    resp = requests.post(webservice_url, data=params)  # send in the request body, not the query string
    content = json.loads(resp.content)[u'parseFileResponse'][u'parseFileResult']
    
    if content[u"error"] != "":
        raise Exception(content[u"error"])
    else:
        return {'corpus': content[u"resp"]}
Example #3
def load_corpus2(input_dict):
    """
    Parses an input file and encodes it in base 64.
    """
    use_text = input_dict["use_text"] == "true"

    if use_text:  # checkbox is checked
        fname = "input_string.txt"
        text = input_dict[u"text"].strip()
        if len(text) == 0:
            raise Exception("Please input text or uncheck the Use text checkbox.")
        data = base64.b64encode(text)
    else:  # checkbox is not checked
        f = safeOpen(input_dict["file"])
        fname = os.path.basename(input_dict["file"])
        data = base64.b64encode(f.read())

    # define web service
    webservice_url = webservices_totrtale_url + "/parseFile"
    params = {"filename": fname, "text": data}  # set params

    # call web service
    # print webservice_url
    resp = post(webservice_url, data=params)
    # print resp.content
    content = json.loads(resp.content)[u"parseFileResponse"][u"parseFileResult"]
    """
    if content[u"error"] != "":
        raise Exception(content[u"error"])
    else:
    """
    return {"corpus": content[u"resp"]}
Example #4
def load_tagged_corpus(input_dict):
    """
    Loads a file in TEI or XML format.
    """
    data = ""

    if input_dict["input_format"] == "tab_format":
        try:
            word_index = int(input_dict["word_index"]) - 1
            lemma_index = int(input_dict["lemma_index"]) - 1
            token_index = int(input_dict["token_index"]) - 1
            pos_index = int(input_dict["pos_index"]) - 1
        except ValueError:
            raise Exception("Please specify a number in index fields.")

        start_tag = input_dict["start_tag"]
        end_tag = input_dict["end_tag"]
        separator = input_dict["separator"]

        if len(start_tag) < 1 or len(end_tag) < 1 or len(separator) < 1:
            raise Exception("Please review start, end tag and separator parameters.")

        if (
            word_index + 1 == 1
            and token_index + 1 == 2
            and lemma_index + 1 == 3
            and pos_index + 1 == 4
            and start_tag == u"<S>"
            and end_tag == "</S>"
        ):
            f = safeOpen(input_dict["file"])
            data = f.read()
        else:
            if len(set([word_index, lemma_index, token_index, pos_index])) != 4:
                raise Exception("Field indices should be distinct.")
            data = parse_tab_separated(
                input_dict["file"],
                word_index=word_index,
                token_index=token_index,
                lemma_index=lemma_index,
                pos_index=pos_index,
                start_tag=start_tag,
                end_tag=end_tag,
                separator=separator,
            )

    else:
        lemma_name = input_dict["lemma_name"]
        pos_name = input_dict["pos_name"]
        sentence_tag = input_dict["sentence_tag"]
        word_tag = input_dict["word_tag"]

        if len(lemma_name) < 1 or len(pos_name) < 1 or len(sentence_tag) < 1 or len(word_tag) < 1:
            raise Exception("Please review parameters for TEI format.")

        data = parse_tei(
            input_dict["file"], lemma_name=lemma_name, pos_name=pos_name, word_tag=word_tag, sentence_tag=sentence_tag
        )

    return {"annotations": data}
Example #5
def load_corpus2(input_dict):
    '''
    Encodes an input file (or input text) in base64 and sends it to the
    parseFile web service.
    '''
    use_text = input_dict["use_text"] == "true"

    if use_text:  #checkbox is checked
        fname = "input_string.txt"
        text = input_dict[u"text"].strip()
        if len(text) == 0:
            raise Exception(
                "Please input text or uncheck the Use text checkbox.")
        data = base64.b64encode(text)
    else:  #checkbox is not checked
        f = safeOpen(input_dict['file'])
        fname = os.path.basename(input_dict['file'])
        data = base64.b64encode(f.read())

    #define web service
    webservice_url = webservices_totrtale_url + "/parseFile"
    params = {"filename": fname, "text": data}  #set params

    #call web service
    #print webservice_url
    resp = post(webservice_url, data=params)
    #print resp.content
    content = json.loads(
        resp.content)[u'parseFileResponse'][u'parseFileResult']
    """
    if content[u"error"] != "":
        raise Exception(content[u"error"])
    else:
    """
    return {'corpus': content[u"resp"]}
Example #6
def load_to_string(input_dict):
    '''
    Opens the file and reads its contents into a string.
    '''
    f = safeOpen(input_dict['file'])
    output_dict = {}
    output_dict['string'] = f.read()
    return output_dict
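
Usage is a one-liner, assuming safeOpen is in scope; the path is hypothetical:

text = load_to_string({'file': '/path/to/file.txt'})['string']  # hypothetical path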
Example #7
def load_to_string(input_dict):
    '''
    Opens the file and reads its contents into a string.
    '''
    f = safeOpen(input_dict['file'])
    output_dict = {}
    output_dict['string'] = f.read()
    return output_dict
Example #8
def load_tagged_corpus(input_dict):
    """
    Loads TEI file, which is output of totrtale
    """
    f = safeOpen(input_dict['file'])
    #fname = os.path.basename(input_dict['file'])
    #subprocess.call(["java -jar jing.jar tei_imp.rng " + fname + " >" + "out.txt"],shell=True)
    data = f.read()
    return {'annotations': data}
Example #9
def load_tagged_corpus(input_dict):
    """
    Loads a file in TEI or XML format.
    """
    data = ""

    if input_dict["input_format"] == "tab_format":
        try:
            word_index = int(input_dict["word_index"]) - 1
            lemma_index = int(input_dict["lemma_index"]) - 1
            token_index = int(input_dict["token_index"]) - 1
            pos_index = int(input_dict["pos_index"]) - 1
        except ValueError:
            raise Exception("Please specify a number in index fields.")

        start_tag = input_dict["start_tag"]
        end_tag = input_dict["end_tag"]
        separator = input_dict["separator"]

        if len(start_tag) < 1 or len(end_tag) < 1 or len(separator) < 1:
            raise Exception(
                "Please review the start tag, end tag and separator parameters.")

        if (word_index + 1 == 1 and token_index + 1 == 2
                and lemma_index + 1 == 3 and pos_index + 1 == 4
                and start_tag == u'<S>' and end_tag == '</S>'):
            f = safeOpen(input_dict['file'])
            data = f.read()
        else:
            if len(set([word_index, lemma_index, token_index, pos_index])) != 4:
                raise Exception("Field indices should be distinct.")
            data = parse_tab_separated(input_dict['file'],
                                       word_index=word_index,
                                       token_index=token_index,
                                       lemma_index=lemma_index,
                                       pos_index=pos_index,
                                       start_tag=start_tag,
                                       end_tag=end_tag,
                                       separator=separator)

    else:
        lemma_name = input_dict["lemma_name"]
        pos_name = input_dict["pos_name"]
        sentence_tag = input_dict["sentence_tag"]
        word_tag = input_dict["word_tag"]

        if len(lemma_name) < 1 or len(pos_name) < 1 or len(
                sentence_tag) < 1 or len(word_tag) < 1:
            raise Exception("Please review parameters for TEI format.")

        data = parse_tei(input_dict['file'],
                         lemma_name=lemma_name,
                         pos_name=pos_name,
                         word_tag=word_tag,
                         sentence_tag=sentence_tag)

    return {'annotations': data}
Example #10
def file_to_string(input_dict):
    """ 
    Reads a file and outputs its textual contents
    """
    from workflows.security import safeOpen

    f = safeOpen(input_dict['file'])
    output_dict = {}
    output_dict['string'] = f.read()
    return output_dict
Example #11
def load_to_string(input_dict):
    '''
    Opens the file and reads its contents into a string.
    '''
    from workflows.security import safeOpen

    f = safeOpen(input_dict['file'])
    output_dict = {}
    output_dict['string'] = f.read()
    return output_dict
Example #12
def load_corpus(input_dict):
    '''
    Encodes an input file in base64 and parses it with the totale web service.
    '''
    f = safeOpen(input_dict['file'])
    fname = os.path.basename(input_dict['file'])
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')
    data = base64.b64encode(f.read())
    ws = WebService(wsdl, 60000)
    response = ws.client.parseFile(fileName=fname, inFile=data)
    return {'corpus': response['parsedFile']}
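
A usage sketch, assuming WebService, safeOpen, base64 and os are available in the module; the input path is hypothetical, and 'wsdl' falls back to the totale endpoint when omitted:

corpus = load_corpus({'file': 'document.txt'})['corpus']  # hypothetical path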
Example #13
def load_corpus(input_dict):
    """
    Parses an input file and encodes it in base 64.
    """
    f = safeOpen(input_dict["file"])
    fname = os.path.basename(input_dict["file"])
    wsdl = input_dict.get("wsdl", "http://vihar.ijs.si:8095/totale?wsdl")
    data = base64.b64encode(f.read())
    ws = WebService(wsdl, 60000)
    response = ws.client.parseFile(fileName=fname, inFile=data)
    return {"corpus": response["parsedFile"]}
Example #14
def nlp_term_extraction2(input_dict):
    """
    Term extraction from totrtale annotations.
    """
    ws_url = webservice_def_ex_url + "/call"
    annotations = input_dict["annotations"]
    lang = input_dict["lang"]
    stop_list_checkbox = input_dict["stop_list"] == "true"
    user_stop_words = []

    if input_dict["stop_words_file"] != "":
        user_stop_words = safeOpen(input_dict["stop_words_file"]).read()
        try:
            user_stop_words.decode("utf-8")
        except Exception:
            raise Exception("Please make sure that your stop words list is encoded in UTF-8.")
        user_stop_words = [word.strip() for word in user_stop_words.split("\n")]

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    if lang == "sl":
        reference_corpus = input_dict["slovene_reference_corpus"]
    elif lang == "en":
        reference_corpus = input_dict["english_reference_corpus"]

    params = {"corpus": annotations, "lang": lang, "reference_corpus": reference_corpus}
    response = post(ws_url, data=params)
    resp = json.loads(response.content)[u"callResponse"][u"callResult"]

    stop_list = []
    if stop_list_checkbox:
        stop_list = get_default_stop_word_list(lang)
    stop_list = set(stop_list + user_stop_words)

    if len(stop_list) > 0:
        resp = resp.split("\n")
        i = 0
        while i < len(resp):
            increase = True
            line = resp[i]
            if len(line) > 0:
                term = line.split("\t")[1][1:-1]
                for word in term.split(" "):
                    if word.lower() in stop_list:
                        increase = False
                        resp.pop(i)
                        break
            if increase:
                i += 1
        resp = "\n".join(resp)
    return {"candidates": resp}
Example #15
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, start_tag, end_tag, separator):
    """
    Helper function for load tagged corpus. Function parses tab separated format.
    """

    fname = os.path.basename(path)
    f = safeOpen(path)

    data = []
    head = "<TEXT title=" + fname + ">\t\n"
    foot = "</TEXT>\t\n"
    data.append(head)

    sentence_counter = 0
    for line in f:
        splitted_line = re.split(separator, line.strip())
        if len(splitted_line) >= 4:
            new_line = (
                splitted_line[word_index]
                + "\t"
                + splitted_line[token_index]
                + "\t"
                + splitted_line[lemma_index]
                + "\t"
                + splitted_line[pos_index]
                + "\t\n"
            )
            data.append(new_line)
        else:
            added = False
            for el in splitted_line:
                if re.match(start_tag, el.strip()):
                    data.append('\t<S id="0_' + str(sentence_counter) + '">\t\n')
                    added = True
                    break
                elif re.match(end_tag, el.strip()):
                    data.append("\t</S>\t\n")
                    sentence_counter += 1
                    added = True
                    break
            if not added:
                data.append("\t".join(splitted_line + ["\t\n"]))
    data.append(foot)
    return "".join(data)
Example #16
def streaming_simulate_stream_from_text_file(input_dict, widget, stream=None):
    '''
    Simulates a tweet stream from a plain text file (one tweet per line).
    '''
    import datetime
    csvfile = safeOpen(input_dict['file'])
    tweet_data = csvfile.read()
    tweet_data = tweet_data.strip()
    tweets = tweet_data.split("\n")
    ltw = []
    i = 1
    for tw in tweets:
        tweet = {}
        tweet['id'] = i
        tweet['created_at'] = datetime.datetime.now()
        tweet['text'] = tw
        tweet['user'] = "******"
        tweet['lang'] = "bg"
        i = i + 1
        ltw.append(tweet)
    output_dict = {}
    output_dict['ltw'] = ltw
    return output_dict
Example #17
def streaming_simulate_stream_from_text_file(input_dict, widget, stream=None):
    '''
    Simulates a tweet stream from a plain text file (one tweet per line).
    '''
    import datetime
    csvfile = safeOpen(input_dict['file'])
    tweet_data = csvfile.read()
    tweet_data = tweet_data.strip()
    tweets = tweet_data.split("\n")
    ltw = []
    i = 1
    for tw in tweets:
        tweet = {}
        tweet['id'] = i
        tweet['created_at'] = datetime.datetime.now()
        tweet['text'] = tw
        tweet['user'] = "******"
        tweet['lang'] = "bg"
        i = i + 1
        ltw.append(tweet)
    output_dict = {}
    output_dict['ltw'] = ltw
    return output_dict
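
A usage sketch with a hypothetical file; widget is unused by this function and stream can stay None:

out = streaming_simulate_stream_from_text_file({'file': 'tweets.txt'},
                                               widget=None)  # hypothetical path
for tweet in out['ltw']:
    print tweet['id'], tweet['text']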
Example #18
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index,
                        start_tag, end_tag, separator):
    """
    Helper function for load tagged corpus. Function parses tab separated format.
    """

    fname = os.path.basename(path)
    f = safeOpen(path)

    data = []
    head = "<TEXT title=" + fname + ">\t\n"
    foot = "</TEXT>\t\n"
    data.append(head)

    sentence_counter = 0
    for line in f:
        splitted_line = re.split(separator, line.strip())
        if len(splitted_line) >= 4:
            new_line = (splitted_line[word_index] + "\t" +
                        splitted_line[token_index] + "\t" +
                        splitted_line[lemma_index] + "\t" +
                        splitted_line[pos_index] + "\t\n")
            data.append(new_line)
        else:
            added = False
            for el in splitted_line:
                if re.match(start_tag, el.strip()):
                    data.append("\t<S id=\"0_" + str(sentence_counter) +
                                "\">\t\n")
                    added = True
                    break
                elif re.match(end_tag, el.strip()):
                    data.append("\t</S>\t\n")
                    sentence_counter += 1
                    added = True
                    break
            if not added:
                data.append("\t".join(splitted_line + ["\t\n"]))
    data.append(foot)
    return "".join(data)
Example #19
def streaming_simulate_stream_from_csv(input_dict, widget, stream=None):
    '''
    Simulates a tweet stream from a CSV export, emitting up to 50 tweets per
    call and checkpointing the last emitted tweet id in StreamWidgetData.
    '''
    from streams.models import StreamWidgetData
    import datetime
    import csv
    csvfile = safeOpen(input_dict['csv'])
    csvreader = csv.reader(csvfile, delimiter=";", quotechar='"')
    rows = []
    ltw = []
    i = 0
    counter = 0
    started = False
    last_id = "not-started-yet"
    if stream is not None:
        try:
            swd = StreamWidgetData.objects.get(stream=stream, widget=widget)
            last_id = swd.value
        except StreamWidgetData.DoesNotExist:
            started = True
    else:
        started = True
    for row in csvreader:
        rows.append(row)
        if i != 0:
            rows[i][1] = datetime.datetime.strptime(rows[i][1],
                                                    "%m/%d/%Y %I:%M:%S %p")
            tweet = {}
            tweet['id'] = rows[i][0]
            tweet['created_at'] = rows[i][1]
            tweet['text'] = rows[i][3].encode('utf-8')
            tweet['user'] = rows[i][5].encode('utf-8')
            tweet['lang'] = rows[i][11]
            if started:
                counter = counter + 1
                ltw.append(tweet)
            if counter == 50 and started:
                started = False
                if stream is not None:
                    try:
                        swd = StreamWidgetData.objects.get(stream=stream,
                                                           widget=widget)
                        swd.value = tweet['id']
                        swd.save()
                    except StreamWidgetData.DoesNotExist:
                        swd = StreamWidgetData()
                        swd.stream = stream
                        swd.widget = widget
                        data = tweet['id']
                        swd.value = data
                        swd.save()
            if tweet['id'] == last_id:
                started = True
        i = i + 1
    if counter < 51 and stream is not None and started:
        try:
            swd = StreamWidgetData.objects.get(stream=stream, widget=widget)
            swd.value = "done"
            swd.save()
        except StreamWidgetData.DoesNotExist:
            swd = StreamWidgetData()
            swd.stream = stream
            swd.widget = widget
            data = "done"
            swd.value = data
            swd.save()
    output_dict = {}
    #print ltw
    #print len(ltw)
    output_dict['ltw'] = ltw
    return output_dict
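
A usage sketch with a hypothetical CSV export. With stream=None there is no checkpoint, so each call returns the first batch of up to 50 tweets; with a real stream and widget, the StreamWidgetData record stores the last emitted tweet id so successive calls resume where the previous batch stopped:

out = streaming_simulate_stream_from_csv({'csv': 'tweets.csv'},
                                         widget=None)  # hypothetical path
tweets = out['ltw']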
Example #20
def file_to_string(input_dict):
    '''
    Opens the file and reads its contents into a string.
    '''
    f = safeOpen(input_dict['file'])
    output_dict = {}
    output_dict['string'] = f.read()
    return output_dict
Example #21
def streaming_simulate_stream_from_csv(input_dict, widget, stream=None):
    '''
    Simulates a tweet stream from a CSV export, emitting up to 50 tweets per
    call and checkpointing the last emitted tweet id in StreamWidgetData.
    '''
    from streams.models import StreamWidgetData
    import datetime
    import csv
    csvfile = safeOpen(input_dict['csv'])
    csvreader = csv.reader(csvfile, delimiter=";", quotechar='"')
    rows = []
    ltw = []
    i = 0
    counter = 0
    started = False
    last_id = "not-started-yet"
    if stream is not None:
        try:
            swd = StreamWidgetData.objects.get(stream=stream, widget=widget)
            last_id = swd.value
        except StreamWidgetData.DoesNotExist:
            started = True
    else:
        started = True
    for row in csvreader:
        rows.append(row)
        if i != 0:
            rows[i][1] = datetime.datetime.strptime(rows[i][1],
                                                    "%m/%d/%Y %I:%M:%S %p")
            tweet = {}
            tweet['id'] = rows[i][0]
            tweet['created_at'] = rows[i][1]
            tweet['text'] = rows[i][3].encode('utf-8')
            tweet['user'] = rows[i][5].encode('utf-8')
            tweet['lang'] = rows[i][11]
            if started:
                counter = counter + 1
                ltw.append(tweet)
            if counter == 50 and started:
                started = False
                if stream is not None:
                    try:
                        swd = StreamWidgetData.objects.get(stream=stream,
                                                           widget=widget)
                        swd.value = tweet['id']
                        swd.save()
                    except StreamWidgetData.DoesNotExist:
                        swd = StreamWidgetData()
                        swd.stream = stream
                        swd.widget = widget
                        data = tweet['id']
                        swd.value = data
                        swd.save()
            if tweet['id'] == last_id:
                started = True
        i = i + 1
    if counter < 51 and stream is not None and started:
        try:
            swd = StreamWidgetData.objects.get(stream=stream, widget=widget)
            swd.value = "done"
            swd.save()
        except StreamWidgetData.DoesNotExist:
            swd = StreamWidgetData()
            swd.stream = stream
            swd.widget = widget
            data = "done"
            swd.value = data
            swd.save()
    output_dict = {}
    #print ltw
    #print len(ltw)
    output_dict['ltw'] = ltw
    return output_dict
Example #22
def file_to_string(input_dict):
    '''
    Opens the file and reads its contents into a string.
    '''
    f = safeOpen(input_dict['file'])
    output_dict = {}
    output_dict['string'] = f.read()
    return output_dict