Example #1
from nltk.tokenize.treebank import TreebankWordDetokenizer


def remove_duplicate_tokens(input_string):
    # Keep only the first occurrence of each space-separated token.
    refined_phrase_list = []
    new_phrase_list = input_string.split(' ')
    for token in new_phrase_list:
        if token not in refined_phrase_list:
            refined_phrase_list.append(token)
    refined_string = TreebankWordDetokenizer().detokenize(refined_phrase_list)
    refined_string = refined_string.strip()
    return refined_string
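A minimal usage sketch; the sample sentence is invented for illustration.

print(remove_duplicate_tokens("the cat sat on the mat on the rug"))
# -> "the cat sat on mat rug"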
Example #2
from nltk.tokenize.treebank import TreebankWordDetokenizer


def remove_duplicate_tokens(input_string):
    """Removes duplicate tokens from the input string, unless the
    duplicate is explicitly permitted.

    :param input_string: string to deduplicate
    :return: output string without duplicate tokens, unless allowed
    :rtype: str
    """

    refined_phrase_list = []
    new_phrase_list = input_string.split(' ')
    for token in new_phrase_list:
        if token not in refined_phrase_list:
            refined_phrase_list.append(token)
    refined_string = TreebankWordDetokenizer().detokenize(refined_phrase_list)
    refined_string = refined_string.strip()

    # Restore permitted duplicate tokens (further such tokens could
    # later be stored in a predefined resource)
    if "gallus gallus" in input_string \
            and "gallus gallus" not in refined_string:
        refined_string = refined_string.replace("gallus", "gallus gallus")

    return refined_string
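A quick check of the permitted-duplicate branch; the sample input is invented for illustration.

print(remove_duplicate_tokens("the gallus gallus gallus domesticus"))
# Deduplication collapses the repeated "gallus" to a single token, then
# the permitted bigram "gallus gallus" is restored:
# -> "the gallus gallus domesticus"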
Example #3
import re

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer


def basic_math_changes(inSentence=None):
    # Lowercase the sentence and drop any non-ASCII characters.
    new_line = inSentence.lower().encode("ascii", errors="ignore").decode()
    new_line = new_line.replace("=", " equals ")
    new_line = new_line.replace("^", " to the power of ")
    new_line = new_line.replace("+", " plus ")
    new_line = new_line.replace("-", " minus ")
    new_line = new_line.replace("*", " times ")
    new_line = re.sub(r"(\S+)\((.+?)\)",r"\1 of \2",new_line) #handle f(something), ...
    #Other basic math changes here
    new_line = new_line.replace("that's"," that is ")
    new_line = new_line.replace("there's"," there is ")
    new_line = new_line.replace("let's"," let us ")
    new_line = new_line.replace("here's"," here is ")
    new_line = new_line.replace("it's"," it is ")
    new_line = new_line.replace("y'all"," you all ")
    new_line = new_line.replace("can'"," cannot ")
    new_line = new_line.replace("i'd"," i would ")
    new_line = new_line.replace(" im "," i am ")
    new_line = re.sub(r"(pi)[\/]", r" pi over ",new_line)
    new_line = re.sub(r"(pi)[\+]", r" pi plus ",new_line)
    new_line = re.sub(r"(pi)[\-]", r" pi minus ",new_line)
    new_line = re.sub(r"(pi)[\*]", r" pi times ",new_line)
    new_line = re.sub(r"(\d+)(pi)", r"\1pi ",new_line)
    new_line = re.sub(r"(pi)[\^]", r" pi to the power of  ",new_line)

    # Expand clitic tokens produced by the tokenizer ('ll, 'm, n't, 're, 've)
    sent_tokens = word_tokenize(new_line)
    sent_tokens = list(map(lambda x: x if not x == "'ll" else " will ", sent_tokens))
    sent_tokens = list(map(lambda x: x if not x == "'m" else " am ", sent_tokens))
    sent_tokens = list(map(lambda x: x if not x == "n't" else " not ", sent_tokens))
    sent_tokens = list(map(lambda x: x if not x == "'re" else " are ", sent_tokens))
    sent_tokens = list(map(lambda x: x if not x == "'ve" else " have ", sent_tokens))
    sent_tokens = list(map(try_number_replace_token, sent_tokens))  # helper defined elsewhere in the source
    new_line = TreebankWordDetokenizer().detokenize(sent_tokens)
    new_line = re.sub('  +', ' ', new_line)  # replace multiple spaces with a single space
    new_line = new_line.strip()
    return new_line
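A smoke test, assuming a pass-through stub for the missing try_number_replace_token helper (the real one, defined elsewhere in the source, presumably spells out numbers):

try_number_replace_token = lambda tok: tok  # stub for the helper defined elsewhere

print(basic_math_changes("It's 2pi^2 + x*y"))
# -> roughly "it is 2 pi to the power of 2 plus x times y"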
Example #4
            # Record the position of the token that follows each chapter heading.
            for i, tok in enumerate(s):
                if tok == "CHAPTER" or tok == "Chapter":
                    c_indices.append(True)
                    indices.append(i + 1)
            # Delete from the end so the earlier indices stay valid.
            for i in reversed(indices):
                if isinstance(deromanize(s[i]), int) or s[i].isdigit():
                    del s[i]      # remove the chapter number
                    del s[i - 1]  # remove "Chapter"

            n = TreebankWordDetokenizer().detokenize(s)
        #n = n.replace("CHAPTER", "")
        #n = n.replace("Chapter", "")
        n = n.replace("--", " - ")
        n = n.replace("_", "")
        n = n.strip()

        #n = " ".join(n.split())
        nl.append(n)

    # Build the output filename by joining the tokenized, lowercased title with underscores.
    tok_title = tokenizer.tokenize(t.lower())
    output_title = "_".join(tok_title)
    print(output_title)
    with open(output_title, "w", encoding="UTF-8") as f:
        for l in nl:
            f.write(l + "\n")
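This example is a fragment: s, nl, tokenizer, t, and deromanize come from the surrounding code. Below is a self-contained sketch of the chapter-stripping step, with a hypothetical stand-in for deromanize.

def deromanize(tok):
    # Hypothetical stand-in: convert a Roman numeral to an int, else return the token.
    numerals = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    if tok and all(ch in numerals for ch in tok.upper()):
        total = 0
        for a, b in zip(tok.upper(), tok.upper()[1:] + " "):
            # Subtract when a smaller numeral precedes a larger one (e.g. IV).
            total += -numerals[a] if numerals.get(b, 0) > numerals[a] else numerals[a]
        return total
    return tok

s = ["CHAPTER", "IV", "It", "was", "a", "dark", "night"]
indices = [i + 1 for i, tok in enumerate(s) if tok in ("CHAPTER", "Chapter")]
for i in reversed(indices):
    if isinstance(deromanize(s[i]), int) or s[i].isdigit():
        del s[i]      # remove the chapter number
        del s[i - 1]  # remove "Chapter"
print(s)  # -> ['It', 'was', 'a', 'dark', 'night']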