Example #1
def realize_pas(pas):
    """
    Produce the sentence realization given a PAS.

    :param pas: PAS to realize.
    :return: realized PAS.
    """
    phrase = ""
    raw_pas = pas.raw_pas

    # Adding spaces to avoid errors like finding "he" in "the"
    # Removing punctuation, just need to find the position of the arguments.
    full_sent = remove_punct(" " + pas.sentence + " ")
    args_positions = []

    # For every value of the SRL dictionary, the position of this value is found in the original sentence and
    # placed in a list which is sorted by position afterwards.
    redundant_modals = ["going", "has", "have", "had"]
    for arg_key in raw_pas.keys():
        arg_val = raw_pas[arg_key]

        # Excluding "not" and modals that might be repeated in the verb fixing process.
        if arg_key != "AM-NEG" and not (arg_key == "AM-MOD"
                                        and arg_val in redundant_modals):
            arg = (" " + remove_punct(arg_val) + " ").replace("  ", " ")
            arg_index = full_sent.find(arg)
            # Verbs have to be fixed, as SENNA strips auxiliaries and modals.
            if arg_key == "V":
                arg_val = fix_verb(pas)
            arg_pos = (arg_index, arg_val)
            args_positions.append(arg_pos)

    # Sorting the arguments
    sorted_args = sorted(args_positions, key=lambda tup: tup[0])

    # Building the phrase by spacing the arguments.
    for arg_pos in sorted_args:
        phrase += arg_pos[1] + " "

    # De-spacing contracted forms (e.g. "I 'm" to "I'm").
    phrase = re.sub(r"([a-zA-Z0-9]) '([a-zA-Z0-9])", r"\1'\2", phrase)
    # De-spacing quotes, and normalizing parentheses to quotes (' " " '  to  '" "').
    phrase = re.sub(r"\" ([a-zA-Z0-9 ,']+) \"", r'"\1"', phrase)
    phrase = re.sub(r"\( ([a-zA-Z0-9 ,']+) \)", r'"\1"', phrase)
    # De-spacing punctuation.
    phrase = re.sub(r" [.,:;] ", r", ", phrase)

    return phrase
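
The ordering step above is the core of the function: each SRL argument is located in the (punctuation-free, space-padded) original sentence, and the arguments are re-joined in positional order. A standalone sketch of just that step, on a toy sentence and a toy SRL dictionary rather than the Pas object used above:

# Toy illustration of the position-sort-join step used by realize_pas.
sentence = " the cat chased the mouse "
raw_pas = {"A0": "the cat", "V": "chased", "A1": "the mouse"}

args_positions = sorted(
    (sentence.find(" " + val + " "), val) for val in raw_pas.values()
)
print(" ".join(val for _, val in args_positions))  # -> the cat chased the mouse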
Example #2
def tag_product(product_title):
    """
    Tag product_title and return the core term, brand name, and description.

	Input:
		string: product_title
	Return:
		string: core_term 
		string: brand 
		string: disc
	"""

    ## build a tagger model
    with open(Table_PATH, 'rb') as f:
        tag_table = pickle.load(f)
    tagger = nltk.UnigramTagger(model=tag_table,
                                backoff=nltk.DefaultTagger('D'))

    ## remove punctuations from product title
    product_title_tmp = remove_punct(product_title)
    ## convert plurals to singulars
    wnl = nltk.WordNetLemmatizer()
    product_words = [wnl.lemmatize(s) for s in product_title_tmp.split()]
    clean_title = ' '.join(product_words)

    ## build unigrams, bigrams, trigrams from which product
    ## attributes are to be extracted.
    unigrams = extract_words(product_words)
    bigrams = [' '.join(item) for item in ngrams(unigrams, 2)]
    trigrams = [' '.join(item) for item in ngrams(unigrams, 3)]

    ## Extract attributes from trigrams. If failed, extract from bigrams.
    ## If still failed, extract from unigrams. If still failed, set the
    ## last alpha noun as product core term and leave brand empty.
    core_term, brand = None, None
    core_term, brand = extract_attributes(trigrams, core_term, brand, tagger)
    if not core_term or not brand:
        core_term, brand = extract_attributes(bigrams, core_term, brand,
                                              tagger)
    if not core_term or not brand:
        core_term, brand = extract_attributes(unigrams, core_term, brand,
                                              tagger)
    if not core_term:
        pos_words = nltk.pos_tag(unigrams)
        for word, tag in pos_words[::-1]:
            if tag == 'NN' and word.isalpha():
                core_term = word
                break
    if not brand:
        brand = ''

    ## The words other than the core term and brand name are regarded as
    ## description information.
    try:
        disc = clean_title.replace(core_term, '').replace(brand, '')
        disc = ' '.join(w for w in disc.split())
    except Exception:
        # core_term may be None if no noun was found in the title.
        logging.info('Cannot find core terms from the product title')
        disc = ''

    return core_term, brand, disc
Example #3
    def pipeline(self, text):
        text = utils.remove_space(text)
        text = utils.remove_punct(text)
        text = utils.remove_contractions(text.lower(), contractions)
        text = utils.remove_url(text)
        text = utils.remove_html(text)
        text = utils.correct_spellings(text)

        return text
Example #4
def process_names(file_dir):
    """
	For the brand and product names containing '-', generate
	two names: one take '-' as ' ' and the other as ''

	e.g. 'e-book' --> 'e book' and 'ebook'

	The other punctuations except \. and \' are excluded.
	"""
    results = []
    with open(file_dir, 'r') as f:
        lines = f.readlines()
        for line in lines:
            if '-' in line:
                line1 = re.sub("-", ' ', line)
                results.append(remove_punct(line1))
                line2 = re.sub("-", '', line)
                results.append(remove_punct(line2))
            else:
                results.append(remove_punct(line))

    return list(set(results))
Example #5
def preprocess(text):
	delete_list = [",", "’"]
	tweet = utils.delete_characters_space(text, delete_list)
	word_list = tweet.split() 
	word_list = [utils.stem_word(correction.correction( \
                              utils.remove_punct(utils.remove_repeating_char(utils.remove_with_regex(word))))) \
                              for word in word_list]
	word_list = [word for word in word_list if len(word) > 1]
	word_list = utils.remove_words(word_list, STOPWORDS)

	sentence = ""
	for word in word_list:
		sentence = sentence + " " + word

	return(sentence)
Example #6
def cure_text(text):
    text = re.sub(r"&=\S+", "", text)
    text = re.sub(r"\[.+?\]", "", text)
    text = re.sub(r"@s:\S+", "", text)

    text = text.replace("+&", "")
    text = text.replace("xxx", "")
    text = text.replace("0", "")
    text = text.replace("&", "")
    text = text.replace("☺", "")
    text = text.replace("▔", "")
    text = text.replace("\n", " ")

    text = remove_punct(text)
    text = text.lower()
    return " ".join(map(lambda x: x.strip(), text.split(" ")))
Example #7
def prepare_commonvoice(commonvoice_location, audio_path, text_path,
                        lists_path, processes):
    for f in ['dev', 'test', 'train']:
        dst_list = os.path.join(lists_path, f"commonvoice-{f}.lst")
        dst_text = os.path.join(text_path, f"commonvoice-{f}.txt")
        if not os.path.exists(dst_list):
            to_list = partial(commonvoice_to_list, audio_path, f,
                              commonvoice_location)
            with Pool(processes) as p:
                rows = read_tsv(os.path.join(commonvoice_location, f"{f}.tsv"))
                samples = list(tqdm(
                    p.imap(to_list, rows),
                    total=len(rows),
                ))
            with open(dst_list, "w") as list_f:
                list_f.writelines(samples)

            with open(dst_list, "r") as list_f, open(dst_text, "w") as text_f:
                for line in list_f:
                    text_f.write(" ".join(line.strip().split(" ")[3:]) + "\n")

        else:
            print(f"{dst_list} exists, doing verify")
            new_list = []
            with open(dst_list, "r") as list_f:
                for line in list_f:
                    filename = line.split(" ")[1]
                    text = " ".join(line.strip().split(" ")[3:])
                    params = " ".join(line.strip().split(" ")[:3])
                    text = remove_punct(text)
                    line = f"{params} {text}\n"
                    if not os.path.exists(filename) or len(
                            text) < 2 or not alpha.match(text):
                        print(
                            f"{filename} does not exist or text is empty, text: {text}"
                        )
                    else:
                        new_list.append(line)
            with open(dst_list, "w") as list_f:
                list_f.writelines(new_list)

    print("Prepared CommonVoice", flush=True)
Example #8
def fix_verb(pas):
    """
    Fixes the verb by checking on previous verbs/auxiliaries in the original sentence.

    :param pas: PAS containing the verb to be fixed.
    :return: fixed verb.
    """
    raw_pas = pas.raw_pas
    pos = dict(pas.parts_of_speech)
    verb = raw_pas["V"]

    words = remove_punct(pas.sentence).split()
    verb_index = 0
    # Fetching the verb location in the original sentence.
    if verb in words:
        verb_index = words.index(verb)

    # Checking whether any of the 4 words preceding the verb are auxiliaries/modals.
    if verb in pos.keys():
        if pos[verb].startswith("VB"):
            verb_prefix = ""
            for i in range(1, 5):
                if verb_index - i >= 0:
                    if words[verb_index - i] in pos.keys():
                        if pos[words[verb_index - i]].startswith("VB") or words[verb_index - i] == "not" or \
                                words[verb_index - i] == "to":
                            verb_prefix = words[verb_index -
                                                i] + " " + verb_prefix
                        else:
                            break
                else:
                    break
            # Excluding the cases in which the only part added is "to".
            if not (verb_prefix.startswith("to")):
                verb = verb_prefix + verb
    return verb
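
A standalone sketch of the 4-word look-back performed above, on a toy word list and POS map instead of the Pas object the function expects:

# Walk backwards from the bare verb, collecting auxiliaries/modals, "not" and "to".
words = "he has not been going to school".split()
pos = {"he": "PRP", "has": "VBZ", "not": "RB", "been": "VBN",
       "going": "VBG", "to": "TO", "school": "NN"}
verb = "going"
verb_index = words.index(verb)

verb_prefix = ""
for i in range(1, 5):
    prev = verb_index - i
    if prev < 0:
        break
    if pos[words[prev]].startswith("VB") or words[prev] in ("not", "to"):
        verb_prefix = words[prev] + " " + verb_prefix
    else:
        break
print(verb_prefix + verb)  # -> has not been going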
Example #9
def cure_text(text):
    text = remove_punct(text)
    text = text.lower()
    return " ".join(map(lambda x: x.strip(), text.split(" ")))
Example #10
def extract_pas(sentences):
    """
    Extracts the PASs from a list of sentences (

    :param sentences: sentences from which to extract PAS.
    """
    # Compute the TFIDF vector of all terms in the document.
    tf_idfs = tf_idf(sentences, os.getcwd() + "/data/idfs.dat")

    # Longest sentence length needed afterwards for the length score.
    longest_sent_len = max(len(sent) for sent in sentences)

    pas_list = []
    for sent in sentences:
        # Ignoring short sentences (errors).
        if 3 < len(remove_punct(sent)) and len(sent) < 1000:
            sent_index = sentences.index(sent)

            # Substituting single quotes with double quotes to avoid errors with SRL.
            sent = re.sub(r"'([a-zA-Z0-9])([a-zA-Z0-9 ]+)([a-zA-Z0-9])'",
                          r'" \1\2\3 "', sent)

            annotations = _annotator.get_annoations(remove_punct(sent).split())
            # Getting SRL annotations from SENNA.
            sent_srl = annotations['srl']
            # Getting POS tags from SENNA.
            parts_of_speech = annotations['pos']

            for raw_pas in sent_srl:
                accept_pas = True
                out_of_order = 0
                chk_sent = remove_punct(sent)
                # Rejecting PASs whose arguments appear out of order w.r.t. the original sentence;
                # these represent about 10% of the total PASs, and 80% of them are incorrect.
                for arg in raw_pas.values():
                    # Replacing double spaces with a single space to avoid some arguments being ignored.
                    arg = remove_punct(arg.replace("  ", " "))

                    if chk_sent.find(arg) < 0:
                        accept_pas = False
                        out_of_order = 1
                        break

                if accept_pas:
                    pas = Pas(sent, parts_of_speech, sent_index,
                              sent_srl.index(raw_pas), raw_pas, out_of_order)
                    pas_list.append(pas)

    # Completing each PAS with its realization, embeddings and vector representation.
    # This process is done after the initialization, as all the other PASs are needed.
    realized_pass = []
    for pas in pas_list:
        realized_pass.append(realize_pas(pas))

    # Here the title is put together with the PASs to avoid starting another embedding process.
    realized_pass.append(sentences[0])
    pas_embeddings = sentence_embeddings(realized_pass)

    # Get the centrality scores for the pas embeddings
    pas_centralities = centrality_scores(pas_embeddings)

    for pas in pas_list:
        pas_index = pas_list.index(pas)
        pas.complete_pas(
            realized_pass[pas_index], pas_embeddings[pas_index],
            len(sentences), longest_sent_len, tf_idfs,
            pas_centralities[pas_index],
            np.inner(np.array(pas_embeddings[pas_index]),
                     np.array(pas_embeddings[-1])))

    return pas_list
Example #11
File: views.py Project: skyera/wikodemo
def prefix_suggestion(word):
    temp = remove_punct(word).lower()
    temp = re.split(' ', temp)[0:4]
    suggestion = [' '.join(temp[0:i]) for i in range(1, min(4, 1 + len(temp)))]
    return suggestion
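
A quick check of the behaviour above (assuming remove_punct only strips punctuation): at most the first three word prefixes are suggested.

print(prefix_suggestion("New York City subway map"))
# -> ['new', 'new york', 'new york city']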
Example #12
File: views.py Project: skyera/wikodemo
def keynorm(word):
    return remove_punct(word).lower()
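
All of the examples on this page call a remove_punct helper that is not shown here. A minimal sketch of one plausible implementation, as an assumption rather than the actual helper used by any of these projects:

import string

def remove_punct(text):
    # Strip ASCII punctuation; whitespace and alphanumerics are left untouched.
    return text.translate(str.maketrans('', '', string.punctuation))

print(remove_punct("e-book, v2!"))  # -> ebook v2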