Пример #1
0
def __clean(q_text):
    """Normalize a question string by running it through the cleaning pipeline.

    All literal question marks are removed first; the result is then passed,
    in this order, through ``clean_to_nl``, ``clean_to_train`` and
    ``clean_pos_tags`` (all defined elsewhere in the project).

    :param q_text: raw question text (must support ``str.replace``).
    :return: whatever ``clean_pos_tags`` returns for the processed text.
    """
    without_question_marks = q_text.replace('?', '')
    natural_language = clean_to_nl(without_question_marks)
    train_ready = clean_to_train(natural_language)
    return clean_pos_tags(train_ready)
Пример #2
0
def find_short(coll, threshold, max_amount=-1):
    """Collect documents from ``coll`` whose cleaned text is at most ``threshold`` words.

    The word count is taken by whitespace-splitting the result of
    ``clean_to_nl(Argument.get_text(doc))``.

    :param coll: object exposing ``find()`` that yields the documents to scan
        (presumably a MongoDB collection; verify against caller).
    :param threshold: maximum number of whitespace-separated words (inclusive).
    :param max_amount: stop as soon as this many documents were collected.
        The check runs only right after an append, so the default ``-1``
        (and likewise ``0``) never matches and the scan is unbounded.
    :return: list of the matching documents, in iteration order.
    """
    matches = []
    for candidate in coll.find():
        words = clean_to_nl(Argument.get_text(candidate)).split()
        if len(words) <= threshold:
            matches.append(candidate)
            # Limit is only evaluated after a successful append.
            if len(matches) == max_amount:
                break

    return matches
Пример #3
0
def get_token_stats(arg):
    """Count occurrences of the placeholder tokens in an argument's cleaned text.

    The text is obtained via ``clean_to_nl(Argument.get_text(arg))``. Each of
    ``NUM_TOKEN``, ``PERCENT_TOKEN`` and ``URL_TOKEN`` is used as a regular
    expression pattern, and the number of non-overlapping matches is counted.

    :param arg: argument object accepted by ``Argument.get_text``.
    :return: dict mapping each token constant to its match count.
    """
    nl_text = clean_to_nl(Argument.get_text(arg))

    tokens = (NUM_TOKEN, PERCENT_TOKEN, URL_TOKEN)
    # Compile every pattern up front, before any search is performed.
    patterns = [re.compile(token) for token in tokens]

    stats = {}
    for token, pattern in zip(tokens, patterns):
        stats[token] = len(pattern.findall(nl_text))
    return stats
Пример #4
0
def init_sents_train(source_coll, destiny_coll, min_arg_length=25, max_args=-1):
    """Fill ``destiny_coll`` with sentiment-cleaned texts of sufficiently long arguments.

    For every document yielded by ``source_coll.find()`` the text of its first
    premise (``arg['premises'][0]['text']``) is read. If the ``clean_to_nl``
    version of that text has at least ``min_arg_length`` whitespace-separated
    words, a record ``{'_id': ..., 'text': clean_to_sentiment(text)}`` is
    queued. Note that the stored text is derived from the raw premise text,
    not from the ``clean_to_nl`` output used for the length check.

    :param source_coll: object exposing ``find()`` that yields argument
        documents (presumably a MongoDB collection; verify against caller).
    :param destiny_coll: object exposing ``insert_many(list_of_dicts)``.
    :param min_arg_length: minimum word count (inclusive) for an argument to
        be kept.
    :param max_args: stop once this many source documents have been examined
        (kept or not). The default ``-1`` never equals the 0-based index, so
        all documents are processed.
    :return: None. Nothing is inserted when no argument qualifies.
    """
    data = []
    for i, arg in tqdm(enumerate(source_coll.find())):
        if i == max_args:
            break

        text = arg['premises'][0]['text']
        nl_text = clean_to_nl(text)
        if len(nl_text.split()) >= min_arg_length:
            data.append({
                '_id': arg['_id'],
                'text': clean_to_sentiment(text),
            })

    # pymongo's insert_many rejects an empty document list with an exception
    # (assuming destiny_coll is a pymongo collection), so skip the call when
    # no argument passed the length filter.
    if data:
        destiny_coll.insert_many(data)
Пример #5
0
def init_train(source_coll, destiny_coll, min_arg_length=25, max_args=-1):
    """Build the training-data component for CBOW and BM25.

    For every document yielded by ``source_coll.find()`` the text of its first
    premise (``arg['premises'][0]['text']``) is passed through
    ``clean_to_nl``. If the result has at least ``min_arg_length``
    whitespace-separated words, a record
    ``{'_id': ..., 'text': clean_to_train(nl_text)}`` is queued for insertion.

    :param source_coll: object exposing ``find()`` that yields argument
        documents (presumably a MongoDB collection; verify against caller).
    :param destiny_coll: object exposing ``insert_many(list_of_dicts)``.
    :param min_arg_length: minimum word count (inclusive) for an argument to
        be kept.
    :param max_args: stop once this many source documents have been examined
        (kept or not). The default ``-1`` never equals the 0-based index, so
        all documents are processed.
    :return: None. Nothing is inserted when no argument qualifies.
    """
    data = []
    for i, arg in tqdm(enumerate(source_coll.find())):
        if i == max_args:
            break

        nl_text = clean_to_nl(arg['premises'][0]['text'])
        if len(nl_text.split()) >= min_arg_length:
            data.append({
                '_id': arg['_id'],
                'text': clean_to_train(nl_text),
            })

    # pymongo's insert_many rejects an empty document list with an exception
    # (assuming destiny_coll is a pymongo collection), so skip the call when
    # no argument passed the length filter.
    if data:
        destiny_coll.insert_many(data)