def __clean(q_text):
    """Run a question string through the full cleaning pipeline.

    Strips question marks, then applies the NL, train, and POS-tag
    cleaners in sequence and returns the result.
    """
    without_marks = q_text.replace('?', '')
    return clean_pos_tags(clean_to_train(clean_to_nl(without_marks)))
def find_short(coll, threshold, max_amount=-1):
    """Collect arguments whose cleaned text has at most `threshold` tokens.

    Scans `coll` and returns matching documents, stopping early once
    `max_amount` matches are gathered (-1 means no limit).
    """
    matches = []
    for candidate in coll.find():
        token_count = len(clean_to_nl(Argument.get_text(candidate)).split())
        if token_count <= threshold:
            matches.append(candidate)
        if len(matches) == max_amount:
            break
    return matches
def get_token_stats(arg):
    """Count placeholder-token occurrences in the argument's cleaned text.

    Returns a dict mapping each special token pattern (NUM, PERCENT, URL)
    to how many times it appears in the NL-cleaned text.
    """
    nl_text = clean_to_nl(Argument.get_text(arg))
    # re.findall compiles (and caches) the pattern internally, so the
    # explicit re.compile step of the original is unnecessary.
    return {
        token: len(re.findall(token, nl_text))
        for token in (NUM_TOKEN, PERCENT_TOKEN, URL_TOKEN)
    }
def init_sents_train(source_coll, destiny_coll, min_arg_length=25, max_args=-1):
    """Build the sentiment training collection from source arguments.

    For every source document whose NL-cleaned first-premise text has at
    least `min_arg_length` tokens, stores its `_id` together with the
    sentiment-cleaned text into `destiny_coll`. `max_args` caps how many
    source documents are scanned (-1 = scan everything).
    """
    data = []
    for i, arg in tqdm(enumerate(source_coll.find())):
        if i == max_args:
            break
        text = arg['premises'][0]['text']
        nl_text = clean_to_nl(text)
        if len(nl_text.split()) >= min_arg_length:
            data.append({
                '_id': arg['_id'],
                'text': clean_to_sentiment(text),
            })
    # Fix: insert_many raises InvalidOperation on an empty list (e.g. when
    # no argument reaches min_arg_length), so only write when data exists.
    if data:
        destiny_coll.insert_many(data)
def init_train(source_coll, destiny_coll, min_arg_length=25, max_args=-1):
    """Build the training-data component for CBOW and BM25.

    For every source document whose NL-cleaned first-premise text has at
    least `min_arg_length` tokens, stores its `_id` together with the
    train-cleaned text into `destiny_coll`. `max_args` caps how many
    source documents are scanned (-1 = scan everything).
    """
    data = []
    for i, arg in tqdm(enumerate(source_coll.find())):
        if i == max_args:
            break
        nl_text = clean_to_nl(arg['premises'][0]['text'])
        if len(nl_text.split()) >= min_arg_length:
            data.append({
                '_id': arg['_id'],
                'text': clean_to_train(nl_text),
            })
    # Fix: insert_many raises InvalidOperation on an empty list (e.g. when
    # no argument reaches min_arg_length), so only write when data exists.
    if data:
        destiny_coll.insert_many(data)