def pre_processing_pipeline_text(X_train, X_test):
    # pp: project preprocessing module, imported elsewhere.
    # Apply the same cleaning chain to the train and test splits.
    x_train_process = []
    for text in X_train:
        tmp = pp.to_lower_case(text)
        tmp = pp.substitute_thousands(tmp)
        tmp = pp.fix_common_mistakes(tmp)
        tmp = pp.unstack(tmp)
        tmp = pp.remove_white_space(tmp)
        tmp = pp.remove_punctuation(False, tmp)
        tmp = pp.clean_text(False, tmp)
        tmp = pp.stemming(tmp)  # TODO try without it
        x_train_process.append(tmp)

    x_test_process = []
    for text in X_test:
        tmp = pp.to_lower_case(text)
        tmp = pp.substitute_thousands(tmp)
        tmp = pp.fix_common_mistakes(tmp)
        tmp = pp.unstack(tmp)
        tmp = pp.remove_white_space(tmp)
        tmp = pp.remove_punctuation(False, tmp)
        tmp = pp.clean_text(False, tmp)
        tmp = pp.stemming(tmp)  # TODO try without it
        x_test_process.append(tmp)

    return x_train_process, x_test_process
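# Sketch, not part of the original code: the two loops above apply the identical
# pp.* chain, so they could share one helper (assuming the same pp module is in
# scope). The names below are hypothetical.
def _apply_text_pipeline(texts):
    processed = []
    for text in texts:
        tmp = pp.to_lower_case(text)
        tmp = pp.substitute_thousands(tmp)
        tmp = pp.fix_common_mistakes(tmp)
        tmp = pp.unstack(tmp)
        tmp = pp.remove_white_space(tmp)
        tmp = pp.remove_punctuation(False, tmp)
        tmp = pp.clean_text(False, tmp)
        tmp = pp.stemming(tmp)  # TODO try without it
        processed.append(tmp)
    return processed

def pre_processing_pipeline_text_deduped(X_train, X_test):
    # Guarantees train and test receive exactly the same preprocessing.
    return _apply_text_pipeline(X_train), _apply_text_pipeline(X_test)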
def read_unstructure_texts(files):
    texts = []
    for file in files:
        with open(file, encoding='utf8') as f:
            data = f.read()
        data = stemming(remove_stop_words(tokenize(data)))
        texts.append(data)
    return texts
def pre_processing_pipeline(df):
    # Chain the steps on a local variable: writing to df.loc and then reading
    # row['text'] again would keep re-processing the original, unmodified string,
    # so only the last step would take effect.
    for index, row in df.iterrows():
        text = pp.to_lower_case(row['text'])
        text = pp.substitute_thousands(text)
        text = pp.fix_common_mistakes(text)
        text = pp.unstack(text)
        text = pp.remove_white_space(text)
        text = pp.remove_punctuation(False, text)
        text = pp.clean_text(False, text)
        text = pp.stemming(text)  # TODO try without it
        df.loc[index, 'text'] = text
    return df
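# Sketch, not part of the original code: the same per-row pipeline expressed with
# DataFrame.apply instead of iterrows/loc. `process_text` is a hypothetical
# stand-in for the chained pp.* calls above.
import pandas as pd

def pre_processing_pipeline_apply(df, process_text):
    df = df.copy()
    df['text'] = df['text'].apply(process_text)
    return df

# Usage with a trivial stand-in pipeline (lower-case and collapse whitespace):
demo = pre_processing_pipeline_apply(pd.DataFrame({'text': ['  Hello   WORLD ']}),
                                     lambda s: ' '.join(s.lower().split()))
print(demo['text'].iloc[0])  # hello world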
def read_to_texts(files):
    texts = []
    for file in files:
        with open(file, encoding='utf8') as f:
            data = f.read()
        parsed_result = parse_paper(data)
        # Concatenate every parsed section body, then tokenize, drop stop words and stem.
        text = ''
        for section, content in parsed_result['structure'].items():
            text += content
        text = stemming(remove_stop_words(tokenize(text)))
        texts.append(text)
    return texts
import os
import json

def load_file(path):
    texts = []
    num_files = 0
    num_methods = 0
    num_abstract = 0
    paper_attributes = {}
    country_counter = {}  # dict, not list: used below as a per-country tally
    for folder in os.listdir(path):
        for file in os.listdir(os.path.join(path, folder)):
            num_files += 1
            if num_files <= NUM_START:
                continue
            parsed_result = file2text(os.path.join(path, folder, file))
            method_text = get_methods(parsed_result['text'])
            if method_text:
                token_text = stemming(remove_stop_words(tokenize(method_text)))
                texts.append(token_text)
                num_methods += 1
            author_countries = get_country(parsed_result['text'][:1000])
            if len(author_countries) > 1:
                num_abstract += 1
                paper_id = file.split('.')[0]
                paper_attributes[paper_id] = {'countries': author_countries}
                for c in author_countries:
                    if c in country_counter:
                        country_counter[c] += 1
                    else:
                        country_counter[c] = 1
            if num_files % 1000 == 0:
                print(num_files, num_methods, num_abstract)
            if num_files > NUM_STOP:
                with open(os.path.join(os.getcwd(), 'metadata', OUTPUT_FILE), 'w') as f:
                    json.dump(paper_attributes, f)
                print(country_counter)
                return
    print("finished extraction")
    print(num_files, num_methods, num_abstract)
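# Sketch, not part of the original code: collections.Counter can replace the manual
# if/else tally on country_counter above. The input lists here are made up.
from collections import Counter

country_counter = Counter()
for author_countries in [['US', 'DE'], ['DE', 'FR']]:  # hypothetical get_country() output
    country_counter.update(author_countries)
print(country_counter)  # Counter({'DE': 2, 'US': 1, 'FR': 1})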
import json
import numpy as np

data = json.load(instances)

word_list = []  # create an array with all the words
tags = []
xy = []
for i in data['data']:
    tag = i['tag']
    tags.append(tag)
    for user_response in i['user_responses']:
        normalized = normalization(user_response)
        words = tokenization(normalized)
        word_list.extend(words)
        xy.append((words, tag))  # array of user responses with their respective tags

word_list = [stemming(word) for word in word_list]
word_list = sorted(set(word_list))
print(tags)
print(word_list)
print(xy)

x_train = []
y_train = []
for (tokenized, tag) in xy:
    bag = bag_of_words(tokenized, word_list)
    x_train.append(bag)
    tag_label = tags.index(tag)
    y_train.append(tag_label)
x_train = np.array(x_train)
word_list = []  # create an array with all the words
tags = []
xy = []
for i in data['data']:
    tag = i['tag']
    tags.append(tag)
    for user_response in i['user_responses']:
        normalized = normalization(user_response)
        words = tokenization(normalized)
        word_list.extend(words)  # extend, not append: we don't want a nested list inside the list
        xy.append((words, tag))  # array of user responses with their respective tags

word_list = [stemming(word) for word in word_list]  # stem each word to strip symbols
word_list = sorted(set(word_list))  # remove duplicate elements
print(tags)
print(word_list)
print(xy)

# Build the training data
x_train = []
y_train = []
for (tokenized, tag) in xy:
    bag = bag_of_words(tokenized, word_list)
    x_train.append(bag)
    tag_label = tags.index(tag)
    y_train.append(tag_label)
x_train = np.array(x_train)
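# Sketch, not part of the original code: bag_of_words is defined elsewhere in the
# project; a plausible minimal version (assumption) maps a tokenized response onto
# a fixed-length binary vector over word_list. The real helper presumably stems its
# input first, since word_list is stemmed above but `tokenized` is not.
import numpy as np

def bag_of_words_sketch(tokenized_words, word_list):
    bag = np.zeros(len(word_list), dtype=np.float32)
    token_set = set(tokenized_words)
    for idx, word in enumerate(word_list):
        if word in token_set:
            bag[idx] = 1.0
    return bag

print(bag_of_words_sketch(['hi', 'there'], ['bye', 'hi', 'there']))  # [0. 1. 1.]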
def tokenize_text(text):
    token_text = stemming(remove_stop_words(tokenize(text)))
    filtered_text = remove_lf_words(token_text, 2)
    return filtered_text