import json
import os

def read_unstructure_texts(files):
    """Read raw text files and return a list of tokenized, stemmed texts."""
    texts = []
    for file in files:
        with open(file, encoding='utf8') as f:
            data = f.read()
        data = stemming(remove_stop_words(tokenize(data)))
        texts.append(data)
    return texts
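# tokenize(), remove_stop_words(), and stemming() are called throughout this
# file but defined elsewhere in the repo. A minimal sketch of the assumed
# pipeline, backed by NLTK (the NLTK choice and these bodies are assumptions,
# not the repo's actual helpers; requires nltk.download('punkt') and
# nltk.download('stopwords')):
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

_STOP_WORDS = set(stopwords.words('english'))
_STEMMER = PorterStemmer()

def tokenize(text):
    # Split raw text into lowercase word tokens.
    return word_tokenize(text.lower())

def remove_stop_words(tokens):
    # Keep alphabetic tokens that are not English stop words.
    return [t for t in tokens if t.isalpha() and t not in _STOP_WORDS]

def stemming(tokens):
    # Reduce each token to its Porter stem.
    return [_STEMMER.stem(t) for t in tokens]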
def read_to_texts(files):
    texts = []
    for file in files:
        with open(file, encoding='utf8') as f:
            data = f.read()
        parsed_result = parse_paper(data)
        # Concatenate the content of every section of the parsed paper.
        text = ''.join(parsed_result['structure'].values())
        text = stemming(remove_stop_words(tokenize(text)))
        texts.append(text)
    return texts
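# parse_paper() is defined elsewhere in the repo. Based on the access pattern
# above, it is assumed to return a dict whose 'structure' entry maps section
# titles to section text, e.g.:
#
#     {'structure': {'Introduction': '...', 'Methods': '...'}}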
def load_file(path):
    texts = []
    num_files = 0
    num_methods = 0
    num_abstract = 0
    paper_attributes = {}
    country_counter = {}  # must be a dict, not a list, to count per country
    for folder in os.listdir(path):
        for file in os.listdir(os.path.join(path, folder)):
            num_files += 1
            if num_files <= NUM_START:
                continue
            parsed_result = file2text(os.path.join(path, folder, file))
            method_text = get_methods(parsed_result['text'])
            if method_text:
                token_text = stemming(remove_stop_words(tokenize(method_text)))
                texts.append(token_text)
                num_methods += 1
            # Look for author countries in the head of the paper (roughly the
            # title/abstract block).
            author_countries = get_country(parsed_result['text'][:1000])
            if len(author_countries) > 1:
                num_abstract += 1
            paper_id = file.split('.')[0]
            paper_attributes[paper_id] = {'countries': author_countries}
            for c in author_countries:
                if c in country_counter:
                    country_counter[c] += 1
                else:
                    country_counter[c] = 1
            if num_files % 1000 == 0:
                print(num_files, num_methods, num_abstract)
            if num_files > NUM_STOP:
                with open(os.path.join(os.getcwd(), 'metadata', OUTPUT_FILE), 'w') as f:
                    json.dump(paper_attributes, f)
                print(country_counter)
                return
    print("finished extraction")
    print(num_files, num_methods, num_abstract)
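# NUM_START, NUM_STOP, and OUTPUT_FILE are referenced above but not defined in
# this file. Plausible placeholder values (assumptions, not the repo's real
# configuration):
NUM_START = 0
NUM_STOP = 10_000
OUTPUT_FILE = 'paper_countries.json'

# Example call, assuming a corpus laid out as papers/<folder>/<file>:
#     load_file('papers')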
def tokenize_text(text):
    """Tokenize, remove stop words, stem, and drop low-frequency words."""
    token_text = stemming(remove_stop_words(tokenize(text)))
    filtered_text = remove_lf_words(token_text, 2)
    return filtered_text
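# remove_lf_words() is called above but not defined in this file. Judging by
# its name and the threshold argument, it drops low-frequency tokens. A
# minimal sketch under that assumption (per-document counts; the repo's actual
# helper may count over the whole corpus instead):
from collections import Counter

def remove_lf_words(tokens, min_count):
    # Keep only tokens that occur at least min_count times in this document.
    counts = Counter(tokens)
    return [t for t in tokens if counts[t] >= min_count]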