Example #1
def read_unstructure_texts(files):
    texts = []
    for file in files:
        with open(file, encoding='utf8') as f:
            data = f.read()
        # Normalize each document: tokenize, drop stop words, then stem.
        data = stemming(remove_stop_words(tokenize(data)))
        texts.append(data)
    return texts
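
The helpers tokenize, remove_stop_words, and stemming are not defined in any of these examples. A minimal sketch of what they could look like, assuming an NLTK-based pipeline (the real implementations may differ):

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Requires one-time downloads: nltk.download('punkt'), nltk.download('stopwords')
_STOP_WORDS = set(stopwords.words('english'))
_STEMMER = PorterStemmer()

def tokenize(text):
    # Split lowercased text into word tokens.
    return nltk.word_tokenize(text.lower())

def remove_stop_words(tokens):
    # Keep alphabetic tokens that are not English stop words.
    return [t for t in tokens if t.isalpha() and t not in _STOP_WORDS]

def stemming(tokens):
    # Reduce each token to its Porter stem.
    return [_STEMMER.stem(t) for t in tokens]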
Example #2
def read_to_texts(files):
    texts = []
    for file in files:
        with open(file, encoding='utf8') as f:
            data = f.read()
        # Parse the paper into sections, then concatenate the section bodies.
        parsed_result = parse_paper(data)
        text = ''
        for section, content in parsed_result['structure'].items():
            text += content + ' '  # separator so words at section boundaries don't fuse
        text = stemming(remove_stop_words(tokenize(text)))
        texts.append(text)
    return texts
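
parse_paper is likewise defined elsewhere; from the loop above it is assumed to return a dict whose 'structure' entry maps section names to section text. A hypothetical usage sketch under that assumption:

# Assumed shape of parse_paper's result, inferred from read_to_texts:
# {'structure': {'abstract': '...', 'introduction': '...', ...}}
files = ['paper1.txt', 'paper2.txt']  # hypothetical input paths
texts = read_to_texts(files)
print(len(texts))  # one token list per paper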
Example #3
import json
import os

# NUM_START, NUM_STOP, and OUTPUT_FILE are module-level constants assumed
# to be defined elsewhere; file2text, get_methods, and get_country are
# external helpers.
def load_file(path):
    texts = []
    num_files = 0
    num_methods = 0
    num_abstract = 0
    paper_attributes = {}
    country_counter = {}  # was initialized as a list, but it is used as a dict below
    for folder in os.listdir(path):
        for file in os.listdir(os.path.join(path, folder)):
            num_files += 1
            # Skip files before the configured starting offset.
            if num_files <= NUM_START:
                continue
            parsed_result = file2text(os.path.join(path, folder, file))
            method_text = get_methods(parsed_result['text'])
            if method_text:
                token_text = stemming(remove_stop_words(tokenize(method_text)))
                texts.append(token_text)
                num_methods += 1
                # Look for author countries near the top of the paper.
                author_countries = get_country(parsed_result['text'][:1000])
                if len(author_countries) > 1:
                    num_abstract += 1
                    paper_id = file.split('.')[0]
                    paper_attributes[paper_id] = {'countries': author_countries}
                    for c in author_countries:
                        if c in country_counter:
                            country_counter[c] += 1
                        else:
                            country_counter[c] = 1
            # Periodic progress report.
            if num_files % 1000 == 0:
                print(num_files, num_methods, num_abstract)
            # Stop early: persist the collected metadata, then return.
            if num_files > NUM_STOP:
                with open(os.path.join(os.getcwd(), 'metadata', OUTPUT_FILE), 'w') as f:
                    json.dump(paper_attributes, f)
                print(country_counter)
                return texts
    print("finished extraction")
    print(num_files, num_methods, num_abstract)
    return texts
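
A small design note: the if/else tally in load_file can be replaced by collections.Counter, which returns 0 for missing keys. A self-contained sketch with a hypothetical input:

from collections import Counter

country_counter = Counter()
author_countries = ['US', 'KR', 'US']  # hypothetical value from get_country
country_counter.update(author_countries)
print(country_counter)  # Counter({'US': 2, 'KR': 1})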
Example #4
def tokenize_text(text):
    # Tokenize, remove stop words, and stem the raw text.
    token_text = stemming(remove_stop_words(tokenize(text)))
    # Drop low-frequency tokens (see the sketch of remove_lf_words below).
    filtered_text = remove_lf_words(token_text, 2)
    return filtered_text
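
remove_lf_words is not shown either. Judging by its name and the call remove_lf_words(token_text, 2), it presumably drops low-frequency tokens; a minimal reconstruction under that assumption:

from collections import Counter

def remove_lf_words(tokens, min_count):
    # Hypothetical reconstruction: keep only tokens that occur at least
    # min_count times in this document ("lf" read as "low-frequency").
    counts = Counter(tokens)
    return [t for t in tokens if counts[t] >= min_count]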