del tmp
# NOTE(review): this chunk is the interior of an enclosing function whose
# 'def' is outside this view; the unindented 'del tmp' above looks like a
# scrape/paste seam — confirm indentation against the full file.

    print("\nString pre processing for abstracts: lower and strip")
    # Lower-case, then whitespace-strip, every token of every sentence.
    sentences = [list(map(str.lower, x)) for x in sentences]
    sentences = [list(map(str.strip, x)) for x in sentences]

    tmp = []
    print(
        "\nString pre processing for abstracts: lemmatize and stop word removal"
    )
    # Lemmatize each token and drop stop words (default set plus the extra
    # 'stops' collection); stemming is disabled and NLTK downloads skipped —
    # assumes the required NLTK data is already present (TODO confirm).
    for string_list in tqdm(sentences, total=len(sentences)):
        tmp_list = [
            kw.string_pre_processing(x,
                                     stemming_method='None',
                                     lemmatization='DEF',
                                     stop_word_removal=True,
                                     stop_words_extra=stops,
                                     verbose=False,
                                     download_nltk=False) for x in string_list
        ]
        tmp.append(tmp_list)
    sentences = tmp.copy()
    del tmp
    gc.collect()

    tmp = []
    print("\nString pre processing for abstracts: null word removal")
    # Pre-processing can reduce a token to '' — filter those out.
    for string_list in tqdm(sentences, total=len(sentences)):
        tmp.append([x for x in string_list if x != ''])
    sentences = tmp.copy()
    del tmp
Example #2
0
# Strip the RDF English-language suffix ('@en .') from every label value.
cleaned = []
for row in tqdm(label_dict.tolist()):
    cleaned.append([row[0], row[1].replace('@en .', '')])
label_dict = cleaned
del cleaned

# Collapse repeated whitespace and convert British -> American spelling.
# NOTE(review): the pass is deliberately applied twice in the original —
# presumably to catch forms only exposed after the first round; confirm.
normalized = []
for key, value in tqdm(label_dict):
    once = ta.replace_british_american(strip_multiple_whitespaces(value),
                                       kd.gb2us)
    twice = ta.replace_british_american(strip_multiple_whitespaces(once),
                                        kd.gb2us)
    normalized.append([key, twice])
label_dict = normalized
del normalized

# lemmatization not really needed. Because it already has all versions
label_dict = [
    [key,
     ta.string_pre_processing(value,
                              stemming_method='None',
                              lemmatization='DEF',
                              stop_word_removal=True,
                              stop_words_extra=stops,
                              verbose=False,
                              download_nltk=False)]
    for key, value in tqdm(label_dict)
]

# Turn the [key, value] pairs into a lookup dict.
label_dict = {key: value for key, value in tqdm(label_dict)}

tmp = []
# Walk every row of the CSO dataframe. 'a' and 'b' are read but never used
# in the visible portion of the loop body — presumably consumed further
# down, past this chunk (TODO confirm against the full file).
# NOTE(review): the 'tmp = []' above is immediately shadowed by the
# re-assignments inside the loop; verify whether accumulation was intended.
for i, row in tqdm(cso.iterrows(), total=cso.shape[0]):
    a = row['a']
    b = row['b']

    print("\nTokenizing")
    # Re-build 'sentences' as lists of word tokens, one list per abstract.
    tmp = []
    for sentence in tqdm(sentences):
        tmp.append(word_tokenize(sentence))
    sentences = tmp.copy()
    del tmp

    print("\nString pre processing for abstracts: lower and strip")
    # Lower-case, then whitespace-strip, every token.
    sentences = [list(map(str.lower, x)) for x in sentences]
    sentences = [list(map(str.strip, x)) for x in sentences]
    
    tmp = []
    print("\nString pre processing for abstracts: lemmatize and stop word removal")
    # Lemmatize tokens and drop stop words (plus the extra 'stops' set);
    # stemming disabled, NLTK downloads skipped.
    for string_list in tqdm(sentences, total=len(sentences)):
        tmp_list = [kw.string_pre_processing(x,stemming_method='None',lemmatization='DEF',stop_word_removal=True,stop_words_extra=stops,verbose=False,download_nltk=False) for x in string_list]
        tmp.append(tmp_list)
    sentences = tmp.copy()
    del tmp
    gc.collect()
    
    tmp = []
    print("\nString pre processing for abstracts: null word removal")
    # Pre-processing can reduce a token to '' — filter those out.
    for string_list in tqdm(sentences, total=len(sentences)):
        tmp.append([x for x in string_list if x!=''])
    sentences = tmp.copy()
    del tmp
    
    print("\nThesaurus matching")
    # Map token spellings onto the project's AI-keyword thesaurus file.
    # Loop body continues past the end of this chunk.
    sentences = kw.thesaurus_matching(sentences,thesaurus_file='data/thesaurus/thesaurus_for_ai_keyword_with_() (training).csv')