Example #1
import re

# clean_sentence_apache is a project-specific helper assumed to be importable here.


def preprocess_text(article, truncate=False, return_string=False):
    '''
    Cleans up articles by removing page marker junk, 
    unicode formatting, and extra whitespace; 
    re-joining words split (hyphenated) at end of line; 
    tokenizing sentences into words using the Apache Lucene Tokenizer (same as JSTOR); 
    lower-casing words; 
    and removing junk formatting words and junk sentence fragments. 
    Number, acronym, stopword, and proper-noun removal are all disabled 
    in the clean_sentence_apache call below.
    
    Args:
        article (str): raw OCR'd academic article text
        truncate (False or int): if an integer, keep only the first `truncate` words of the article (like an abstract)
        return_string (bool): whether to return a single str instead of a list of str
        
    Returns:
        str or list of str: if a list, each element is a word
    '''

    if truncate:
        return_string_temp = False  # need to return tokenized version to count words in article
    else:
        return_string_temp = return_string

    # Remove page marker junk
    article = article.replace('<plain_text><page sequence="1">', '')
    article = re.sub(r'</page>(\<.*?\>)', ' \n ', article)

    article = clean_sentence_apache(article,
                                    unhyphenate=True,
                                    remove_numbers=False,
                                    remove_acronyms=False,
                                    remove_stopwords=False,
                                    remove_propernouns=False,
                                    return_string=return_string_temp)

    if truncate:
        article = article[:truncate]  # keep only the first `truncate` words of the article

    if truncate and return_string:  # join into string here if not earlier
        article = ' '.join(article)

    return article
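
# Usage sketch (illustrative only): the file path below is hypothetical, and
# clean_sentence_apache must be importable for preprocess_text to run.
with open('../../../jstor_data/ocr/example_article.txt', 'r') as f:
    raw = f.read()

word_list = preprocess_text(raw, truncate=250)  # list of the first 250 cleaned words
abstract = preprocess_text(raw, truncate=250, return_string=True)  # same words, joined into one string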
Example #2
# tqdm and NLTK's sent_tokenize are assumed available; unicode_make, punctstr_make,
# clean_sentence_apache, and quickpickle_dump are project-specific helpers assumed to be
# imported, and `df` (a DataFrame with a 'text' column) is assumed to be built earlier.
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

unicode_list = unicode_make()
punctstr = punctstr_make()

print("Stopwords, Unicodes, Punctuations lists creation complete!")

# Build word2vec training input: a flat list of tokenized sentences and a per-document nested list
whole_text_unnested = []
whole_text_nested = []
tqdm.pandas(desc="Cleaning text")

for school in tqdm(df['text'], desc="Cleaning text"):
    doc = []
    for chunk in school.split("\n"):
        for sent in sent_tokenize(chunk):
            sent = clean_sentence_apache(sent,
                                         unhyphenate=True,
                                         remove_propernouns=False,
                                         remove_acronyms=False)
            sent = [word for word in sent if word != '']
            if len(sent) > 0:
                whole_text_unnested.append(sent)
                doc.append(sent)
    whole_text_nested.append(doc)

print("Saving the Cleaned Sentences as lists...")
print("Saving List 1: Flattened list")
quickpickle_dump(
    whole_text_unnested,
    "../../../models_storage/word_embeddings_data/cleaned_text_flat_2020_oct17_1990.pkl"
)
print("Pickle file 1 saved!")
print("Saving List 2: Nested list")
Example #3
import re
from os import listdir
from os.path import isfile, join

import pandas as pd

# clean_sentence_apache and get_maxlen are project-specific helpers assumed to be importable here.


def preprocess_text(article,
                    shorten=False,
                    longest=999999,
                    shortest=0,
                    maxlen=999999,
                    minlen=0):
    '''
    Cleans up articles by removing page marker junk, 
    unicode formatting, and extra whitespace; 
    re-joining words split (hyphenated) at end of line; 
    removing numbers; 
    tokenizing sentences into words using the Apache Lucene Tokenizer (same as JSTOR); 
    lower-casing words; 
    and removing junk formatting words and junk sentence fragments. 
    Acronym, stopword, and proper-noun removal are disabled in the 
    clean_sentence_apache calls below.
    
    Args:
        article (str): raw article text; many sentences with punctuation, often long
        shorten (bool): if True, truncate the article to at most maxlen words
        longest (int): word count of the longest article in the corpus (computed elsewhere)
        shortest (int): word count of the shortest article in the corpus (depends on filtering)
        maxlen (int): maximum number of words to return per article; defaults to a huge number, set lower if shorten is True
        minlen (int): minimum number of words to return per article; articles at or below this length are never shortened
        
    Returns:
        list of lists of str: each element of list is a sentence, each sentence is a list of words
    '''

    # Remove page marker junk
    article = article.replace('<plain_text><page sequence="1">', '')
    article = re.sub(r'</page>(\<.*?\>)', ' \n ', article)
    article = re.sub(r'<.*?>', '', article)  # remove any remaining angle-bracket tags
    article = re.sub(r'\\\w+(\[.*?\])*(\{.*?\})*', '', article)  # remove LaTeX-style markup commands

    # Compute maximum length for this article: from minlen to maxlen, gradated depending on longest
    if shorten:
        article_length = len(article.split())  # count words by splitting on whitespace

        if article_length > minlen:  # article is long enough to shorten: decide how many words to extract
            maxlen = get_maxlen(article_length, longest, shortest, maxlen, minlen)
        else:  # article is at or below the minimum length: keep the whole thing
            shorten = False

    doc = []  # list to hold tokenized sentences making up article
    numwords = 0  # initialize word counter

    if shorten:
        for sent in article.split('\n'):
            # Tokenize and clean the sentence, dropping empty strings
            sent = [
                word
                for word in clean_sentence_apache(sent,
                                                  unhyphenate=True,
                                                  remove_numbers=True,
                                                  remove_acronyms=False,
                                                  remove_stopwords=False,
                                                  remove_propernouns=False,
                                                  return_string=False)
                if word != ''
            ]

            if len(sent) > 0:
                gap = int(maxlen - numwords)  # word budget remaining under maxlen
                if len(sent) > gap:  # sentence would exceed the budget: trim it
                    sent = sent[:gap]
                doc.append(sent)
                numwords += len(sent)

            if numwords >= maxlen:  # word budget reached: stop adding sentences
                break

    else:  # take the whole article (don't shorten)
        for sent in article.split('\n'):
            sent = [
                word
                for word in clean_sentence_apache(sent,
                                                  unhyphenate=True,
                                                  remove_numbers=True,
                                                  remove_acronyms=False,
                                                  remove_stopwords=False,
                                                  remove_propernouns=False,
                                                  return_string=False)
                if word != ''
            ]  # remove empty strings

            if len(sent) > 0:
                doc.append(sent)

    return doc
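
# Illustrative sketch only: the project's actual get_maxlen is defined elsewhere and may
# differ. This shows one plausible reading of "gradated": linearly interpolate the word
# budget between minlen and maxlen according to where this article's length falls between
# the corpus's shortest and longest articles. The name get_maxlen_sketch is hypothetical.
def get_maxlen_sketch(article_length, longest, shortest, maxlen, minlen):
    if longest <= shortest:  # degenerate corpus statistics: just fall back to maxlen
        return maxlen
    fraction = (article_length - shortest) / (longest - shortest)
    fraction = min(max(fraction, 0.0), 1.0)  # clamp to [0, 1]
    return int(minlen + fraction * (maxlen - minlen))
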
# `colnames` (column names for the filtered index CSV) and `ocr_wd` (the OCR directory path)
# are assumed to be defined earlier in the script.
articles = pd.read_csv("../../../models_storage/word_embeddings_data/filtered_index.csv", names=colnames, header=None)
files_to_be_opened = ["../../../jstor_data/ocr/" + file + '.txt' for file in articles.file_name]
all_files = ['../../../jstor_data/ocr/' + f for f in listdir(ocr_wd) if isfile(join(ocr_wd, f))]

files = [file for file in all_files if file in files_to_be_opened]

# Initialize two lists: one for the raw text of each file, one for its filename
text_ls = []
filename_ls = []
index = 1
for file in files:
    with open(file, 'r') as myfile:
        data = myfile.read()
    data = data.replace('<plain_text><page sequence="1">', '')
    data = re.sub(r'</page>(\<.*?\>)', ' \n ', data)
    data = clean_sentence_apache(data, unhyphenate=True, remove_propernouns=False, remove_acronyms=False, return_string=True)
    text_ls.append(data)
    filename_ls.append(file.replace('../ocr/', ''))
    if index % 1000 == 0:
        print("Cleaned ", index, " documents.") 
    index += 1

print("Text Cleaning completed!")

d = {'filename': filename_ls, 'text': text_ls}
df = pd.DataFrame(d)

print("Shortening texts...")

df["edited_filename"] = df['filename'].apply(lambda x: x[40:-4])
df.text = df.text.apply(lambda x: x[:10000] if len(x) > 10000 else x) #cutting down to 10000 words max
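
# The character-based slice above is a rough cap. If a cap of 10,000 *words* is what is
# intended, a split-and-rejoin version could be used instead. This is an illustrative
# sketch, not the project's actual step; `truncate_words` is a hypothetical helper.
def truncate_words(text, max_words=10000):
    """Keep at most `max_words` whitespace-separated tokens of `text`."""
    words = text.split()
    return ' '.join(words[:max_words]) if len(words) > max_words else text

# df.text = df.text.apply(truncate_words)  # word-based alternative to the slice above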