Example #1
def abstract_page_scraper(abstract_url, abstract_input_tag_id,
                          abstracts_log_name, permanent_word_sorter_list,
                          trend_keywords, site_url_index, status_logger_name):
    '''This function scrapes the actual abstract of the specific paper
    that is being referenced within the list of abstracts'''
    abstract_page_scraper_status_key = "Abstract ID:" + " " + abstract_input_tag_id
    status_logger(status_logger_name, abstract_page_scraper_status_key)

    abstract_page_url = abstract_url + abstract_input_tag_id
    abstract_page = url_reader(abstract_page_url, status_logger_name)
    abstract_soup = page_souper(abstract_page, status_logger_name)
    title = title_scraper(abstract_soup, status_logger_name)
    abstract_date = abstract_date_scraper(title, abstract_soup,
                                          status_logger_name)
    '''Due to repeated attribute errors with respect to scraping the author's name, these failsafes had to be put in place.'''
    try:
        author = author_scraper(abstract_soup, status_logger_name)
    except AttributeError:
        author = "Author not available"
    '''Due to repeated attribute errors with respect to scraping the abstract, these failsafes had to be put in place.'''
    try:
        abstract = abstract_scraper(abstract_soup)
        abstract_word_extractor(abstract, title, abstract_date,
                                permanent_word_sorter_list, trend_keywords,
                                status_logger_name)
    except AttributeError:
        abstract = "Abstract not available"

    abstract_database_writer(abstract_page_url, title, author, abstract,
                             abstracts_log_name, abstract_date,
                             status_logger_name)
    analytical_abstract_database_writer(title, author, abstract,
                                        abstracts_log_name, status_logger_name)
Example #2
def abstract_database_writer(abstract_page_url, title, author, abstract,
                             abstracts_log_name, abstract_date,
                             status_logger_name):
    '''This function makes text files to contain the abstracts for future reference.
    It holds: 1) Title, 2) Author(s), 3) Date, 4) URL, 5) Abstract'''
    abstract_database_writer_start_status_key = "Writing" + " " + title + " " + "by" + " " + author + " " + "to disc"
    status_logger(status_logger_name,
                  abstract_database_writer_start_status_key)

    abstracts_csv_log = open(abstracts_log_name + '.csv', 'a')
    abstracts_txt_log = open(abstracts_log_name + '.txt', 'a')
    abstracts_txt_log.write("Title:" + " " + title)
    abstracts_txt_log.write('\n')
    abstracts_txt_log.write("Author:" + " " + author)
    abstracts_txt_log.write('\n')
    abstracts_txt_log.write("Date:" + " " + abstract_date)
    abstracts_txt_log.write('\n')
    abstracts_txt_log.write("URL:" + " " + abstract_page_url)
    abstracts_txt_log.write('\n')
    abstracts_txt_log.write("Abstract:" + " " + abstract)
    abstracts_csv_log.write(abstract)
    abstracts_csv_log.write('\n')
    abstracts_txt_log.write('\n' + '\n')
    abstracts_txt_log.close()
    abstracts_csv_log.close()

    abstract_database_writer_stop_status_key = "Written" + " " + title + " " + "to disc"
    status_logger(status_logger_name, abstract_database_writer_stop_status_key)
Example #3
def title_scraper(abstract_soup, status_logger_name):
    '''This function scrapes the title of the text from the abstract'''
    title_scraper_start_status_key = "Scraping the title of the abstract"
    status_logger(status_logger_name, title_scraper_start_status_key)
    '''Purpose of this block is to retrieve the title of the text even if an AttributeError arises'''
    try:
        title = str(
            abstract_soup.find('h1', {
                'class': 'c-article-title'
            }).text.encode('utf-8'))[1:]
        '''In case an incorrectly classified asset is to be scraped (Journal/Chapter as opposed to Article), go through this block in an attempt to retrieve the title.'''
    except AttributeError:
        try:
            title = str(
                abstract_soup.find('h1', {
                    'class': 'ChapterTitle'
                }).text.encode('utf-8'))[1:]
        except AttributeError:
            try:
                title = (abstract_soup.find('span', {
                    'class': 'JournalTitle'
                }).text)
            except AttributeError:
                title = "Title not available"

    title_scraper_end_status_key = "Scraped the title of the abstract"
    status_logger(status_logger_name, title_scraper_end_status_key)

    return title
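
As a rough usage sketch, assuming bs4 is available and that status_logger simply appends messages to a .txt log (both assumptions; a stub logger is used here):

# Hypothetical illustration only: a stub status_logger and a tiny HTML snippet
# standing in for a real Springer abstract page.
from bs4 import BeautifulSoup as bs

def status_logger(status_logger_name, status_key):
    # Stand-in for the project's status_logger; assumed to append to a .txt log.
    with open(status_logger_name + '.txt', 'a') as log_file:
        log_file.write(status_key + '\n')

sample_soup = bs("<h1 class='c-article-title'>Sample title</h1>", 'html.parser')
print(title_scraper(sample_soup, "status_log"))
# prints 'Sample title' (the str(...encode())[1:] round trip leaves the quote characters in place)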
Example #4
def list_cleaner(list_to_be_cleaned, status_logger_name):
    list_cleaner_start_status_key = "Cleaning the list of words generated"
    status_logger(status_logger_name, list_cleaner_start_status_key)
    '''This function cleans the list containing the words found in the abstract. It eliminates words found in
	another pre-defined list of words.'''
    words_to_be_eliminated = [
        "the", "of", "and", "in", "to", "a", "is", "for", "from", "with",
        "that", "by", "are", "on", "was", "as", "were", "url:", "abstract:",
        "abstract", "author:", "title:", "at", "be", "an", "during", "have",
        "this", "which", "study", "been", "species", "not", "has", "between",
        "using", "its", "also", "these", "this", "used", "over", "can",
        "within", "into", "all", "due", "use", "about", "a", 'it', 'their',
        "where", "we", "most", "may", "through", "though", "like", "or",
        "further", "e.g.", "along", "any", "those", "had", "toward", "due",
        "both", "some", "use", "even", "more", "but", "while", "pass", "well",
        "will", "when", "only", "after", "author", "title", "there", "our",
        "did", "much", "as", "if", "become", "still", "various", "very", "out",
        "they", "via", "available", "such", "than", "different", "many",
        "areas", "no", "one", "two", "small", "first", "other", "such", "-",
        "could", "studies", "high", "provide", "among", "highly", "no", "case",
        "across", "given", "need", "would", "under", "found", "low", "values",
        "xe2\\x80\\x89", "xa", "xc", "xb", "\xc2\xa0C\xc2\xa0ha\xe2\x88\x921",
        "suggest", "up", "'The", "area"
    ]
    cleaned_list_of_words_in_abstract = [
        item for item in list_to_be_cleaned
        if item not in words_to_be_eliminated
    ]

    list_cleaner_end_status_key = "Cleaned the list of words generated"
    status_logger(status_logger_name, list_cleaner_end_status_key)

    return cleaned_list_of_words_in_abstract
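
A quick illustrative call, with status_log standing in for a hypothetical status-log name:

sample_words = ["the", "growth", "of", "mangrove", "forests", "was", "rapid"]
print(list_cleaner(sample_words, "status_log"))
# -> ['growth', 'mangrove', 'forests', 'rapid']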
Example #5
def rm_original_folder(logs_folder_name, status_logger_name):
    '''This function deletes the logs folder generated once the .tar.gz file has been created.'''
    rm_original_folder_start_status_key = "Deleting files belonging to:" + " " + logs_folder_name
    status_logger(status_logger_name, rm_original_folder_start_status_key)

    command_to_rm_function = "rm -r" + " " + logs_folder_name

    os.system(command_to_rm_function)
Example #6
def tarballer(logs_folder_name, status_logger_name):
    '''This function prepares the tarball of the logs folder.'''
    tarballer_start_status_key = "Tarballing" + " " + logs_folder_name + " " + "into" + " " + logs_folder_name + ".tar.gz"
    status_logger(status_logger_name, tarballer_start_status_key)

    command_to_tar_function = "tar czf" + " " + logs_folder_name + ".tar.gz" + " " + logs_folder_name
    os.system(command_to_tar_function)

    tarballer_end_status_key = "Tarballed" + " " + logs_folder_name + " " + "into" + " " + logs_folder_name + ".tar.gz"
    status_logger(status_logger_name, tarballer_end_status_key)
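
The shell call above assumes a Unix-like environment with tar available on the PATH. A rough, portable alternative sketch using Python's standard tarfile module (not what this repository does) would be:

import tarfile

def tarballer_stdlib(logs_folder_name):
    # Hypothetical portable alternative: builds logs_folder_name.tar.gz without shelling out.
    with tarfile.open(logs_folder_name + ".tar.gz", "w:gz") as tar:
        tar.add(logs_folder_name)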
Example #7
def remove_stopwords(textual_data, status_logger_name):
	'''This function removes the standard set of stopwords from the corpus of abstract words,
	along with a number of additional project-specific words.'''
	remove_stopwords_start_status_key = "Removing stopwords"
	status_logger(status_logger_name, remove_stopwords_start_status_key)
	
	cleaned_textual_data = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in textual_data]

	remove_stopwords_end_status_key = "Removed stopwords"
	status_logger(status_logger_name, remove_stopwords_end_status_key)

	return cleaned_textual_data
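
The function relies on module-level simple_preprocess and stop_words objects. A hedged sketch of how they are presumably set up (the exact extra words added in the project are not reproduced here):

# Assumed setup, not taken verbatim from the repository.
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords  # requires the NLTK stopwords corpus to be downloaded

stop_words = stopwords.words('english')
stop_words.extend(['abstract', 'title', 'author'])  # hypothetical project-specific additions

print(remove_stopwords(["The growth of mangrove forests was rapid."], "status_log"))
# -> [['growth', 'mangrove', 'forests', 'rapid']]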
Example #8
def make_bigrams(textual_data, status_logger_name):
	'''Generates bigrams of word pairs that commonly occur together across the corpus'''
	make_bigrams_start_status_key = "Generating bigrams"
	status_logger(status_logger_name, make_bigrams_start_status_key)

	bigram_mod = bigram_generator(textual_data, status_logger_name)
	bigram_data = [bigram_mod[doc] for doc in textual_data]

	make_bigrams_end_status_key = "Generated bigrams"
	status_logger(status_logger_name, make_bigrams_end_status_key)

	return bigram_data
Example #9
def textual_data_trimmer(textual_dataframe, status_logger_name):
	'''Converts each of the abstracts in the file into a list element, of size = (number of abstracts)'''
	textual_data_trimmer_start_status_key = "Trimming data and preparing list of words"
	status_logger(status_logger_name, textual_data_trimmer_start_status_key)

	textual_data = textual_dataframe.values.tolist()

	textual_data_trimmer_end_status_key = "Trimmed data and prepared list of words"
	status_logger(status_logger_name, textual_data_trimmer_end_status_key)

	return textual_data
Example #10
def data_reader(abstracts_log_name, status_logger_name):
	'''This is where the file is parsed and passed on to the model'''
	data_reader_start_status_key = abstracts_log_name+".txt is being ported to dataframe"
	status_logger(status_logger_name, data_reader_start_status_key)

	textual_dataframe = pd.read_csv(abstracts_log_name+'_'+'CLEANED'+'.txt', delimiter="\t")

	data_reader_end_status_key = abstracts_log_name+".txt has been ported to dataframe"	
	status_logger(status_logger_name, data_reader_end_status_key)

	return textual_dataframe
Example #11
def analyzer_main(abstracts_log_name, status_logger_name):
    '''The main analyzer function; it is integrated into the Bias.py code'''
    analyzer_main_status_key = "Entered the Analyzer.py code."
    status_logger(status_logger_name, analyzer_main_status_key)
    '''Calling the pre-processing and transfer functions here'''
    abstracts_txt_file_name, abstracts_csv_file_name = analyzer_pre_processing(
        abstracts_log_name, status_logger_name)
    transfer_function(abstracts_txt_file_name, abstracts_csv_file_name,
                      status_logger_name)
    '''Logs the end of the Analyzer process in the status_logger'''
    analyzer_main_status_key = "Exiting the Analyzer.py code."
    status_logger(status_logger_name, analyzer_main_status_key)
Example #12
def analyzer_pre_processing(abstracts_log_name, status_logger_name):
    '''Carries out the pre-processing tasks, such as folder creation'''
    analyzer_pre_processing_status_key = "Carrying out pre-processing functions for analyzer"
    status_logger(status_logger_name, analyzer_pre_processing_status_key)
    '''This code strips the abstracts_log_name of its extension and adds a .csv to it'''
    abstracts_csv_file_name = (os.path.splitext(abstracts_log_name)[0]
                               ) + "_" + "FREQUENCY_CSV_DATA" + ".csv"
    abstracts_txt_file_name = abstracts_log_name + "_" + "CLEANED" + ".txt"

    analyzer_pre_processing_status_key = "Carried out pre-processing functions for analyzer"
    status_logger(status_logger_name, analyzer_pre_processing_status_key)
    return abstracts_txt_file_name, abstracts_csv_file_name
Example #13
def page_souper(page, status_logger_name):
    '''This function soups the webpage elements and provides the tags for search.
    Note: The appropriate encoding has to be picked before souping'''
    page_souper_start_status_key = "Souping page"
    status_logger(status_logger_name, page_souper_start_status_key)

    page_soup = bs(page, 'html.parser')

    page_souper_stop_status_key = "Souped page"
    status_logger(status_logger_name, page_souper_stop_status_key)

    return page_soup
Example #14
def bigram_generator(textual_data, status_logger_name):
	'''Generates the bigram model from the words that are in the corpus.
	Bigrams: words that occur together with a high frequency.'''
	bigram_generator_start_status_key = "Generating word bigrams"
	status_logger(status_logger_name, bigram_generator_start_status_key)
	
	bigram = gensim.models.Phrases(textual_data, min_count=5, threshold=100)
	bigram_mod = gensim.models.phrases.Phraser(bigram)

	bigram_generator_end_status_key = "Generated word bigrams"
	status_logger(status_logger_name, bigram_generator_end_status_key)	

	return bigram_mod
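
For intuition, a small hedged illustration of what the Phrases/Phraser pair does, using a toy corpus and lowered thresholds so a bigram actually fires (the production values above are min_count=5, threshold=100):

import gensim

toy_corpus = [["machine", "learning", "model"],
              ["machine", "learning", "is", "useful"],
              ["deep", "machine", "learning"]]
toy_bigram = gensim.models.Phrases(toy_corpus, min_count=1, threshold=1)
toy_bigram_mod = gensim.models.phrases.Phraser(toy_bigram)
print(toy_bigram_mod[["machine", "learning", "rocks"]])
# -> ['machine_learning', 'rocks'] (frequently co-occurring tokens are joined)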
Example #15
def word_sorter_list_generator(status_logger_name):
    word_sorter_list_generator_start_status_key = "Generating the permanent archival list"
    status_logger(status_logger_name,
                  word_sorter_list_generator_start_status_key)
    '''This function generates the list that holds the words and corresponding years of the
	abstract data before the actual recursion of scraping data from the website begins.'''
    word_sorter_list = []

    word_sorter_list_generator_exit_status_key = "Generated the permanent archival list"
    status_logger(status_logger_name,
                  word_sorter_list_generator_exit_status_key)

    return word_sorter_list
Example #16
def sent_to_words(textual_data, status_logger_name):
	'''Removes unnecessary characters and punctuation from the corpus. The resulting words are then tokenized.'''
	sent_to_words_start_status_key = "Tokenizing words"
	status_logger(status_logger_name, sent_to_words_start_status_key)

	for sentence in textual_data:
		yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

	'''Note: this is a generator; callers consume it, e.g. list(sent_to_words(textual_data, status_logger_name))'''
	sent_to_words_end_status_key = "Tokenized words"
	status_logger(status_logger_name, sent_to_words_end_status_key)
Example #17
def abstract_year_list_post_processor(permanent_word_sorter_list,
                                      status_logger_name):
    '''This function produces a dictionary containing the frequency of occurrence of terms in specific years'''
    abstract_year_list_post_processor_start_status_key = "Post processing of permanent word sorter list has commenced"
    status_logger(status_logger_name,
                  abstract_year_list_post_processor_start_status_key)

    abstract_year_dictionary = Counter(permanent_word_sorter_list)

    abstract_year_list_post_processor_end_status_key = "Post processing of permanent word sorter list has completed"
    status_logger(status_logger_name,
                  abstract_year_list_post_processor_end_status_key)

    return abstract_year_dictionary
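
Counter simply turns the flat word-and-year list into frequency counts; a tiny hedged example of the expected shape (the real entries depend on how abstract_word_extractor formats them):

from collections import Counter

sample_list = ["mangrove 2018", "carbon 2018", "mangrove 2018", "carbon 2019"]  # hypothetical entries
print(Counter(sample_list))
# -> Counter({'mangrove 2018': 2, 'carbon 2018': 1, 'carbon 2019': 1})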
Example #18
def visualizer_generator(lda_model, corpus, id2word, logs_folder_name,
                         status_logger_name):
    '''This code generates the .html file that contains the visualization of the prepared data.'''
    visualizer_generator_start_status_key = "Preparing the topic modeling visualization"
    status_logger(status_logger_name, visualizer_generator_start_status_key)

    textual_data_visualization = pyLDAvis.gensim.prepare(
        lda_model, corpus, id2word)
    pyLDAvis.save_html(
        textual_data_visualization,
        logs_folder_name + "/" + "Data_Visualization_Topic_Modelling.html")

    visualizer_generator_end_status_key = "Prepared the topic modeling visualization" + " " + logs_folder_name + "/" + "Data_Visualization_Topic_Modelling.html"
    status_logger(status_logger_name, visualizer_generator_end_status_key)
Example #19
def abstract_crawler(abstract_url, abstract_id_log_name, abstracts_log_name,
                     permanent_word_sorter_list, trend_keywords,
                     site_url_index, status_logger_name):
    abstract_crawler_start_status_key = "Entered the Abstract Crawler"
    status_logger(status_logger_name, abstract_crawler_start_status_key)

    abstract_crawler_temp_index = site_url_index
    '''This function crawls the page and accesses each and every abstract'''
    abstract_input_tag_ids = abstract_id_database_reader(
        abstract_id_log_name, abstract_crawler_temp_index, status_logger_name)
    for abstract_input_tag_id in abstract_input_tag_ids:
        try:
            abstract_crawler_accept_status_key = "Abstract Number:" + " " + str(
                (abstract_input_tag_ids.index(abstract_input_tag_id) + 1) +
                abstract_crawler_temp_index * 20)
            status_logger(status_logger_name,
                          abstract_crawler_accept_status_key)
            abstract_page_scraper(abstract_url, abstract_input_tag_id,
                                  abstracts_log_name,
                                  permanent_word_sorter_list, trend_keywords,
                                  site_url_index, status_logger_name)
        except TypeError:
            abstract_crawler_reject_status_key = "Abstract Number:" + " " + str(
                abstract_input_tag_ids.index(abstract_input_tag_id) +
                1) + " " + "could not be processed"
            status_logger(status_logger_name,
                          abstract_crawler_reject_status_key)
            pass

    abstract_crawler_end_status_key = "Exiting the Abstract Crawler"
    status_logger(status_logger_name, abstract_crawler_end_status_key)
Example #20
def lemmatization(status_logger_name, textual_data, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
	'''Reduces a word to its root word, e.g. Running -> Run'''
	lemmatization_start_status_key = "Beginning lemmatization"
	status_logger(status_logger_name, lemmatization_start_status_key)

	texts_out = []
	nlp = spacy.load('en', disable=['parser', 'ner'])
	for sent in textual_data:
		doc = nlp(" ".join(sent))
		texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

	lemmatization_end_status_key = "Ending lemmatization"
	status_logger(status_logger_name, lemmatization_end_status_key)

	return texts_out
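
A brief hedged sketch of the spaCy step in isolation; note that spacy.load('en') above is the legacy shortcut, and newer spaCy releases expect an explicit model name such as 'en_core_web_sm' (assumed here):

import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # assumed model name
doc = nlp("the studies were running quickly")
print([token.lemma_ for token in doc if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV']])
# -> roughly ['study', 'run', 'quickly']; exact output depends on the model version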
Example #21
def delay_function(status_logger_name):
    '''Since the Springer servers are constantly shutting down the remote connection, we introduce
	this function in the processor function in order to reduce the number of pings it delivers to the remote.'''

    delay_variable = np.random.randint(0, 20)

    delay_function_start_status_key = "Delaying remote server ping:" + " " + str(
        delay_variable) + " " + "seconds"
    status_logger(status_logger_name, delay_function_start_status_key)
    '''The time.sleep call delays the code by delay_variable seconds'''
    time.sleep(delay_variable)

    delay_function_end_status_key = "Delayed remote server ping:" + " " + str(
        delay_variable) + " " + "seconds"
    status_logger(status_logger_name, delay_function_end_status_key)
Example #22
def abstract_id_scraper(abstract_id_log_name, page_soup, site_url_index,
                        status_logger_name):
    '''This function helps in obtaining the PII number of the abstract.
	This number is then coupled with the dynamic URL to form the link to the abstract page.'''
    abstract_id_scraper_start_status_key = "Scraping IDs"
    status_logger(status_logger_name, abstract_id_scraper_start_status_key)
    '''This statement collects all the input tags that have the abstract ids in them'''
    abstract_input_tags = page_soup.findAll('a', {'class': 'title'})
    for abstract_input_tag in abstract_input_tags:
        abstract_input_tag_id = abstract_input_tag.get('href')
        abstract_id_database_writer(abstract_id_log_name,
                                    abstract_input_tag_id, site_url_index)

    abstract_id_scraper_stop_status_key = "Scraped IDs"
    status_logger(status_logger_name, abstract_id_scraper_stop_status_key)
Example #23
def results_determiner(url, status_logger_name):
    '''This function determines the number of results that a particular keyword returns
	once it looks up the keyword on link.springer.com.
	It scrapes the first results page and logs the total number of results
	returned by a particular keyword, or combination of keywords.'''
    first_page_to_scrape = url_reader(url, status_logger_name)

    first_page_to_scrape_soup = page_souper(first_page_to_scrape,
                                            status_logger_name)
    number_of_results = first_page_to_scrape_soup.find(
        'h1', {
            'id': 'number-of-search-results-and-search-terms'
        }).find('strong').text

    results_determiner_status_key = "Total number of results obtained: " + number_of_results
    status_logger(status_logger_name, results_determiner_status_key)
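
For reference, a hedged sketch of the markup the selector above expects (the live page structure on link.springer.com may of course differ):

from bs4 import BeautifulSoup as bs

sample_results_html = ("<h1 id='number-of-search-results-and-search-terms'>"
                       "<strong>1,234</strong> results</h1>")
sample_soup = bs(sample_results_html, 'html.parser')
print(sample_soup.find('h1', {'id': 'number-of-search-results-and-search-terms'}).find('strong').text)
# -> 1,234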
Example #24
def dirty_element_generator(texts, status_logger_name):
    '''Finds all the elements which have the special character in them, makes a list, and
    refers to them during the next phases'''
    dirty_element_generator_start_status_key = "Generating list with special elements for weeding out later"
    status_logger(status_logger_name, dirty_element_generator_start_status_key)

    dirty_elements = []
    for text in texts:
        elements = text.split(" ")
        for element in elements:
            if ('\\' in element):
                dirty_elements.append(element)

    dirty_element_generator_end_status_key = "Generated list with special elements for weeding out later"
    status_logger(status_logger_name, dirty_element_generator_end_status_key)

    return dirty_elements
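
A quick hedged illustration of what counts as a dirty element, i.e. a token still carrying a backslash escape left over from the earlier encode() round trips:

sample_texts = ["carbon storage xe2\\x80\\x89ha", "mangrove growth"]  # hypothetical abstracts
print(dirty_element_generator(sample_texts, "status_log"))
# -> ['xe2\\x80\\x89ha'] (only tokens containing a backslash are collected)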
Example #25
def author_scraper(abstract_soup, status_logger_name):
    '''This function scrapes the author of the text, for easy navigation and search'''
    author_scraper_start_status_key = "Scraping the author name"
    status_logger(status_logger_name, author_scraper_start_status_key)
    '''This class element's text attribute contains all the authors' names. It is converted to a findAll() list and then concatenated into a string for storage.'''
    author = ''.join(
        str(author) for author in [
            authorElement.text for authorElement in abstract_soup.findAll(
                'li', {'class': 'c-author-list__item'})
        ])

    author_scraper_end_status_key = "Scraped the author's name:" + " " + str(
        author)
    status_logger(status_logger_name, author_scraper_end_status_key)

    return author
Example #26
def abstract_date_scraper(title, abstract_soup, status_logger_name):
    '''This function scrapes the date associated with each of the abstracts.
	This function will play a crucial role in the functionality that we are trying to build into our project.'''
    date_scraper_entry_status_key = "Scraping date of the abstract titled:" + " " + title
    status_logger(status_logger_name, date_scraper_entry_status_key)

    try:
        abstract_date = abstract_soup.find('time').get('datetime')
        date_scraper_exit_status_key = title + " " + "was published on" + " " + abstract_date
    except AttributeError:
        abstract_date = "Date for abstract titled:" + " " + title + " " + "was not available"
        date_scraper_exit_status_key = abstract_date
        pass

    status_logger(status_logger_name, date_scraper_exit_status_key)

    return abstract_date
Example #27
def txt_to_list(abstract_directory, status_logger_name):
    '''Converting the text file to a list for easier processing'''
    txt_to_list_start_status_key = "Converting text to list"
    status_logger(status_logger_name, txt_to_list_start_status_key)

    cleaner_abstract_directory = (
        abstract_directory.split(".txt")[0]) + "_" + 'ANALYTICAL.txt'
    abstracts = []
    with open(cleaner_abstract_directory, 'r') as cleaned_abstracts_file:
        for line in cleaned_abstracts_file:
            abstracts.append(line)

    txt_to_list_end_status_key = "Converted text to list"
    status_logger(status_logger_name, txt_to_list_end_status_key)

    return abstracts
Example #28
def cleaner_main(abstract_directory, status_logger_name):
    '''This module removes all the special characters from the abstracts scraped using the Bias tool.'''
    cleaner_main_start_status_key = "Entering the Cleaner module"
    status_logger(status_logger_name, cleaner_main_start_status_key)

    abstracts = txt_to_list(abstract_directory, status_logger_name)
    dirty_elements = dirty_element_generator(abstracts, status_logger_name)
    cleaned_texts = dirty_element_weeder(abstracts, dirty_elements,
                                         status_logger_name)
    new_cleaned_texts_folder = cleaned_abstract_dumper(abstract_directory,
                                                       cleaned_texts,
                                                       status_logger_name)
    '''The main contribution of this block is the new cleaned .txt file and the cleaned abstracts, kept just in case.'''

    cleaner_main_end_status_key = "Exiting the Cleaner module"
    status_logger(status_logger_name, cleaner_main_end_status_key)

    return cleaned_texts, new_cleaned_texts_folder
Example #29
def dirty_element_weeder(texts, dirty_elements, status_logger_name):
    '''Refers to the list of dirty variables and cleans the abstracts'''
    dirty_element_weeder_start_status_key = "Removing elements with special characters from the text list"
    status_logger(status_logger_name, dirty_element_weeder_start_status_key)

    cleaned_texts = []
    cleaned_str_list = []
    for text in texts:
        elements = text.split(" ")
        for element in elements:
            if element not in dirty_elements:
                cleaned_str_list.append(element)
        cleaned_texts.append(" ".join(lol for lol in cleaned_str_list))
        cleaned_str_list = []

    dirty_element_weeder_end_status_key = "Removed elements with special characters from the text list"
    status_logger(status_logger_name, dirty_element_weeder_end_status_key)

    return cleaned_texts
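
Continuing the hypothetical example from dirty_element_generator, a sketch of the weeding step:

sample_texts = ["carbon storage xe2\\x80\\x89ha", "mangrove growth"]
sample_dirty_elements = dirty_element_generator(sample_texts, "status_log")
print(dirty_element_weeder(sample_texts, sample_dirty_elements, "status_log"))
# -> ['carbon storage', 'mangrove growth']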
Example #30
def cleaned_abstract_dumper(abstract_directory, cleaned_texts,
                            status_logger_name):
    '''Dumps the cleaned abstracts to disk; the code will refer to this file henceforth'''
    cleaned_abstract_dumper_start_status_key = "Dumping the cleaned abstract .txt to the disc"
    status_logger(status_logger_name, cleaned_abstract_dumper_start_status_key)

    pre_new_cleaned_texts_folder = abstract_directory.split(".txt")[0]
    new_cleaned_texts_folder = open(
        pre_new_cleaned_texts_folder + "_" + "CLEANED.txt", 'w')

    for cleaned_text in cleaned_texts:
        new_cleaned_texts_folder.write(cleaned_text)
        new_cleaned_texts_folder.write('\n')

    cleaned_abstract_dumper_end_status_key = "Dumped the cleaned abstract .txt to the disc"
    status_logger(status_logger_name, cleaned_abstract_dumper_end_status_key)

    return new_cleaned_texts_folder