def save_data(df, db_connection):
    """
    Write dataframe to SQL database

    Parameters
    ----------
    df : dataframe
    db_connection : database connection

    Returns
    -------
    int
        1 in case of success, 0 in case of failure
    """
    try:
        connection = db_connection.connect()
        # empty the transitional 'temptable' table
        connection.execute("TRUNCATE temptable")
        # write result to 'temptable' table
        df.to_sql('temptable', db_connection, if_exists='append', index_label='id')
        # use INSERT IGNORE to skip letters that already exist in the 'list' table
        connection.execute("INSERT IGNORE INTO list SELECT * FROM temptable")
        connection.close()
    except Exception as e:
        ut.error("Failed to write dataframe to database.")
        ut.progress(f"Detailed error:\n{e}")
        return 0
    else:
        ut.progress("Dataframe written to database.")
        return 1
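# Hedged usage sketch (not part of the original module): assumes a SQLAlchemy
# engine is used as db_connection and that 'temptable' and 'list' already exist
# in the target schema. The connection URL and dataframe contents are placeholders.
def _example_save_data():
    from sqlalchemy import create_engine

    engine = create_engine("mysql+pymysql://user:password@localhost/letters")
    df = pd.DataFrame({'subject': ['test letter']}, index=pd.Index([1], name='id'))
    return save_data(df, engine)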
def get_letter_list(db_connection, year, office):
    """
    Load data of letters from database to dataframe.

    Parameters
    ----------
    db_connection
    year : list
        a list of years to include in the query
    office : list
        a list of offices to include in the query

    Returns
    -------
    Dataframe with all letters loaded from database
    """
    # compose query
    query = f"WHERE YEAR(issued) IN ({', '.join(year)}) AND office REGEXP '{'|'.join(office)}'"
    # execute query and load to dataframe
    try:
        connection = db_connection.connect()
        df = pd.read_sql(f"SELECT * FROM list {query};", connection)
    except Exception as e:
        ut.error("Failed to retrieve dataframe from database.")
        ut.progress(f"Detailed error:\n{e}")
        return 0
    else:
        # ut.progress("Data retrieved from database.")
        return df
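# Hedged illustration (not in the original module) of the WHERE clause that
# get_letter_list composes; the year and office values below are made up.
# Years must be passed as strings, since they are joined directly into the query text.
def _example_get_letter_list(engine):
    # composes: WHERE YEAR(issued) IN (2020, 2021) AND office REGEXP 'Berlin|Hamburg'
    return get_letter_list(engine, year=['2020', '2021'], office=['Berlin', 'Hamburg'])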
def save_letter(letter, filename, folder):
    """
    Write letter to txt file

    Parameters
    ----------
    letter : String
        trimmed text ready for writing to file
    filename : String
        filename without ".txt"
    folder : String
        name of existing folder

    Returns
    -------
    int
        1 in case of success, 0 in case of failure
    """
    file = f'{folder}/{filename}.txt'
    if exists(file):
        ut.error(f"Target file {filename} already exists.")
        return 0
    try:
        f = open(file, 'w')
    except OSError:
        ut.error(f"Cannot create or open {filename}.")
        return 0
    else:
        f.write(letter)
        f.close()
        ut.progress(f"Successfully written letter to {filename}.")
        return 1
def crawl_site(url, page_count_max, skip_page=0):
    """
    Crawl website by navigating through different pages.

    Parameters
    ----------
    url : String
        first webpage
    page_count_max : int
        maximum number of pages to crawl
    skip_page : int, optional
        number of pages to skip before crawling starts

    Returns
    -------
    site_df : Dataframe
    """
    driver = make_driver()
    driver.get(url)
    # wait 2 seconds for page to load
    time.sleep(2)
    page_count = 1
    # if we need to skip pages
    while skip_page > 0:
        go_next_page(driver)
        skip_page -= 1
        time.sleep(2)
    # crawl first page and store data in new site dataframe
    page_data = crawl_page(driver)
    df_columns = ['posted', 'issued', 'company', 'office', 'subject', 'link']
    site_df = pd.DataFrame(
        page_data,
        columns=df_columns,
        index=[get_unique_id(link) for link in page_data['link']])
    ut.progress("First page crawled and saved in dataframe.")
    # navigate to second and subsequent pages up to page_count_max
    while go_next_page(driver) and page_count < page_count_max:
        page_count += 1
        # wait 2 seconds to not overload server
        time.sleep(2)
        # crawl page and append data to site dataframe
        page_data = crawl_page(driver)
        page_df = pd.DataFrame(
            page_data,
            columns=df_columns,
            index=[get_unique_id(link) for link in page_data['link']])
        # DataFrame.append is removed in recent pandas; concatenate instead
        site_df = pd.concat([site_df, page_df])
        ut.progress(f"Page {page_count} crawled and saved in dataframe.")
    return site_df
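# Hedged usage sketch (not in the original module): the URL is a placeholder, and
# make_driver/crawl_page/go_next_page are assumed to be the Selenium helpers
# defined elsewhere in this project.
def _example_crawl_and_store(engine):
    site_df = crawl_site("https://example.org/letters?page=1", page_count_max=3)
    return save_data(site_df, engine)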
def endElement(self, tag):
    if tag == "page":
        self.pageCount += 1
        total = 19797
        progress(self.pageCount, total, status='Creating Index!!')
        # if(self.pageCount%1000==0 or self.pageCount>19600):
        #     print(self.pageCount, self.pp.totalTokens)
        #     print(self.bufID)
        #     print(self.bufTitle)
        #     print(self.bufText)
        self.id = False
        self.firstID = True
        ## build your index here page by page
        titleTokens = self.pp.processTitle(self.bufTitle)
        infoboxTokens, catgoriesTokens, referencesTokens, bodyTokens, externalLinksTokens = self.pp.processText(
            self.bufText)
        self.invertedIndex.buildIndex(self.bufID, titleTokens, infoboxTokens, catgoriesTokens,
                                      referencesTokens, bodyTokens, externalLinksTokens)
        self.bufID = ""
        self.bufTitle = ""
        self.bufText = ""
    elif tag == "id":
        self.id = False
    elif tag == "title":
        self.title = False
    elif tag == "text":
        self.text = False
    elif tag == "mediawiki":
        print("\nTotal Tokens :", self.pp.totalTokens)
        statFile = open(utility.getIndexPath(), 'w+')
        print("inverted_stat.txt path :", utility.getIndexPath())
        statFile.write("Total Tokens :" + str(self.pp.totalTokens) + '\n')
        statFile.close()
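# Hedged sketch (not in the original module) of how a SAX ContentHandler like the
# one above is typically driven; the dump file path is a placeholder for whatever
# XML dump this project actually indexes.
def _example_parse_dump(handler, dump_path="enwiki-dump.xml"):
    import xml.sax

    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    parser.parse(dump_path)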
def get_letter_list(db_connection):
    """
    Load data of letters from database to dataframe.

    Parameters
    ----------
    db_connection

    Returns
    -------
    Dataframe with all letters loaded from database
    """
    try:
        connection = db_connection.connect()
        df = pd.read_sql("SELECT * FROM list;", connection)
    except Exception as e:
        ut.error("Failed to retrieve dataframe from database.")
        ut.progress(f"Detailed error:\n{e}")
        return 0
    else:
        ut.progress("Data retrieved from database.")
        return df
def writeIndex(self, fileName):
    self.totalInvertedTokens = len(self.index)
    print("totalInvertedTokens::", self.totalInvertedTokens)
    statFile = open(utility.getStatPath(), 'a+')
    statFile.write("Total Inverted Tokens :" + str(self.totalInvertedTokens) + '\n')
    statFile.close()
    indexSize = len(self.index)
    j = 0
    file = open(fileName, 'w+')
    for key in self.index:
        file.write(key)
        file.write(';')
        for element in self.index[key]:
            # each posting has seven fields; the range must reach index 6 ('e')
            # or the external-links branch below is never written
            for i in range(0, 7):
                if i == 0 and element[i] != 0:
                    file.write('d' + str(element[i]))
                elif i == 1 and element[i] != 0:
                    file.write('t' + str(element[i]))
                elif i == 2 and element[i] != 0:
                    file.write('i' + str(element[i]))
                elif i == 3 and element[i] != 0:
                    file.write('c' + str(element[i]))
                elif i == 4 and element[i] != 0:
                    file.write('r' + str(element[i]))
                elif i == 5 and element[i] != 0:
                    file.write('b' + str(element[i]))
                elif i == 6 and element[i] != 0:
                    file.write('e' + str(element[i]))
            file.write(';')
        file.write('\n')
        progress(j, indexSize, 'writing index!!')
        j += 1
    file.close()
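# Hedged illustration (not in the original module) of the line format writeIndex
# produces: the term, a ';', then one posting per entry with each non-zero field
# value prefixed by its letter (d/t/i/c/r/b/e for positions 0..6), postings
# separated by ';'. The term and values below are made up:
#
#   apple;d12t1b4;d7i2c1;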