def add_addresses(directory_path, master_df_path, master_df_id_col,
                  master_df_address_cols, output_directory,
                  new_address_col_name, id_col=None):
    files = get_files(directory_path, "csv")
    print(str(len(files)) + " files found in directory")
    # Load the master dataframe from csv or xlsx
    ext = master_df_path.split(".")[-1]
    if ext == "csv":
        master_df = pd.read_csv(master_df_path)
    elif ext == "xlsx":
        master_df = pd.read_excel(master_df_path)
    else:
        raise ValueError("Invalid file extension: " + ext)
    # Add address columns to every csv file in the directory
    for file in files:
        print("Current file: " + file, end='\r')
        add_address(file_path=file,
                    master_df=master_df,
                    master_df_id_col=master_df_id_col,
                    master_df_address_cols=master_df_address_cols,
                    output_directory=output_directory,
                    new_address_col_name=new_address_col_name,
                    id_col=id_col)
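# Note: every snippet in this collection depends on a project-local get_files
# helper whose signature varies by project (bare directory, extension string,
# or glob pattern). A minimal sketch of the extension-filtering variant used
# above, assuming a flat (non-recursive) listing -- each project's real helper
# may differ:
import os

def get_files(directory_path, extension=None):
    """Sketch only: list files in directory_path, optionally filtered by a
    bare extension such as "csv" (assumed behavior, not a confirmed API)."""
    paths = []
    for name in sorted(os.listdir(directory_path)):
        path = os.path.join(directory_path, name)
        if not os.path.isfile(path):
            continue
        if extension is None or name.lower().endswith("." + extension.lower()):
            paths.append(path)
    return paths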
def run(self, day=global_define.TODAY):
    _dir = os.path.join(global_define.TEXT_DIR, str(day))
    file_paths = functs.get_files(_dir)
    for file_path in file_paths:
        self.build(file_path)
    self.flush()
def run(self, day=global_define.TODAY):
    _dir = os.path.join(global_define.XML_DIR, str(day))
    file_paths = functs.get_files(_dir)
    for i, file_path in enumerate(file_paths):
        xml_dicts = self.parse(file_path)
        if xml_dicts:
            self.dump(xml_dicts, str(i), day)
def get_subdirectory_files(file_string):
    subdirectory_csv_files = []
    # next(os.walk(...)) yields (dirpath, dirnames, filenames) for the top level
    _, subdirectories, _ = next(os.walk(os.getcwd()))
    for subdirectory in subdirectories:
        subdirectory_path = os.path.join(os.getcwd(), subdirectory)
        for subdirectory_csv_file in utilities.get_files(subdirectory_path, file_string):
            subdirectory_csv_files.append(subdirectory_csv_file)
    return subdirectory_csv_files
def combine_directory(directory_path, text_directory, text_directory_relative,
                      output_directory, address_col, time_col_name, company_name,
                      output_filename=None, current_date=None, date_col_name="date",
                      year_column_name="year", rating_column_name="stars",
                      id_rule="col", id_col="Store Company ID"):
    files = get_files(directory_path, "csv")
    records = []
    string_with_date = directory_path.split("\\")[-1]
    for file in files:
        df = pd.read_csv(file)
        # get id
        if id_rule == "col":
            company_id = list(df[id_col])[0]
        elif id_rule == "filename":
            file_split = file.split("\\")
            if len(file_split[-1]) == 0:
                filename = file_split[-2]
            else:
                filename = file_split[-1]
            company_id = re.search(r"\d{10}", filename).group(0)
        # process dates
        date_converter(df, time_col_name, string_with_date, date_col_name,
                       current_date, year_column_name)
        # create record and append to list
        records.append(
            create_record(df, company_id, df[address_col][0], company_name,
                          text_directory, text_directory_relative,
                          year_column=year_column_name,
                          rating_column=rating_column_name))
    combined_df = pd.DataFrame.from_records(records)
    if output_filename is None:
        final_filename = string_with_date + ".csv"
    else:
        final_filename = output_filename + ".csv"
    final_outdir = output_directory + "\\" + final_filename
    combined_df.to_csv(final_outdir, index=False)
def main():
    site = 'guardian'

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Load and concatenate files')
    dict_srcs = [
        x for x in utilities.get_files(utilities.blm_html_1pass_dir)
        if 'guardian' in x
    ]
    dates_articles_ = utilities.combine_dicts(dict_srcs)
    utilities.count_articles(dates_articles_)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Parse comments')
    dates_articles = copy.deepcopy(dates_articles_)
    for date, days_articles in dates_articles_.items():
        for ix, article in enumerate(days_articles):
            raw_comments_pages = article['raw_comments']
            parsed_comments_li = []
            for raw_comments_page in raw_comments_pages:
                raw_comments_soup = bs(raw_comments_page)
                parsed_comments = get_page_comment_data(raw_comments_soup)
                parsed_comments_li += parsed_comments
            dates_articles[date][ix]['parsed_comments'] = parsed_comments_li

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Save outputs, one json per year')
    years = sorted(list(set([x.split('-')[0] for x in dates_articles.keys()])))
    dst_dir = os.path.join(utilities.blm_processed_parsed_dir, '2nd_iteration')
    for year in years:
        dates_articles2 = {}
        dst = os.path.join(dst_dir, site + '_' + year + '.json')
        for date, days_articles in dates_articles.items():
            if year in date:
                date2 = copy.deepcopy(date)
                dates_articles2[date2] = copy.deepcopy(days_articles)
        with open(dst, 'w') as f:
            json.dump(dates_articles2, f)
def mean_weight():
    import os
    import sys
    import pandas as pd
    from config import Config, get_services, recreate_config_file
    from utilities import Print_Error, Select_Menu, get_files
    from data_manager import Generate_Pivot_Table
    # pd.set_option('display.max_rows', 3000)
    available_services = get_services()
    available_files = get_files()
    if len(available_services) == 0:
        Print_Error(".config File Not Found! Recreating .config File!")
        recreate_config_file()
        return
    # No parameters given on the command prompt: select file and service interactively
    if len(sys.argv) == 1:
        file_name = Select_Menu(available_files, text="Input File Name", return_type=int)
        service = Select_Menu(available_services, text="Input Service", return_type=int)
        file_name = available_files[file_name]
        service = available_services[service]
        vessel_name = file_name.upper().split('/')[-1].rsplit('.', 1)[0]
        config = Config()
        config.build_config("%s/CONFIG/%s.config" % (os.getcwd(), service.upper()))
        config.set_vessel(vessel_name.rsplit('.', 1)[0])
        config.print_data()
        Generate_Pivot_Table(config, file_name)
def evaluate_all(data_path, scraped_directory_path, output_directory_path,
                 counts_output_path, id_col, data_count_cols, wait=[1, 1.5],
                 url_col=None, search_terms=None, new_url_col=None,
                 scraped_col_name="reviews_scraped", review_col_name="review_count",
                 diff_col="difference", prop_col="proportion"):
    count_df = get_review_counts(data_path, id_col, counts_output_path, wait,
                                 url_col, search_terms, new_url_col,
                                 review_col_name)
    for file in get_files(scraped_directory_path, "csv"):
        scraped_path = file
        scraped_df = pd.read_csv(scraped_path)
        out_df = pd.merge(count_df, scraped_df, on=[id_col])
        total_review_count = out_df[data_count_cols].sum(axis=1)
        out_df[diff_col] = total_review_count - out_df[review_col_name]
        out_df[prop_col] = total_review_count / out_df[review_col_name]
        out_df[scraped_col_name] = total_review_count
        out_df = out_df[[
            id_col, scraped_col_name, diff_col, prop_col, review_col_name
        ]]
        filename = file.split("\\")[-1].split(".")[0]
        output_path = output_directory_path + "\\" + filename + "_evaluation.csv"
        out_df.to_csv(output_path, index=False)
        print("Saved file as " + output_path)
def main():
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Parse comments')
    # combine scraped blm data
    blm_srcs = utilities.get_files(
        utilities.blm_html_1pass_dir) + utilities.get_files(
            utilities.blm_html_2pass_dir)
    blm_srcs = [x for x in blm_srcs if site in x]
    blm = utilities.combine_dicts(blm_srcs)
    # sort by date
    blm_sorted = {}
    for k in sorted(blm.keys()):
        blm_sorted_v = copy.deepcopy(blm[k])
        blm_sorted[k] = blm_sorted_v
    article_counter = 0  # iterate over websites
    counter = 0  # for each website, iterate over intermediate pickle files
    blm_comments = {}
    # for each loaded dict, iterate over each day's articles
    for date, days_articles in blm_sorted.items():
        days_articles_li = []
        # for each article, get cleaned comments
        for ix, article in enumerate(days_articles):
            # copy so when we change article we don't change blm_sorted article
            article_copy = copy.deepcopy(article)
            comments = article_copy['comments']
            # exclude articles that don't have comments
            if type(comments) == str:
                soup = bs(comments)
                ap = ArticlePosts(comments)
                article_copy['unparsed_comments'] = ap.make_unparsed_comments(
                    ap.replacements)
                article_copy['parsed_comments'] = ap.make_parsed_comments()
                article_copy['raw_comments'] = article_copy.pop('comments')
                days_articles_li.append(article_copy)
                counter += 1
                article_counter += 1
        # only add day's articles if there were any articles for that day
        if len(days_articles_li) > 0:
            blm_comments[date] = days_articles_li
        # at the end of the day, if counter greater than x, save and reinitialize dictionary and counter
        if counter >= 1000:
            save_outputs(blm_comments, dst_dir, site)
            # re-initialize dictionary and counter
            blm_comments = {}
            counter = 0
    # save remainders
    if len(blm_comments) > 0:
        save_outputs(blm_comments, dst_dir, site)
    print('Total articles with comments = %s' % article_counter)
def merge_incremental_index(self, day=global_define.TODAY):
    _dir = os.path.join(global_define.INDEX_INCREMENTAL_DIR, str(day))
    file_list = functs.get_files(_dir)
    for file_path in file_list:
        self._merge(file_path)
import utilities
import numpy as np
import os
import re

export_fol = "D:\\Apostolis\\Programming\\Python\\Athena_project\\EEG_Project\\data\\export"
answer_fol = "D:\\Apostolis\\Programming\\Python\\Athena_project\\EEG_Project\\data\\answers"
save_fol = "D:\\Apostolis\\Programming\\Python\\Athena_project\\EEG_Project\\data\\subjects"

export_files = utilities.get_files(export_fol)
answer_files = utilities.get_files(answer_fol)

for file in export_files:
    # Get subject names from file name
    sub_name = utilities.get_filename(file, start=4)
    # Get time zero (t0) of the subject from the info.csv file.
    # t0 represents the exact ms that the eeg recording started according to iMotions export file
    with open(os.path.join(save_fol, sub_name, 'info.csv')) as csv:
        lines = csv.readlines()
        line = lines[1].split(',')
        t0 = int(re.sub("[^0-9]", "", line[2]))
    f = open(file)
    text = f.readlines()
    f.close()
    time_zero = 0
    for ind, line in enumerate(text):
        # Search in text for this particular line which marks the start of the web application
        if "NavigateComplete\thttp://localhost/exp/main.php" in line:
            line = text[ind - 2].split('\t')
parser.add_argument('-i',
                    '--image_topic',
                    dest='image_topic',
                    default=None,
                    help='Use specified image topic.')
parser.add_argument('--save_stats',
                    action='store_true',
                    help='Save stats to csv file.')
parser.add_argument('--make_plots',
                    type=bool,
                    default=True,
                    help='Make pdf of plots of results.')
args, args_unknown = parser.parse_known_args()
bag_directory = args.bag_directory
if bag_directory is None:
    bag_directory = os.getcwd()
if not os.path.exists(bag_directory):
    print("Bag directory {} does not exist.".format(bag_directory))
    exit()
bag_files = utilities.get_files(bag_directory, '*.bag')
print("Found {} bag files in {}.".format(len(bag_files), bag_directory))
output_dir = utilities.create_directory(args.output_directory)
print('Output directory for results is {}'.format(output_dir))
bag_sweep(bag_files, output_dir, args)
combine_results_in_csv_file(bag_files, output_dir)
def __init__(self):
    dict.__init__(self)
    files = functs.get_files(global_define.INDEX_PRIME_DIR)
    for file_path in files:
        self._load(file_path)
def Deposito():
    import os
    import openpyxl
    import pandas as pd
    from utilities import Print_Error, get_files, Select_Menu, create_directory, OpenFile

    available_file = []
    search_locations = []
    save_location = ""
    # Read search and save locations from the main config file
    main_config = open("CONFIG/MAIN.config")
    for line in main_config:
        if line.split(';')[0] == "search_location":
            search_locations.append(line.split(';')[1].strip())
        elif line.split(';')[0] == "save_location":
            save_location = line.split(';')[1].strip()
    for file in get_files(search_locations):
        if file.upper().endswith('.XLS'):
            available_file.append(file)
    file_name = Select_Menu(available_file, "Select a File", return_type=int)
    file_name = available_file[file_name]
    if file_name.upper().endswith(".XLS"):
        print("Importing XLS File!")
        sheet = "LinnerBooking"
        df = pd.read_excel(io=file_name, sheet_name=sheet)
        df = df[['Booking', 'Deposito', 'Weight', 'Tipo Ctr']]
        df = df.loc[(df['Deposito'] == "MEDLOG SAN ANTONIO") |
                    (df['Deposito'] == "SITRANS SAI ALTO DEPOT") |
                    (df['Deposito'] == "SITRANS VALPARAISO DEPOT") |
                    (df['Deposito'] == "MEDLOG SANTIAGO")]
        df['Weight'] = df['Weight'] / 1000  # Convert to tons
        # df = df.loc[(df['Tipo Ctr'] == '20DV') | (df['Tipo Ctr'] == '40DV') | (df['Tipo Ctr'] == '40HC')]
        table = pd.pivot_table(df, values='Weight', aggfunc='count',
                               index='Deposito', columns='Tipo Ctr')
        table = table.reindex(columns=['20DV', '40DV', '40HC'])
        table = table.rename(index={'MEDLOG SAN ANTONIO': 'SAI',
                                    'SITRANS SAI ALTO DEPOT': 'SAI',
                                    'SITRANS VALPARAISO DEPOT': 'VAP',
                                    'MEDLOG SANTIAGO': 'STGO'})
        table = table.groupby('Deposito').sum()
        # print(table.iloc[0]['20DV'])
        wb = openpyxl.Workbook()
        sheet = wb.active
        print(table)
        # Flatten the pivot table into rows of cell values
        data = []
        for y in range(len(table.index)):
            data.append([])
            for x in range(len(table.columns)):
                data[-1].append(table.iloc[y][x])
        # Write one column block per deposit: header, column names, values
        x = 1
        z = 0
        for deposit in data:
            r = 0
            sheet.cell(1, x, str(table.index[z]))
            for value in deposit:
                sheet.cell(2, x, str(table.columns[r]))
                sheet.cell(3, x, float(value))
                x += 1
                r += 1
            x += 1
            z += 1
        wb.save('demo.xlsx')
        wb.close()
        if save_location == "":
            print("Saving Output in Program Location!")
        elif not os.path.exists(save_location):
            Print_Error("Save Directory Not Found!")
            create_directory(save_location)
        try:
            table.to_excel(save_location + '/file_output.xlsx')
            print("Saved Successfully")
        except:
            Print_Error('Error Saving File!')
        directory = os.getcwd() + '/demo.xlsx'
        OpenFile(directory)
    else:
        Print_Error("File not compatible!")
"--image_topic", dest="image_topic", default=None, help="Use specified image topic.", ) parser.add_argument("--save_stats", action="store_true", help="Save stats to csv file.") parser.add_argument("--make_plots", type=bool, default=True, help="Make pdf of plots of results.") args, args_unkonown = parser.parse_known_args() bag_directory = args.bag_directory if bag_directory == None: bag_directory = os.getcwd() if not os.path.exists(bag_directory): print(("Bag directory {} does not exist.".format(bag_directory))) exit() bag_files = utilities.get_files(bag_directory, "*.bag") print(("Found {} bag files in {}.".format(len(bag_files), bag_directory))) output_dir = utilities.create_directory(args.output_directory) print(("Output directory for results is {}".format(output_dir))) bag_sweep(bag_files, output_dir, args) combine_results_in_csv_file(bag_files, output_dir)
    combined_dataframes = pd.DataFrame(None, None, names)
    for dataframe in dataframes:
        trimmed_dataframe = pd.DataFrame(dataframe.transpose().values[1:2], columns=names)
        combined_dataframes = combined_dataframes.append(trimmed_dataframe, ignore_index=True)
    return combined_dataframes


def average_results(directory, csv_files):
    combined_dataframes = combined_results(csv_files)
    names = combined_dataframes.columns
    mean_dataframe = pd.DataFrame()
    for name in names:
        mean_dataframe[name] = [combined_dataframes[name].mean()]
    averaged_results_file = os.path.join(directory, "averaged_results.csv")
    mean_dataframe.to_csv(averaged_results_file, index=False)


# Averages results from all *stats.csv files in a directory (including subdirectories).
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("directory", help="Full path to directory where results files are.")
    args = parser.parse_args()
    results_csv_files = utilities.get_files(args.directory, "*stats.csv")
    if not results_csv_files:
        print("Failed to find stats.csv files")
        exit()
    average_results(args.directory, results_csv_files)
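# The comment above notes that subdirectories are included, which suggests this
# project's utilities.get_files matches glob-style patterns recursively. A
# hedged sketch of such a variant (assumed, not the project's actual
# implementation):
import fnmatch
import os

def get_files(directory, pattern):
    """Sketch only: recursively collect files under directory whose names
    match an fnmatch pattern such as "*stats.csv"."""
    matches = []
    for root, _dirs, names in os.walk(directory):
        for name in fnmatch.filter(names, pattern):
            matches.append(os.path.join(root, name))
    return matches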
    action='store_true',
    help='Write parameter sweep values to a table in pdf')
args = parser.parse_args()
if args.write_values_table and not args.write_values:
    print("If write_values_table enabled, write_values must be as well.")
    exit()
directory = args.directory
if directory is None:
    directory = os.getcwd()
dataframes = []
results_filestring = '*results.csv'
results_files = utilities.get_files(directory, results_filestring)
if len(results_files) == 0:
    print("No results csv files found in directory " + directory)
    exit()
dataframes.append(plot_creator.load_dataframe(results_files))
if args.write_values:
    values_filestring = '*values.csv'
    values_files = utilities.get_files(directory, values_filestring)
    values_dataframe = plot_creator.load_dataframe(values_files)
    values_dataframe.columns.name = 'values'
    dataframes.append(values_dataframe)
pdf_filename = 'result_plots.pdf'
plot_creator.create_pdf(dataframes, pdf_filename, args.write_values_table,
def load_prime_index(self):
    file_list = functs.get_files(self.__PRIME_INDEX_DIR)
    for file_path in file_list:
        self._merge(file_path)
import datetime
import requests
import urllib3
import pickle
import calendar
from collections import defaultdict
import numpy as np
# Imports below inferred from usage in this snippet (pd, inputs, utilities,
# and the selenium driver setup)
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import inputs
import utilities

today_dt = datetime.date.today()
yesterday_dt = today_dt - datetime.timedelta(days=1)
dates_ds = pd.date_range(inputs.start_date, inputs.end_date)
dates = [str(x.date()).replace('-', '/') for x in list(dates_ds)]

site = 'breitbart'
articles_dir = utilities.data_2018_dir
article_srcs = [x for x in utilities.get_files(articles_dir, extensions=['json']) if site in x]
blm_dir = utilities.blm_dir
blm_html_1pass_dir = utilities.blm_html_1pass_dir


# instantiate driver
def instantiate_driver(wait=10, url='https://google.com', headless=False):
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
    driver_path = utilities.chromedriver_path
    driver = webdriver.Chrome(driver_path, chrome_options=chrome_options)
    driver.get(url)
def main():
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Use google custom search api to retrieve on-topic articles for selected site')
    # indicate the site on the google custom search api control panel
    # enter credentials from config.py file
    # if this file doesn't exist, create one and define google_custom_search_cx and developerKey variables
    cx = config.google_custom_search_cx
    developerKey = config.google_custom_search_developer_key
    service = build("customsearch", "v1", developerKey=developerKey)
    # make date ranges
    begin_mo, end_mo = [
        list([
            str(x.date()).replace('-', '')
            for x in pd.date_range('2013-07-01', '2018-08-31', freq=x)
        ]) for x in ['MS', 'M']
    ]
    dates = [
        ':'.join(['date', 'r', x[0], x[1]])
        for x in list(zip(begin_mo, end_mo))
    ]
    # define topics
    topics = ['black lives matter', 'police brutality']
    # iterate over each topic, srp, and dates
    starts = np.arange(1, 100, 10)
    res_li = []
    for topic in topics:
        for start in starts:
            for date in dates:
                if start == 91:
                    num = 9
                else:
                    num = 10
                res = service.cse().siterestrict().list(q=topic,
                                                        cx=cx,
                                                        hl='lang_en',
                                                        lr='lang_en',
                                                        num=num,
                                                        start=start,
                                                        sort=date).execute()
                res_li.append(res)
    if overwrite:
        with open(res_dst, 'wb') as f:
            pickle.dump(res_li, f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Build final dataframe and save to disk')
    df_res = []
    for srp in res_li:
        try:
            articles = srp['items']
            for article in articles:
                df_res.append([article['title'], article['link']])
        except:
            pass
    df_res = pd.DataFrame(df_res, columns=['title', 'link']).drop_duplicates().set_index('title')
    if overwrite:
        df_res.to_pickle(os.path.join(dst_dir, site + '_blm_links.pkl'))
    preview(df_res)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Filter out off-topic articles and save that to disk')
    # combine dictionaries
    articles_dir = utilities.data_2018_dir
    article_srcs = [
        x for x in utilities.get_files(utilities.data_2018_dir, extensions=['json'])
        if site in x
    ]
    dates_articles_ = utilities.combine_dicts(article_srcs)
    links = [
        x.lower().split('https://www.')[-1]
        for x in df_res.link.unique().tolist()
    ]
    dates_articles = {}
    for date, days_articles in dates_articles_.items():
        articles = []
        for article in days_articles:
            article2 = copy.deepcopy(article)
            try:
                link = article2['url'].lower().strip().split('http://www.')[-1]
            except:
                link = ''
            if link in links:
                articles.append(article2)
        if len(articles) > 0:
            dates_articles[date] = articles
    articles_dst = os.path.join(dst_dir, site + '_articles.pkl')
    with open(articles_dst, 'wb') as f:
        pickle.dump(dates_articles, f)
import os
import utilities
from shutil import copy
import csv
import numpy as np

# Get current answer files
ans_fol = "..\\..\\data\\answers"
ans_paths = utilities.get_files(ans_fol)
subs = [utilities.get_filename(i) for i in ans_paths]

# Get export files (iMotion file)
exp_fol = "..\\..\\data\\export"
exp_paths = utilities.get_files(exp_fol)

# Create subject folders if they do not exist
sub_fol = "..\\..\\data\\subjects"
for ans_path, sub in zip(ans_paths, subs):
    folder = os.path.join(sub_fol, sub)
    # Check if folder already exists
    if not os.path.isdir(folder):
        # Create folder
        os.mkdir(folder)
        # Copy answer file to subject folder
        copy(ans_path, os.path.join(folder, 'answers.txt'))
        # Find and copy the correct export file to subject folder
        exp_ind = utilities.search_string_in_list(exp_paths, sub)
        copy(exp_paths[exp_ind], os.path.join(folder, 'export.txt'))
def main():
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # DEFINE INPUTS AND LOAD DATA
    today_dt = datetime.date.today()
    yesterday_dt = today_dt - datetime.timedelta(days=1)
    dates_ds = pd.date_range(inputs.start_date, inputs.end_date)
    dates = [str(x.date()).replace('-', '/') for x in list(dates_ds)]
    overwrite = inputs.overwrite
    site = 'guardian'
    src = os.path.join(utilities.blm_dir, 'Google_CSE_Results', site + '_articles.pkl')
    with open(src, 'rb') as f:
        dates_articles_ = pickle.load(f)
    interim_dir = os.path.join(utilities.blm_dir, 'z_Interim')
    utilities.mkdir(interim_dir)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Pull comment URLs')
    # Define the interim path up front so the load branch below can use it too
    dates_articles_dst = os.path.join(interim_dir, 'articles_w_comments_urls.pkl')
    if overwrite:
        base_url = 'https://www.theguardian.com/discussion'
        dates_articles = copy.deepcopy(dates_articles_)
        counter = 0
        for date, days_articles in dates_articles.items():
            for ix, article in enumerate(days_articles):
                try:
                    article_url = article['url'].strip().lower()
                    r = requests.get(article_url)
                    article_soup = bs(r.text)
                    comments_div = article_soup.find('div', {'id': 'comments'})
                    soup_id = comments_div.attrs['data-discussion-key']
                    comments_url = base_url + soup_id
                    dates_articles[date][ix]['comments_url'] = comments_url
                except:
                    dates_articles[date][ix]['comments_url'] = 'no comments'
        with open(dates_articles_dst, 'wb') as f:
            pickle.dump(dates_articles, f)
    else:
        with open(dates_articles_dst, 'rb') as f:
            dates_articles = pickle.load(f)

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Remove articles with no comments')
    dates_articles2 = {}
    for date, days_articles in dates_articles.items():
        articles = []
        for article in days_articles:
            if article['comments_url'] != 'no comments':
                article_copy = copy.deepcopy(article)
                articles.append(article_copy)
        if len(articles) > 0:
            dates_articles2[date] = articles

    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Scrape comments pages')
    if overwrite:
        dates_articles3 = {}
        counter = 0
        file_counter = 0
        for date, days_articles in dates_articles2.items():
            articles3 = []
            for article in days_articles:
                comments_url = article['comments_url']
                comments_li = []
                try:
                    comments_soup, comments = get_page_comments(comments_url)
                    comments_li.append(comments)
                    next_page_comments_url = get_next_page_url(comments_soup)
                    # follow pagination until there is no next page
                    while next_page_comments_url is not None:
                        try:
                            next_page_comments_soup, next_page_comments = get_page_comments(
                                next_page_comments_url)
                            comments_li.append(next_page_comments)
                            next_page_comments_url = get_next_page_url(
                                next_page_comments_soup)
                        except:
                            next_page_comments_url = None
                except:
                    pass
                article3 = copy.deepcopy(article)
                article3['raw_comments'] = comments_li
                articles3.append(article3)
            if len(articles3) > 0:
                dates_articles3[date] = articles3
                counter += 1
            # flush intermediate results to disk every 10 dates
            if counter >= 10:
                dst = os.path.join(utilities.blm_html_1pass_dir,
                                   site + str(file_counter) + '.pkl')
                with open(dst, 'wb') as f:
                    pickle.dump(dates_articles3, f)
                dates_articles3 = {}
                counter = 0
                file_counter += 1
        # save remainders
        if counter > 0:
            dst = os.path.join(utilities.blm_html_1pass_dir,
                               site + str(file_counter + 1) + '.pkl')
            with open(dst, 'wb') as f:
                pickle.dump(dates_articles3, f)
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    print('Print outputs')
    recovered_articles_srcs = [
        x for x in utilities.get_files(utilities.blm_html_1pass_dir)
        if site in x
    ]
    recovered_articles = utilities.combine_dicts(recovered_articles_srcs)
    n_articles = utilities.count_articles(recovered_articles)
    print('Recovered %s on-topic articles with comments' % n_articles)
import utilities
import numpy as np
import os
import re
import sys

answer_fol = "..\\..\\data\\answers"
save_fol = "..\\..\\data\\subjects"

answer_files = utilities.get_files(answer_fol)
file = sys.argv[1]

# Get subject names from file name
sub_name = utilities.get_filename(file, start=4)

# Get time zero (t0) of the subject from the info.csv file.
# t0 represents the exact ms that the eeg recording started according to iMotions export file
with open(os.path.join(save_fol, sub_name, 'info.csv')) as csv:
    lines = csv.readlines()
    line = lines[2].split(',')
    t0 = int(re.sub("[^0-9]", "", line[2]))

f = open(file)
text = f.readlines()
f.close()

time_zero = 0
for ind, line in enumerate(text):
    # Search in text for this particular line which marks the start of the web application
    if "NavigateComplete\thttp://localhost/exp/main.php" in line:
        line = text[ind - 2].split('\t')
        time_zero = int(line[9])