def arxiv_cleanup(working_folder, earliest_date=None, latest_date=None, remove_columns=None):
    """
    Cleans the crawl results from arxiv.

    :param working_folder: Folder containing the files
    :type working_folder: str
    :param remove_columns: Columns to be removed from the crawled dataframe.
                           If none given, default is to remove
                           [u'abstract', u'affiliations', u'link_abstract',
                            u'link_doi', u'link_pdf', u'comment']
    :type remove_columns: list of str
    :param earliest_date: Articles before this date are removed
    :type earliest_date: datetime
    :param latest_date: Articles after this date are removed
    :type latest_date: datetime

    :return: None
    """
    config = logging_confdict(working_folder, __name__ + "_cleanup")
    logging.config.dictConfig(config)
    arxiv_logger = logging.getLogger(__name__ + "_cleanup")

    # Read in stage_1 raw file
    try:
        stage_1_raw = pd.read_json(working_folder + "/stage_1_raw.json")
    except Exception:
        arxiv_logger.exception("Could not load stage_1_raw file. Exiting...")
        sys.exit("Could not load stage_1_raw file")
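
# Example usage (a sketch; the folder name and date bounds are hypothetical and
# should point at a stage 1 crawl folder created by arxiv_crawl):
#
#     arxiv_cleanup(base_directory + "2016-01-01_12-00-00",
#                   earliest_date=datetime.datetime(2015, 1, 1),
#                   latest_date=datetime.datetime(2015, 12, 31))
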
def crossref_cleanup(working_folder, earliest_date=None, latest_date=None, remove_columns=None):
    """
    Cleans the crawl results from crossref.

    :param working_folder: Folder containing the files
    :type working_folder: str
    :param remove_columns: Columns to be removed from the crawled dataframe.
                           If none given, default is None
    :type remove_columns: list of str
    :param earliest_date: Articles before this date are removed
    :type earliest_date: datetime
    :param latest_date: Articles after this date are removed
    :type latest_date: datetime

    :return: None
    """
    config = logging_confdict(working_folder, __name__ + "_cleanup")
    logging.config.dictConfig(config)
    cr_logger = logging.getLogger(__name__ + "_cleanup")

    # Read in stage_2 raw file
    try:
        stage_2_raw = pd.read_json(working_folder + "/stage_2_raw.json")
    except Exception:
        cr_logger.exception("Could not load stage_2_raw file")
        sys.exit("Could not load stage_2_raw file")
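
# Example usage (a sketch; the nested folder name is hypothetical and should
# point at a stage 2 folder containing stage_2_raw.json):
#
#     crossref_cleanup(base_directory + "2016-01-01_12-00-00/2016-01-02_08-30-00",
#                      earliest_date=datetime.datetime(2015, 1, 1))
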
def arxiv_crawl(crawling_list, limit=None, batchsize=100, submission_range=None, update_range=None, delay=None):
    """
    This is a python wrapper for the aRxiv "arxiv_search" function.

    If submission_range or update_range are given, the results are filtered
    according to the date ranges.

    :param crawling_list: The subcategories to crawl. NOT "stat" -> USE "stat.AP" etc...
    :type crawling_list: dict of lists.
    :param limit: Max number of results to return.
    :type limit: int.
    :param batchsize: Number of queries per request.
    :type batchsize: int.
    :param submission_range: The range of submission dates.
    :type submission_range: Tuple (start,end).
    :param update_range: The range of last-update dates.
    :type update_range: Tuple (start,end).

    :returns: The created folder
    """
    # Timestamp of starting datetime
    ts_start = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts_start).strftime('%Y-%m-%d_%H-%M-%S')

    # Create folder structure
    working_folder = base_directory + timestamp
    os.makedirs(working_folder)
    os.makedirs(working_folder + "/temp_files")

    # Setup logging
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    arxiv_logger = logging.getLogger(__name__)

    arxiv_logger.info("Starting new crawl for {}".format(str(crawling_list)))
    arxiv_logger.info("Created new folder: <<" + working_folder + ">>")

    # Load R-scripts
    arxiv_logger.debug("Loading R-Scripts ...")
    try:
        with open('../r_scripts/arxiv.R', 'r') as f:
            string = ''.join(f.readlines())
        arxiv_crawler = SignatureTranslatedAnonymousPackage(string, "arxiv_crawler")
    except Exception:
        arxiv_logger.exception("Error while loading R-Scripts.")
        sys.exit('Could not load R-Scripts!')
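
# Example usage (a sketch; the exact shape of the category dict is an assumption
# based on the ":type crawling_list: dict of lists" docstring, values are
# subcategories such as "stat.AP"):
#
#     folder = arxiv_crawl({"stat": ["stat.AP", "stat.ML"]},
#                          limit=500, batchsize=100)
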
def test_merge(timestamp):
    """
    Call manually if automatic merging of json files fails.

    :param timestamp: The timestamp of the crawl process that failed to merge the temporary json
    :return: <str> - Working folder
    """
    working_folder = base_directory + timestamp

    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    arxiv_logger = logging.getLogger(__name__)

    from path import Path

    temp_files = list(Path(working_folder + "/temp_files/").files("*.json"))
    try:
        temp_jsons = []
        for idx, temp_file in enumerate(temp_files):
            arxiv_logger.debug(temp_file)
            with open(temp_file) as data_file:
                temp = json.load(data_file)
            temp_jsons.append(temp)

        # Merge every remaining temp file onto the first one, re-numbering the
        # document ids with a running offset
        temp_json = temp_jsons[0]
        for d in temp_jsons[1:]:
            for key, val_dict in d.items():
                new_dict = {}
                offset = len(temp_json[key].values())
                for doc_id in val_dict.keys():
                    new_doc_id = offset + int(doc_id)
                    new_dict[new_doc_id] = val_dict.pop(doc_id)
                temp_json[key].update(new_dict)

        print("Length of concatenated dataset: ", len(temp_json['id'].keys()))

        result_df = pd.DataFrame.from_dict(temp_json)
        result_df.index = range(0, len(result_df.index))
        result_df = result_df.fillna(np.nan)
        result_df.to_json(working_folder + "/stage_1_raw.json")
    except Exception:
        arxiv_logger.exception("Error during concatenation of temporary objects")

    return working_folder
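
# Example usage (a sketch; the timestamp is hypothetical and must match the name
# of the crawl folder whose temp_files failed to merge):
#
#     working_folder = test_merge("2016-01-01_12-00-00")
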
def crossref_crawl(num_processes=1, num_threads=1, input_folder=None, continue_folder=None):
    """
    DOI Lookup interfaces to different DOI providers. Currently implemented: CrossRef.
    To-Do: DataCite

    The stage 1 dataset is split into equally sized subframes. Each is given to a
    subprocess that accesses the crossref API with multiple threads. Possible candidate
    documents are matched with the original arxiv documents using the Levenshtein ratio
    (Schloegl et al).

    :param num_processes: Number of processes to split the initial stage_1_dataset
    :param num_threads: Number of threads each process uses to access crossref API
    :param input_folder: The folder containing the stage 1 data. If not given, the most
                         recent folder will be used
    :param continue_folder: An existing stage 2 folder whose interrupted crawl should be continued

    :returns: pd.DataFrame - newly found DOIs with original indices
    """
    ts_start = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts_start).strftime('%Y-%m-%d_%H-%M-%S')

    # Create folder structure
    if not input_folder:
        all_subdirs = [d for d in Path(base_directory).listdir() if d.isdir()]
        latest_subdir = max(all_subdirs, key=Path.getmtime)
        base_folder = latest_subdir + "/"
    else:
        base_folder = input_folder
        if base_folder[-1] != "/":
            base_folder += "/"

    if continue_folder:
        working_folder = continue_folder
        temp_folder = working_folder + "/temp/"
    else:
        working_folder = base_folder + timestamp
        temp_folder = working_folder + "/temp/"
        Path(working_folder).mkdir()
        Path(temp_folder).mkdir()

    skip_indices = set()
    if continue_folder:
        # Setup logging
        config = logging_confdict(working_folder, __name__)
        logging.config.dictConfig(config)
        cr_logger = logging.getLogger(__name__)
        cr_logger.info("Continuing crawl in <<" + working_folder + ">>")

        # Indices that already have a final True/False result are skipped
        for temp_file in Path(temp_folder).files("*.csv"):
            with open(temp_file, "rb") as tempfile:
                r = csv.reader(tempfile, delimiter=";")
                for line in r:
                    if len(line) == 6:
                        if line[-1] == "False" or line[-1] == "True":
                            skip_indices.add(int(line[0]))
    else:
        # Setup logging
        config = logging_confdict(working_folder, __name__)
        logging.config.dictConfig(config)
        cr_logger = logging.getLogger(__name__)
        cr_logger.info("\nCreated new folder: <<" + working_folder + ">>")

    # Read in stage 1 file
    cr_logger.debug("\nReading in stage_1.json ... (Might take a few seconds)")
    try:
        stage_1 = pd.read_json(base_folder + "/stage_1.json")
    except Exception:
        cr_logger.exception("Problem occurred while reading stage_1.json")
        sys.exit("Could not read stage_1 file")

    stage_1.sort_index(inplace=True)
    stage_1['submitted'] = pd.to_datetime(stage_1['submitted'], unit="ms")
    stage_1.index = range(0, len(stage_1.index))

    crawl_stage_1 = stage_1.drop(skip_indices)

    cr_logger.info("\nSpawning {} processes - output will be cluttered... :S\n".format(num_processes))

    # Split df into n sub-dataframes for n processes
    df_ranges = range(0, len(crawl_stage_1.index), len(crawl_stage_1.index) // num_processes + 1)
    df_ranges = df_ranges + [len(crawl_stage_1.index)]
    pool_args = []
    if len(df_ranges) == 1:
        indices = []
        authors = []
        titles = []
        submitted = []
        pool_args.append([indices, authors, titles, submitted])
    else:
        for idx in range(num_processes):
            cr_logger.info("Starting process {}".format(idx))
            indices = crawl_stage_1.iloc[range(df_ranges[idx], df_ranges[idx + 1])].index.values
            authors = crawl_stage_1.iloc[range(df_ranges[idx], df_ranges[idx + 1])].authors
            titles = crawl_stage_1.iloc[range(df_ranges[idx], df_ranges[idx + 1])].title
            submitted = crawl_stage_1.iloc[range(df_ranges[idx], df_ranges[idx + 1])].submitted
            pool_args.append([indices, authors, titles, submitted])

    pool = mp.Pool(processes=num_processes)
    for x in pool_args:
        pool.apply_async(crossref_lookup,
                         args=(working_folder, x[0], x[1], x[2], x[3], num_threads))
    pool.close()
    pool.join()

    cr_logger.info("All processes finished")

    # Collect the per-process temp csv files
    output = []
    for temp_file in Path(temp_folder).files("*.csv"):
        with open(temp_file, "rb") as tempfile:
            r = csv.reader(tempfile, delimiter=";")
            for line in r:
                if len(line) == 6:
                    result = {'idx': int(line[0]),
                              'cr_title': line[1],
                              'cr_doi': line[3],
                              'lr': line[4]}
                    if line[-1] == "False":
                        result['cr_title'] = np.nan
                        result['cr_doi'] = np.nan
                    output.append(result)

    cr_data = pd.DataFrame(output)
    cr_data = cr_data.set_index("idx", drop=True)

    cr_logger.info("\nMerging stage_1 dataset and crossref results")
    stage_2_raw = pd.merge(stage_1, cr_data, left_index=True, right_index=True, how='left')
    print(stage_2_raw)

    stage_2_raw.sort_index(inplace=True)

    try:
        stage_2_raw.to_json(working_folder + "/stage_2_raw.json")
        stage_2_raw.to_csv(working_folder + "/stage_2_raw.csv", encoding="utf-8",
                           sep=Config.get("csv", "sep_char"), index=False)
    except Exception:
        cr_logger.exception("Could not write all output files")
def mendeley_crawl(stage1_dir=None, stage2_dir=None, num_threads=1):
    """
    Retrieve mendeley documents based on arxiv ids and dois.

    If the arxiv id and the doi yield different mendeley documents, the one with
    more identifiers is preferred.

    :param stage1_dir: The name of the Stage 1 folder to use. If None, the last created will be used
    :param stage2_dir: The name of the Stage 2 folder to use. If None, the last created will be used
    :param num_threads: Number of threads to use

    :return: working_folder as absolute path
    """
    ts_start = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts_start).strftime('%Y-%m-%d_%H-%M-%S')

    # Start mendeley session
    session = start_mendeley_session(Config._sections['mndly_auth'])

    # Create folder structure
    if not stage1_dir:
        all_subdirs = [base_directory + d for d in os.listdir(base_directory)
                       if os.path.isdir(base_directory + d)]
        latest_subdir = max(all_subdirs, key=os.path.getmtime)
        stage1_dir = latest_subdir + "/"
    else:
        stage1_dir += "/"

    if not stage2_dir:
        all_subdirs = [stage1_dir + d for d in os.listdir(stage1_dir)
                       if os.path.isdir(stage1_dir + d)]
        latest_subdir = max(all_subdirs, key=os.path.getmtime)
        stage2_dir = latest_subdir + "/"
    else:
        stage2_dir = stage1_dir + stage2_dir + "/"

    working_folder = stage2_dir + timestamp
    if not os.path.exists(working_folder):
        os.makedirs(working_folder)

    # Create logger
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    logger = logging.getLogger(__name__)

    # Read in stage 2 file
    input_df = pd.read_json(stage2_dir + "stage_2.json")
    input_df.sort_index(inplace=True)

    input_q = Queue.Queue()
    output_q = Queue.Queue()

    for idx, row in input_df.iterrows():
        input_q.put((idx, row))

    mndly_threads = []
    for i in range(0, num_threads):
        thread = MendeleyThread(logger, input_q, output_q, len(input_df.index), session)
        thread.start()
        mndly_threads.append(thread)

    for thread in mndly_threads:
        thread.join()

    output_dicts = []
    while not output_q.empty():
        output_dicts.append(output_q.get_nowait())

    # ================= TEMPORARY HACK ==============
    # Re-extract clean arxiv ids from the original 'id' column
    arxiv_ids = []
    for original_arxiv in input_df['id'].values:
        found_regex = regex_new_arxiv.findall(original_arxiv)
        if found_regex:
            arxiv_id = found_regex[0]
        else:
            found_regex = regex_old_arxiv.findall(original_arxiv)
            if found_regex:
                arxiv_id = found_regex[0]
            else:
                arxiv_id = "parse_failed"
        arxiv_ids.append(arxiv_id)
    input_df['arxiv_id'] = pd.Series(arxiv_ids, index=input_df.index)
    # ================= TEMPORARY HACK ==============

    stage_3_raw = pd.DataFrame(output_dicts)
    stage_3_raw = pd.merge(left=input_df, right=stage_3_raw,
                           left_on="arxiv_id", right_on="arxiv_id", how="outer")

    stage_3_raw['submitted'] = pd.to_datetime(stage_3_raw['submitted'], unit="ms")
    stage_3_raw['updated'] = pd.to_datetime(stage_3_raw['updated'], unit="ms")

    del stage_3_raw['abstract']

    try:
        stage_3_raw.to_json(working_folder + "/stage_3_raw.json")
        stage_3_raw.to_csv(working_folder + "/stage_3_raw.csv", encoding="utf-8",
                           sep=Config.get("csv", "sep_char"), index=False)
    except Exception:
        logger.exception("Could not write all output files")

    return working_folder
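
# Example usage (a sketch; with both directories omitted the most recently
# created stage 1 and stage 2 folders are picked up automatically):
#
#     working_folder = mendeley_crawl(num_threads=4)
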
def ads_crawl_category(list_of_cats, number_of_docs=100, num_threads=1):
    """
    Crawl ADS for the most-read documents of the given arXiv categories.

    :param list_of_cats: <list> - Categories to crawl
    :param number_of_docs: <int> - Number of docs to crawl
    :param num_threads: <int> - Number of ADS-Crawl threads to start

    :return: <str> - Working folder
    """
    timestamp = arrow.utcnow().to('Europe/Vienna').format('YYYY-MM-DD_HH-mm-ss')

    base_folder = base_directory
    working_folder = base_folder + timestamp
    Path(working_folder).mkdir()

    # Setup logging
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    ads_logger = logging.getLogger(__name__)
    ads_logger.info("\nCreated new folder: <<" + working_folder + ">>")

    input_queue = Queue.Queue()
    output_queue = Queue.Queue()

    for count, cat in enumerate(list_of_cats):
        payload = {'q': 'arxiv_class:"{}"'.format(cat),
                   'sort': 'read_count desc',
                   'fl': 'reader,title,abstract,'
                         'year,author,pub,read_count,'
                         'citation_count,identifier,arxiv_class,'
                         'primary_arxiv_class,arxiv_primary_class,'
                         'primary_class',
                   'rows': number_of_docs}
        input_queue.put((count, payload))

    threads = []
    for i in range(num_threads):
        thread = ADSThread(input_queue, output_queue, ads_logger)
        thread.start()
        threads.append(thread)

    ads_logger.debug("THREADING STARTED - PLEASE BE PATIENT")

    for thread in threads:
        thread.join()

    rows = []
    while not output_queue.empty():
        temp = output_queue.get_nowait()
        for doc in temp:
            # doc['url'] = "http://arxiv.org/abs/" + cat
            try:
                doc['authors'] = ";".join(doc['author'])
                del doc['author']
            except KeyError:
                doc['authors'] = []
            if 'reader' not in doc:
                doc['reader'] = []
            doc['readers'] = int(doc['read_count'])
            doc['reader_ids'] = u";".join(doc['reader'])
            doc['title'] = doc['title'][0]
            del doc['read_count']
            del doc['reader']
            rows.append(doc)

    # Convert to pandas dataframe
    df = pd.DataFrame(rows)

    # Rename columns
    df.rename(columns={'pub': 'published_in', 'abstract': 'paper_abstract'}, inplace=True)
    df.index.name = "id"

    # Output
    # ads_logger.debug("SAVING FILE")
    # df.to_csv(working_folder + "/ads_data.csv", sep=";", encoding='utf8', index=False)
    # df.to_json(working_folder + "/ads_data.json")

    return working_folder
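
# Example usage (a sketch; the categories are illustrative):
#
#     folder = ads_crawl_category(["stat.ML", "cs.DL"],
#                                 number_of_docs=200, num_threads=4)
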
def ads_crawl_dataset(input_folder=None, number_of_docs=100, num_threads=1):
    """
    Uses an existing dataframe containing arxiv_id's to crawl corresponding ADS data.
    Always uses the top *number_of_docs* documents for the search.

    :param input_folder: Input folder
    :param number_of_docs: Number of documents to use
    :param num_threads: Number of threads

    :return: Newly created working folder
    """
    timestamp = arrow.utcnow().to('Europe/Vienna').format('YYYY-MM-DD_HH-mm-ss')

    # Create folder structure
    if not input_folder:
        all_subdirs = [d for d in Path(base_directory).listdir() if d.isdir()]
        latest_subdir = max(all_subdirs, key=Path.getmtime)
        base_folder = latest_subdir + "/"
    else:
        # base_folder = base_directory + input_folder
        base_folder = input_folder
        if base_folder[-1] != "/":
            base_folder += "/"

    working_folder = base_folder + timestamp
    Path(working_folder).mkdir()

    # Setup logging
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    ads_logger = logging.getLogger(__name__)
    ads_logger.info("\nCreated new folder: <<" + working_folder + ">>")

    # Read in stage 3 file
    ads_logger.debug("\nReading in stage_3_raw.json ... (Might take a few seconds)")
    try:
        df = pd.read_json(base_folder + "/stage_3_raw.json")
    except IOError:
        ads_logger.exception("stage_3_raw.json does not exist")
        sys.exit()

    df.sort(columns="reader_count", ascending=False, inplace=True)
    df.index = range(0, len(df.index))

    arxiv_ids = df['arxiv_id'][0:number_of_docs].tolist()

    input_queue = Queue.Queue()
    output_queue = Queue.Queue()

    for count, arxiv_id in enumerate(arxiv_ids):
        found_regex = regex_new_arxiv.findall(arxiv_id)
        if found_regex:
            arxiv_id = found_regex[0]
        else:
            found_regex = regex_old_arxiv.findall(arxiv_id)
            if found_regex:
                arxiv_id = found_regex[0]
        payload = {'q': 'arXiv:{}'.format(arxiv_id),
                   'sort': 'read_count desc'}
        input_queue.put((count, payload))

    threads = []
    for i in range(num_threads):
        thread = ADSThread(input_queue, output_queue, ads_logger)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    rows = []
    while not output_queue.empty():
        temp = output_queue.get_nowait()[0]
        temp['url'] = "http://arxiv.org/abs/" + "none_currently"
        try:
            temp['authors'] = ";".join(temp['author'])
            del temp['author']
        except KeyError:
            temp['authors'] = []
        if 'reader' not in temp:
            temp['reader'] = []
        temp['readers'] = int(temp['read_count'])
        temp['reader_ids'] = u";".join(temp['reader'])
        temp['title'] = temp['title'][0]
        del temp['read_count']
        del temp['reader']
        rows.append(temp)

    # Convert to pandas dataframe
    df = pd.DataFrame(rows)

    # Rename columns
    df.rename(columns={'pub': 'published_in', 'abstract': 'paper_abstract'}, inplace=True)
    df.index.name = "id"

    # Output
    df.to_csv(working_folder + "/ads_data.csv", sep=Config.get("csv", "sep_char"),
              encoding='utf8', index=False)
    df.to_json(working_folder + "/ads_data.json")

    return working_folder
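
# Example usage (a sketch; the folder name is hypothetical and must contain a
# stage_3_raw.json produced by mendeley_crawl):
#
#     folder = ads_crawl_dataset(input_folder=base_directory + "2016-01-01_12-00-00",
#                                number_of_docs=100, num_threads=4)
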