def crawl():
    print("arxivscrapper v1.6")
    cat = request.args.get('c')
    date_from = request.args.get('from')
    date_to = request.args.get('to')
    start = int(request.args.get("start", 0))
    limit = int(request.args.get("limit", 20))
    # Bug fix: the original read request.args.get("proxy" ""), where the
    # adjacent string literals concatenate into the single key "proxy" and
    # no default is passed; the default must be a second argument.
    proxy = request.args.get("proxy", "")
    scraper = arxivscraper.Scraper(category=cat, date_from=date_from, date_until=date_to)
    if proxy != "":  # the original `is not ""` compares identity, not equality
        print("setting proxy: ", proxy)
        scraper.setProxy(proxy)  # example: http://xx.xx.xx.xx:80/
    try:
        # print("fetching category: "+cat+", from: "+date_from+", to: "+date_to)
        return Response(generate(scraper, limit), content_type='application/json')
    except Exception as e:
        # Bug fix: the original jsonify({error: e.message}) used the
        # undefined name `error`, and `e.message` does not exist in Python 3.
        return jsonify({"error": str(e)}), status.HTTP_500_INTERNAL_SERVER_ERROR
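# The endpoint above returns Response(generate(scraper, limit), ...) but
# `generate` is not defined in this snippet. Below is a minimal sketch of
# what such a streaming helper could look like, assuming scraper.scrape()
# returns a list of JSON-serializable dict records; the name `generate` is
# taken from the call site, but this body is an assumption, not the
# original implementation:
import json

def generate(scraper, limit):
    # Yield a JSON array incrementally so Flask can stream large result sets
    # instead of buffering everything in memory.
    yield "["
    records = scraper.scrape()
    for i, record in enumerate(records[:limit]):
        if i > 0:
            yield ","
        yield json.dumps(record)
    yield "]"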
def main():
    print("running...")
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.GetoptError as err:
        print(err)
        print("for help use --help")
        sys.exit(2)
    # process options
    for o, a in opts:
        if o in ("-h", "--help"):
            print("example: python scrapper.py ['category'] [date_from:yyyy-mm-dd] [date_to:yyyy-mm-dd]")
            sys.exit(0)
    cat = ""
    if len(args) >= 1:
        cat = args[0]
    date_from = ""
    date_to = ""
    if len(args) >= 2:
        date_from = args[1]
    if len(args) >= 3:
        date_to = args[2]
    print("fetching category: " + cat + ", from: " + date_from + ", to: " + date_to)
    scraper = arxivscraper.Scraper(category=cat, date_from=date_from, date_until=date_to)
    ds = scraper.scrape()
    for row in ds:
        print(row.output())
'''
A Python program to retrieve records from arXiv.org
for the Social Media Mining course at Arizona State University.
Author: Jie Zhang.
'''
# To use arxivscraper directly in our script,
# we need to import it first.
import arxivscraper

# Create a scraper to fetch all eprints in category='math',
# filtered by subcategory='math.DG' (DG = Differential Geometry).
# Time span: one year, ending with the most recent date.
scraper = arxivscraper.Scraper(category='math', date_from='2015-09-01',
                               date_until='2016-08-31', t=30,
                               filters={'categories': ['math.DG']})
#scraper = arxivscraper.Scraper(category='math',date_from='2015-09-01',date_until='2016-08-31',t=30)

# Use the previously created instance of scraper to scrape the website
output = scraper.scrape()

# Save the output to the required/desired format
import pandas as pd
#cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
cols = ('id', 'title', 'categories', 'authors')
df = pd.DataFrame(output, columns=cols)
df.to_csv('scrape2.txt', header=None, index=None, sep='$', mode='a')
titles = []
ids = []
authors = []
categories = []
abstracts = []
dates = []
urls = []

print(parent_category, download_dates)
for i in tqdm(range(len(download_dates) - 1)):
    st_date = download_dates[i]
    end_date = download_dates[i + 1]
    for cat in parent_category:
        try:
            scraper = arxivscraper.Scraper(category=cat['type'],
                                           date_from=st_date,
                                           date_until=end_date)
            outputs = scraper.scrape()
            ref_set = set(cat['sub-type'])
            for out in tqdm(outputs):
                _set = set(out['categories'].split(" "))
                # Keep the record if it shares at least one subcategory
                # with the reference set.
                if len(ref_set - _set) != len(ref_set):
                    titles.append(out['title'])
                    ids.append(out['id'])
                    authors.append(out['authors'])
                    categories.append(out['categories'])
                    abstracts.append(out['abstract'])
                    dates.append(out['created'])
                    urls.append(out['url'])
        except:
            # The handler body was truncated in the original; `continue`
            # keeps the loop going past date ranges that fail to scrape.
            continue
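# The snippet above only accumulates parallel lists; what becomes of them is
# not shown. A minimal sketch of one plausible next step, assuming the goal
# (as in the other scripts in this collection) is a pandas DataFrame saved
# to disk; the column names and output path below are assumptions:
import pandas as pd

df = pd.DataFrame({
    'id': ids,
    'title': titles,
    'authors': authors,
    'categories': categories,
    'abstract': abstracts,
    'created': dates,
    'url': urls,
})
df.to_csv('scraped_records.csv', index=False)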
import arxivscraper
import pandas as pd
import pickle

if __name__ == "__main__":
    data_path = "/data4/dheeraj/metaguide/"
    scraper = arxivscraper.Scraper(category='cs', date_from='2014-06-06',
                                   date_until='2019-06-07', timeout=86400)
    output = scraper.scrape()

    # Keep only records that belong to exactly one category.
    filtered_output = []
    for o in output:
        if len(o["categories"].strip().split()) > 1:
            continue
        filtered_output.append(o)

    cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
    df = pd.DataFrame(filtered_output, columns=cols)
    print("Length of Dataframe", len(df))

    with open(data_path + "df_cs_2014.pkl", "wb") as f:
        pickle.dump(df, f)
    df.to_csv(data_path + "cs_2014.csv")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 18 21:46:36 2018

@author: ronak
"""
import arxivscraper
import pandas as pd
import numpy as np

scraper = arxivscraper.Scraper(category='stat', date_from='2017-07-21',
                               date_until='2017-08-10', t=10,
                               filters={
                                   'categories': ['stat.ap'],
                                   'abstract': ['learning']
                               })
output = scraper.scrape()
cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
df = pd.DataFrame(output, columns=cols)

adjacency_list = {}

def add_to_adList(auth_list):
    for authors in auth_list:
        for author in authors:
            # (The loop body was truncated in the original. A minimal
            # completion, assuming the intent is to record each author's
            # co-authors in adjacency_list; this body is an assumption,
            # not the original code.)
            coauthors = [a for a in authors if a != author]
            adjacency_list.setdefault(author, []).extend(coauthors)
    'physics:quant-ph': 'Quantum Physics',
    'math': 'Mathematics',
    'stat': 'Statistics',
    'q-bio': 'Quantitative Biology',
    'q-fin': 'Quantitative Finance',
}

cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')

for categoria in cats:
    for anio in range(2000, 2020):
        fecha_inicio = str(anio) + '-01-01'
        fecha_fin = str(anio) + '-12-31'
        try:
            scraper = arxivscraper.Scraper(category=categoria,
                                           date_from=fecha_inicio,
                                           date_until=fecha_fin, t=30)
            output = scraper.scrape()
            df = pd.DataFrame(output, columns=cols)
            df['categoria'] = cats[categoria]
            df['indice'] = df.index

            # Explode the per-paper author lists into one row per author,
            # then join back on the paper index.
            vals = df.authors.values.tolist()
            rs = [len(r) for r in vals]
            a = np.repeat(df.index, rs)
            df_auth = pd.DataFrame(np.column_stack((a, np.concatenate(vals))),
                                   columns=['indice', 'authors'])
            df_auth['indice'] = df_auth.indice.astype(int)
            df_result = pd.merge(df.drop(['authors'], axis=1), df_auth,
                                 on=['indice'], how='inner')

            folder_path = 'datos/' + categoria.replace(':', '_') + '/'
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
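            # (The original fragment ends after creating the folder. A
            # plausible final step, writing the per-year result into that
            # folder; the file-name pattern and the except clause are
            # assumptions, not the original code.)
            df_result.to_csv(folder_path + str(anio) + '.csv', index=False)
        except Exception as e:
            print(e)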
import arxivscraper
import pandas as pd

scraper = arxivscraper.Scraper(category='stat', date_from='2002-09-01',
                               date_until='2017-09-01',
                               filters={'categories': ['stat.ML']})
output = scraper.scrape()
cols = ('id', 'authors')
df = pd.DataFrame(output, columns=cols)
print(len(df))  # the original used the Python 2 print statement
df.to_csv('~/Data/ml_15year.csv')
import arxivscraper
import pandas as pd
import networkx as nx

# STEP 1: Scrape
###################################################
print('starting scraper')
scraper = arxivscraper.Scraper(category='cs', date_from='2017-05-29', date_until='2017-06-01')
output = scraper.scrape()
cols = ('title', 'authors')
df = pd.DataFrame(output, columns=cols)
#df.to_csv('out.csv', sep=',')
#df = pd.read_csv('out.csv')

# STEP 2: Social Network Creation and Visualization
###################################################
## convert column to a list of authors per paper;
# indexes for the two things combined while making combos
import numpy as np
from itertools import combinations

#authors = df['authors']
authors = df.authors.astype(str)
title = df['title']
#print (type(authors))  # nparray
authors = authors.str.replace('[', '')
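# (The original snippet ends mid-cleanup. A hedged sketch of the remaining
# steps implied by the Step 2 comments above: finish stripping the list
# formatting, split each author string, and add co-author pairs as graph
# edges. The exact cleanup and variable names below are assumptions.)
authors = authors.str.replace(']', '').str.replace("'", '')
author_lists = [a.split(',') for a in authors]

G = nx.Graph()
for paper_authors in author_lists:
    names = [name.strip() for name in paper_authors]
    G.add_nodes_from(names)
    # every pair of co-authors on the same paper gets an edge
    G.add_edges_from(combinations(names, 2))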
# import the required packages
import numpy as np
import pandas as pd
import arxivscraper
import matplotlib.pyplot as plt
import networkx as nx
import collections

### Part 1: Scraping the data from arxiv
# Create scraper, scrape arxiv database, store output as Pandas data frame
scraper = arxivscraper.Scraper(category='physics:astro-ph', date_from='2017-04-24', date_until='2017-05-05')
output = scraper.scrape()
cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
df = pd.DataFrame(output, columns=cols)
#df.head()

### Part 2: Create the network
G = nx.Graph()
df2 = df[['authors']]
#df2.head()
m = 0
n = 0
G.clear()
numrows = df2.shape[0]

# add each author name as a node
for i in range(numrows):
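    # (The loop body was truncated in the original. A minimal completion of
    # the step described in the comment above: add each author as a node,
    # then connect co-authors of the same paper. The details below are
    # assumptions, not the original code.)
    names = df2.iloc[i]['authors']
    for name in names:
        G.add_node(name)
    # connect every pair of co-authors on this paper
    for u in names:
        for v in names:
            if u != v:
                G.add_edge(u, v)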
    exit(1)

# Set inputs for retrieving arXiv records
category = sys.argv[1]
date_from = sys.argv[2]
date_until = str(datetime.date.today())
graph_name = "graph_{}_{}.xml.gz".format(category, date_from)

co_auth_graph = Graph(directed=False)
try:
    co_auth_graph.load(graph_name)
except:
    print("[DEBUG] Graph data does not exist. Scraping ArXiv!")
    # Retrieve the records
    scraper = arxivscraper.Scraper(category=category, date_from=date_from, date_until=date_until)
    output = scraper.scrape()

    # Store it in a pandas DataFrame
    cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
    df = pandas.DataFrame(output, columns=cols)

    # Create an adjacency list for authorship
    co_auth_adj_list = {}
    for author_list in df['authors']:
        for u in author_list:
            for v in author_list:
                if not u == v:
                    try:
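                        # (Truncated in the original. A minimal completion,
                        # assuming the usual adjacency-list pattern: append
                        # the co-author, creating the list on first sight.
                        # This is an assumption, not the original code.)
                        co_auth_adj_list[u].append(v)
                    except KeyError:
                        co_auth_adj_list[u] = [v]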
# scrapes arxiv
import arxivscraper
import pandas as pd

# orig was 2018- 5/27
# Note: dates zero-padded to the yyyy-mm-dd form the arXiv OAI interface
# expects (the original had '2019-5-01').
scraper = arxivscraper.Scraper(category='cs', date_from='2019-05-01',
                               date_until='2019-10-26', t=1,
                               filters={'categories': ['cs.lg']})
output = scraper.scrape()
cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
df = pd.DataFrame(output, columns=cols)

# save to a file
df.to_csv('path2.csv')  # this is good
def axir_data():
    # Read the category list: one tab-separated entry per line.
    f = open('categories.txt', 'r')
    f = f.read().split('\n')
    for i in range(0, len(f)):
        f[i] = f[i].split('\t')
        for j in range(f[i].count('')):
            try:
                f[i].remove('')
            except:
                pass
    #print (f)

    # Scrape yesterday's records for every category.
    output = []
    for i in f:
        print(i[1])
        scraper = arxivscraper.Scraper(
            category=i[1],
            date_from=str((datetime.datetime.now() - datetime.timedelta(1)).date()),
            date_until=str(datetime.datetime.now().date()))
        output.append(scraper.scrape())

    cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
    df = pd.DataFrame([], columns=cols)
    for i in output:
        try:
            df = df.append(pd.DataFrame(i, columns=cols))
        except:
            pass
    df.reset_index(drop=True, inplace=True)

    # Normalize column names and add source metadata.
    df = df.rename(columns={'abstract': 'Abstract'})
    df = df.rename(columns={'created': 'Date'})
    df = df.rename(columns={'title': 'Title'})
    df['Types'] = 'academic'
    df['Site'] = 'arxiv'
    df['Source'] = None

    # Join each record's author list into a single comma-separated string
    # (the original built the string character by character).
    for i in range(len(df.authors)):
        df.authors[i] = ', '.join(df.authors[i])
    df = df.rename(columns={'authors': 'Authors'})

    # Build abstract, DOI, and PDF links.
    var = []
    for i in range(len(df.id)):
        var.append('https://arxiv.org/abs/' + df.id[i])
    for i in range(len(df.doi)):
        df.doi[i] = 'http://doi.org/' + df.doi[i]
    df = df.rename(columns={'doi': 'Ref'})
    df['Url'] = var
    var = []
    for i in range(len(df.id)):
        var.append('https://arxiv.org/pdf/' + df.id[i])
    df['Pdf_url'] = var
    #print (df)
    df = df.where(pd.notnull(df), np.nan)

    # Send each record to the backend, skipping titles already stored
    # 25 or more times.
    for i in df.index:
        try:
            t = pd.DataFrame()
            t = t.append(df.loc[i])
            t.reset_index(drop=True, inplace=True)
            try:
                count = search(t.loc[0]['Title'], t.loc[0]['Site'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test, t.loc[0]['Site'])
                    print('Data sent')
                else:
                    print('Skipped')
            except:
                # If the duplicate lookup fails, send the record anyway.
                test = t.loc[0].to_json()
                send_data(test, t.loc[0]['Site'])
        except Exception as e:
            print(e)
    print('info fetched')
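# The function above calls search() and send_data(), which are not defined
# in this snippet. A minimal sketch of what they might look like, assuming a
# REST backend that deduplicates by title; the endpoint URL, parameters, and
# response shape are all assumptions, not the original implementation:
import requests

API_BASE = "http://localhost:8000"  # hypothetical backend address

def search(title, site):
    # Return how many stored records already match this title for the site.
    resp = requests.get(API_BASE + "/search",
                        params={"title": title, "site": site})
    return len(resp.json())

def send_data(payload, site):
    # Post one record (already serialized to a JSON string) for storage.
    requests.post(API_BASE + "/records/" + site,
                  data=payload,
                  headers={"Content-Type": "application/json"})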