Example #1
def crawl():
    print("arxivscrapper v1.6")
    cat = request.args.get('c')
    date_from = request.args.get('from')
    date_to = request.args.get('to')
    start = int(request.args.get("start", 0))
    limit = int(request.args.get("limit", 20))
    proxy = request.args.get("proxy", "")

    scraper = arxivscraper.Scraper(category=cat,
                                   date_from=date_from,
                                   date_until=date_to)
    if proxy:
        print("setting proxy: ", proxy)
        scraper.setProxy(proxy)  # example: http://xx.xx.xx.xx:80/

    try:
        # print("fetching category: "+cat+", from: "+date_from+", to: "+date_to)
        return Response(generate(scraper, limit),
                        content_type='application/json')

    except Exception as e:
        return jsonify({"error": str(e)}), status.HTTP_500_INTERNAL_SERVER_ERROR
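The generate() helper referenced above is not part of this excerpt; a minimal sketch of what it might look like, assuming it streams the scraped records as a JSON array and stops after `limit` items:

import json

def generate(scraper, limit):
    # Hypothetical streaming helper (not from the original source): yield the
    # scraped records one by one so the response can be streamed.
    yield '['
    for i, record in enumerate(scraper.scrape()):
        if i >= limit:
            break
        if i > 0:
            yield ','
        yield json.dumps(record, default=str)
    yield ']'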
Example #2
def main():
    print("running...")
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.GetoptError as err:
        print(err)
        print("for help use --help")
        sys.exit(2)
    # process options

    for o, a in opts:
        if o in ("-h", "--help"):
            print(
                "example: python scrapper.py ['category'] [date_from:yyyy-mm-dd] [date_to:yyyy-mm-dd]"
            )
            sys.exit(0)

    cat = ""
    if len(args) >= 1:
        cat = args[0]
    date_from = ""
    date_to = ""
    if len(args) >= 2:
        date_from = args[1]

    if len(args) >= 3:
        date_to = args[2]

    print("fetching category: " + cat + ", from: " + date_from + ", to: " +
          date_to)
    scraper = arxivscraper.Scraper(category=cat,
                                   date_from=date_from,
                                   date_until=date_to)
    ds = scraper.scrape()
    for row in ds:
        print(row.output())
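Presumably the original script also has the usual entry-point guard (not shown in this excerpt), so it can be run from the command line as described in its help text:

if __name__ == "__main__":
    # e.g. python scrapper.py physics:astro-ph 2019-01-01 2019-01-31
    main()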
Example #3
'''
A python program to retrieve records from ArXiv.org for
Social Media Mining course at Arizona State University.

Author: Jie Zhang.
'''

# To use arxivscraper directly in our script,
# we need to import it first.
import arxivscraper

# Create a scraper to fetch all eprints in category='math'
# Filtered by subcategory='math.DG' --- DG='Differential Geometry'
# Time span: 1 year starting with most recent
scraper = arxivscraper.Scraper(category='math',
                               date_from='2015-09-01',
                               date_until='2016-08-31',
                               t=30,
                               filters={'categories': ['math.DG']})
#scraper = arxivscraper.Scraper(category='math',date_from='2015-09-01',date_until='2016-08-31',t=30)

# Use the previously created instance of scraper to scrape the website
output = scraper.scrape()

# Save the output to required/desired format
import pandas as pd
#cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
cols = ('id', 'title', 'categories', 'authors')
df = pd.DataFrame(output, columns=cols)

df.to_csv('scrape2.txt', header=None, index=None, sep='$', mode='a')
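Because the file is written without a header row and with '$' as the field separator, it can be read back with the same column names (a sketch reusing the cols tuple defined above):

df_back = pd.read_csv('scrape2.txt', sep='$', header=None, names=cols)
print(df_back.head())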
Example #4
titles = []
ids = []
authors = []
categories = []
abstracts = []
dates = []
urls = []
print(parent_category, download_dates)
for i in tqdm(range(len(download_dates) - 1)):
    st_date = download_dates[i]
    end_date = download_dates[i + 1]
    for cat in parent_category:
        try:
            scraper = arxivscraper.Scraper(category=cat['type'],
                                           date_from=st_date,
                                           date_until=end_date)
            outputs = scraper.scrape()

            ref_set = set(cat['sub-type'])
            for out in tqdm(outputs):
                _set = set(out['categories'].split(" "))
                if len(ref_set - _set) != len(ref_set):
                    titles.append(out['title'])
                    ids.append(out['id'])
                    authors.append(out['authors'])
                    categories.append(out['categories'])
                    abstracts.append(out['abstract'])
                    dates.append(out['created'])
                    urls.append(out['url'])
        except Exception:
            # Skip this category/date window if the scrape fails.
            continue
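The inputs parent_category and download_dates are not defined in this excerpt; judging from how they are used, they are assumed to look roughly like the sketch below, after which the collected lists can be assembled into a DataFrame:

# Assumed input shapes (illustrative only, not from the original source):
# download_dates  = ['2020-01-01', '2020-02-01', '2020-03-01']
# parent_category = [{'type': 'cs', 'sub-type': ['cs.LG', 'cs.CL']}]

import pandas as pd

papers = pd.DataFrame({'id': ids, 'title': titles, 'authors': authors,
                       'categories': categories, 'abstract': abstracts,
                       'created': dates, 'url': urls})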
Example #5
import arxivscraper
import pandas as pd
import pickle

if __name__ == "__main__":
    data_path = "/data4/dheeraj/metaguide/"
    scraper = arxivscraper.Scraper(category='cs',
                                   date_from='2014-06-06',
                                   date_until='2019-06-07',
                                   timeout=86400)
    output = scraper.scrape()
    filtered_output = []
    for o in output:
        if len(o["categories"].strip().split()) > 1:
            continue
        filtered_output.append(o)
    cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created',
            'updated', 'authors')
    df = pd.DataFrame(filtered_output, columns=cols)
    print("Length of Dataframe", len(df))
    with open(data_path + "df_cs_2014.pkl", "wb") as f:
        pickle.dump(df, f)

    df.to_csv(data_path + "cs_2014.csv")
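    # Read-back sketch (not part of the original): reload the pickled
    # DataFrame and the CSV written above.
    with open(data_path + "df_cs_2014.pkl", "rb") as f:
        df_loaded = pickle.load(f)
    df_csv = pd.read_csv(data_path + "cs_2014.csv", index_col=0)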
Example #6
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 18 21:46:36 2018

@author: ronak
"""
import arxivscraper
import pandas as pd
import numpy as np

scraper = arxivscraper.Scraper(category='stat',
                               date_from='2017-07-21',
                               date_until='2017-08-10',
                               t=10,
                               filters={
                                   'categories': ['stat.ap'],
                                   'abstract': ['learning']
                               })
output = scraper.scrape()

cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated',
        'authors')
df = pd.DataFrame(output, columns=cols)

adjacency_list = {}


def add_to_adList(auth_list):
    for authors in auth_list:
        for author in authors:
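            # (The excerpt is cut off here.) One plausible body -- a sketch,
            # not the original code: map each author to the set of co-authors.
            adjacency_list.setdefault(author, set()).update(
                a for a in authors if a != author)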
Example #7
        'physics:quant-ph' : 'Quantum Physics',
        'math' : 'Mathematics',
        'stat' : 'Statistics',
        'q-bio' : 'Quantitative Biology',
        'q-fin' : 'Quantitative Finance',
        }

cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')


for categoria in cats:
    for anio in range(2000, 2020):
        fecha_inicio = str(anio) + '-01-01'
        fecha_fin = str(anio) + '-12-31'
        try:
            scraper = arxivscraper.Scraper(category=categoria,
                                           date_from=fecha_inicio,
                                           date_until=fecha_fin,
                                           t=30)
            output = scraper.scrape()
            df = pd.DataFrame(output, columns=cols)
            df['categoria'] = cats[categoria]
            df['indice'] = df.index
            vals = df.authors.values.tolist()
            rs = [len(r) for r in vals]
            a = np.repeat(df.index, rs)
            df_auth = pd.DataFrame(np.column_stack((a, np.concatenate(vals))),
                                   columns=['indice', 'authors'])
            df_auth['indice'] = df_auth.indice.astype(int)
            df_result = pd.merge(df.drop(['authors'], axis=1), df_auth,
                                 on=['indice'], how='inner')

            folder_path = 'datos/' + categoria.replace(':', '_') + '/'
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
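            # (The excerpt is cut off here.) Presumably the merged frame is
            # then written into folder_path; a sketch with a hypothetical
            # file name:
            df_result.to_csv(folder_path + str(anio) + '.csv', index=False)
        except Exception as e:
            print('failed:', categoria, anio, e)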
                    
Example #8
import arxivscraper
import pandas as pd

scraper = arxivscraper.Scraper(category='stat',
                               date_from='2002-09-01',
                               date_until='2017-09-01',
                               filters={'categories': ['stat.ML']})

output = scraper.scrape()

cols = ('id', 'authors')
df = pd.DataFrame(output, columns=cols)
print(len(df))
df.to_csv('~/Data/ml_15year.csv')
Example #9
import arxivscraper
import pandas as pd
import networkx as nx

#STEP 1 Scrape
###################################################
print('starting scraper')
scraper = arxivscraper.Scraper(category='cs',
                               date_from='2017-05-29',
                               date_until='2017-06-01')
output = scraper.scrape()
cols = ('title', 'authors')
df = pd.DataFrame(output, columns=cols)

#df.to_csv('out.csv', sep=',')
#df = pd.read_csv('out.csv')

#STEP 2 Social Network Creation and Visualization
###################################################

##convert col to list of authors per book
#indexs for 2 things combins while making combos
import numpy as np
from itertools import combinations

#authors = df['authors']
authors = df.authors.astype(str)
title = df['title']

#print (type(authors)) #nparray
authors = authors.str.replace('[', '', regex=False)  # '[' is a regex metacharacter; replace it literally
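The excerpt ends here; a sketch (not the original code) of where the network-creation step appears to be heading: split each author string back into names and connect every co-author pair.

G = nx.Graph()
for auth_str in authors:
    names = [a.strip(" '\"]") for a in auth_str.split(',') if a.strip(" '\"]")]
    for u, v in combinations(names, 2):
        G.add_edge(u, v)
print(G.number_of_nodes(), 'authors,', G.number_of_edges(), 'co-author links')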
Example #10
# import the required packages
import numpy as np
import pandas as pd
import arxivscraper
import matplotlib.pyplot as plt
import networkx as nx
import collections

### Part 1: Scraping the data from arxiv
# Create scraper, scrape arxiv database, store output as Pandas data frame

scraper = arxivscraper.Scraper(category='physics:astro-ph',
                               date_from='2017-04-24',
                               date_until='2017-05-05')
output = scraper.scrape()
cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated',
        'authors')
df = pd.DataFrame(output, columns=cols)
#df.head()

### Part 2: Create the network
G = nx.Graph()
df2 = df[['authors']]
#df2.head()
m = 0
n = 0
G.clear()
numrows = df2.shape[0]

# add each author name as a node
for i in range(numrows):
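    # (The excerpt is cut off here.) A plausible body -- a sketch, not the
    # original code: add each author of row i as a node and connect every
    # pair of co-authors with an edge.
    row_authors = df2.iloc[i]['authors']
    for u in row_authors:
        G.add_node(u)
        for v in row_authors:
            if u != v:
                G.add_edge(u, v)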
Example #11
        exit(1)

    # Set inputs for retrieving arXiv records
    category = sys.argv[1]
    date_from = sys.argv[2]
    date_until = str(datetime.date.today())

    graph_name = "graph_{}_{}.xml.gz".format(category, date_from)
    co_auth_graph = Graph(directed=False)
    try:
        co_auth_graph.load(graph_name)
    except:
        print("[DEBUG] Graph data does not exist. Scraping ArXiv!")
        # Retrieve the records
        scraper = arxivscraper.Scraper(category=category,
                                       date_from=date_from,
                                       date_until=date_until)
        output = scraper.scrape()

        # Store it in a panda dataframe
        cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created',
                'updated', 'authors')
        df = pandas.DataFrame(output, columns=cols)

        # Create an adj list for authorship
        co_auth_adj_list = {}
        for author_list in df['authors']:
            for u in author_list:
                for v in author_list:
                    if not u == v:
                        try:
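                            # (The excerpt is cut off here.) A plausible
                            # continuation -- a sketch, not the original code:
                            # count how many papers u and v have co-authored.
                            co_auth_adj_list[(u, v)] += 1
                        except KeyError:
                            co_auth_adj_list[(u, v)] = 1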
Example #12
# Scrapes arXiv.

import arxivscraper
import pandas as pd

#orig was 2018- 5/27
scraper = arxivscraper.Scraper(category='cs',
                               date_from='2019-05-01',
                               date_until='2019-10-26',
                               t=1,
                               filters={'categories': ['cs.lg']})

output = scraper.scrape()

cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
df = pd.DataFrame(output, columns=cols)

# Save the scraped records to a CSV file.
df.to_csv('path2.csv')
Example #13
def axir_data():
    # Read the category list; each line holds tab-separated fields, the
    # second of which is the arXiv category id.
    with open('categories.txt', 'r') as fh:
        f = fh.read().split('\n')

    # Split each line on tabs and drop empty fields.
    for i in range(len(f)):
        f[i] = [field for field in f[i].split('\t') if field != '']
    #print (f)
    output = []
    for i in f:
        print(i[1])
        scraper = arxivscraper.Scraper(
            category=i[1],
            date_from=str(
                (datetime.datetime.now() - datetime.timedelta(1)).date()),
            date_until=str(datetime.datetime.now().date()))
        output.append(scraper.scrape())

    cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created',
            'updated', 'authors')
    df = pd.DataFrame([], columns=cols)
    for i in output:
        try:
            # DataFrame.append is deprecated; use pd.concat instead.
            df = pd.concat([df, pd.DataFrame(i, columns=cols)])
        except Exception:
            pass
    df.reset_index(drop=True, inplace=True)

    df = df.rename(columns={'abstract': 'Abstract'})
    df = df.rename(columns={'created': 'Date'})
    df = df.rename(columns={'title': 'Title'})
    df['Types'] = 'academic'
    df['Site'] = 'arxiv'
    df['Source'] = None

    # Join each paper's author list into a single comma-separated string.
    df['authors'] = df['authors'].apply(lambda names: ', '.join(names))
    df = df.rename(columns={'authors': 'Authors'})

    var = []
    for i in range(len(df.id)):
        u = 'https://arxiv.org/abs/'
        var.append(u + df.id[i])

    for i in range(len(df.doi)):
        df.loc[i, 'doi'] = 'http://doi.org/' + df.doi[i]

    df = df.rename(columns={'doi': 'Ref'})

    df['Url'] = var

    var = []
    for i in range(len(df.id)):
        u = 'https://arxiv.org/pdf/'
        var.append(u + df.id[i])

    df['Pdf_url'] = var
    #print (df)

    df = df.where(pd.notnull(df), np.nan)
    for i in df.index:
        try:
            # DataFrame.append is deprecated; take row i as a one-row frame.
            t = df.loc[[i]].reset_index(drop=True)
            try:
                count = search(t.loc[0]['Title'], t.loc[0]['Site'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test, t.loc[0]['Site'])
                    print('Data sent')
                else:
                    print('Skipped')
            except:
                test = t.loc[0].to_json()
                send_data(test, t.loc[0]['Site'])

        except Exception as e:
            print(e)
    print('info fetched')