Example #1
def pub_data():
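    """Fetch PubMed articles for each query listed in pubmedcat.txt and push them to the backend.

    Assumes search_query(), fetch_details(), search() and send_data() are defined
    elsewhere in this module.
    """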
    with open('pubmedcat.txt', 'r') as fh:
        f = fh.read().split('\n')
    s=[]
    
    for j in f:
        results = search_query(j)
        id_list = results['IdList']
        papers = fetch_details(id_list)
        
        for i in papers['PubmedArticle']:
            s.append(json.dumps(i, indent=2, separators=(',', ':')))
    
    
    
    author=[]
    title = []
    date = []
    types = []
    source = []
    site = []
    url = []
    ref = []
    pdf_url = []
    abstract = []
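    # Extract each field per record, falling back to None when a field is missing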
    
    for le in range(len(s)):
        data =json.loads(s[le])['MedlineCitation']['Article']
        try:
            url.append('https://pubmed.ncbi.nlm.nih.gov/'+json.loads(s[le])['MedlineCitation']['PMID'])
        except:
            url.append(None)
        
        try:
            abstract.append(data['Abstract']['AbstractText'][0])
        except:
            abstract.append(None)
        
        try:
            pdf_url.append('http://doi.org/'+data['ELocationID'][0])
        except:
            pdf_url.append(None)
        
        site.append('pubmed')
        
        try:
            issn = 'ISSN: '+ data['Journal']['ISSN']
            tit = data['Journal']['Title']
            vol = 'volume'+' '+ data['Journal']['JournalIssue']['Volume']
            yr = data['Journal']['JournalIssue']['PubDate']['Year']
            
            ref.append(tit+'('+issn+')'+','+vol+'('+yr+')')
        except:
            ref.append(None)
        
        try:
            source.append(data['Journal']['Title'])
        except:
            source.append(None)
        
        types.append('academic')
        
        try:
            completed = json.loads(s[le])['MedlineCitation']['DateCompleted']
            date.append(completed['Year'] + '-' + completed['Month'] + '-' + completed['Day'])
        except:
            date.append(None)
        
        try:
            title.append(data['ArticleTitle'])
        except:
            title.append(None)
        
        try:
            # Join author full names into one comma-separated string
            aut = data['AuthorList']
            author.append(', '.join(i['ForeName'] + ' ' + i['LastName'] for i in aut))
        except:
            author.append(None)
            
    
    df = pd.DataFrame({
        'Authors': author, 'Title': title, 'Date': date, 'Types': types,
        'Source': source, 'Site': site, 'Url': url, 'Ref': ref,
        'Pdf_url': pdf_url, 'Abstract': abstract
    })
    df = df.where(pd.notnull(df), np.nan)
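    # Push rows one at a time; search() is assumed to return how many matching
    # records already exist, and only rows with fewer than 25 matches are sent.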
    for i in df.index:
        try:
            # Select the row directly (DataFrame.append was removed in pandas 2.0)
            t = df.loc[[i]].reset_index(drop=True)
            try:
                count = search(t.loc[0]['Title'],t.loc[0]['Site'])
                print(count)
                if count < 25 :
                    test =t.loc[0].to_json()
                    send_data(test,t.loc[0]['Site'])
                    print('Data sent')
                else:
                    print('Skipped')
            except:
                test =t.loc[0].to_json()
                send_data(test,t.loc[0]['Site'])
                
        except Exception as e:
            print(e)
    print('info fetched')
Example #2
def grab_data():
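    """Scrape the TechCrunch front page and push each post to the backend.

    Assumes the module-level `url` points at https://techcrunch.com/ and that
    search(), send_data() and start() are defined elsewhere in this module.
    """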
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    links_unread = soup.find_all(
        'div', {'class': 'post-block post-block--image post-block--unread'})
    links_read = soup.find_all(
        'div', {'class': 'post-block post-block--image post-block--read'})

    # Collect unread post blocks first, then read ones
    var = list(links_unread) + list(links_read)

    headline = []
    timestamp = []
    AUTHORS = []
    SUMMARY = []
    date_crawled = []
    news_source = []
    full = []
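    # For each post block, pull the headline, byline, summary, crawl date and full article text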

    for i in range(len(var)):
        title = var[i].find('a', {'class': 'post-block__title__link'}).text
        title = title.replace('\n', '')
        title = title.replace('\t', '')
        headline.append(title)

        time = var[i].find('time', {'class': 'river-byline__time'}).text
        time = time.replace('\n', '')
        time = time.replace('\t', '')
        timestamp.append(time)

        author = var[i].find('span', {'class': 'river-byline__authors'})
        author = author.find_all('a')

        # Join all byline authors with ', '
        author = ', '.join(j.text for j in author)

        author = author.replace('\n', '')
        author = author.replace('\t', '')
        AUTHORS.append(author)

        summary = var[i].find('div', {'class': 'post-block__content'}).text
        summary = summary.replace('\n', '')
        summary = summary.replace('\t', '')
        SUMMARY.append(summary)

        date_craw = str(datetime.datetime.today().date())
        date_crawled.append(date_craw)

        source = 'https://techcrunch.com/'
        news_source.append(source)

        full_article_url = var[i].find(
            'a', {'class': 'post-block__title__link'})['href']

        data = requests.get(full_article_url)
        soup = BeautifulSoup(data.text, 'html.parser')

        result = soup.find('div', {'class': 'article-content'})

        # Fall back to an empty string if the article body container is missing
        full_text = result.text if result else ''

        full.append(full_text)

    final = pd.DataFrame({
        'Title': headline,
        'Author': AUTHORS,
        'Summary': SUMMARY,
        'full_text': full,
        'date_published': timestamp,
        'date_crawled': date_crawled,
        'news_source': news_source
    })

    for i in final.index:
        try:
            t = final.loc[[i]].reset_index(drop=True)
            try:
                count = search(t.loc[0]['Title'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test)
                    print('Data sent')
                else:
                    print('Skipped')
            except:
                test = t.loc[0].to_json()
                send_data(test)

        except Exception as e:
            print(e)
    start()
Example #3
def grab_data():
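    """Crawl every link on the sites listed in `websites`, keep the ones that look
    like full articles, and push the parsed records to the backend.

    Assumes `websites`, newspaper(), search() and send_data() are defined elsewhere
    in this module and that chromedriver is installed at /usr/bin/chromedriver.
    """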
    asli = []
    
    for j in websites:
        response = requests.get(j)
            
        soup = BeautifulSoup(response.text,'html.parser')
        
        url = soup.find_all('a')
        
        
        # Keep only the href strings; anchors without an href are dropped
        hrefs = []
        for a in url:
            try:
                hrefs.append(a['href'])
            except KeyError:
                pass
        url = hrefs
        
        # Drop duplicate links while preserving order
        var = []
        for i in url:
            if i not in var:
                var.append(i)
        url = var
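        # urlparsed.txt records the URLs handled on previous runs so they can be skipped below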
       
    
        try:
            f  = open('urlparsed.txt','r')
            already_parsed = f.read().split('\n')
            f.close()
        except:
            # First run: record every URL found this time
            with open('urlparsed.txt', 'w') as f:
                for i in url:
                    f.write(str(i) + '\n')
           
        try:
            for i in already_parsed:
                try:
                    url.remove(i)
                except:
                    pass
            
            for i in url:
                already_parsed.append(i)
            
            # Rewrite the file with the full set of parsed URLs
            with open('urlparsed.txt', 'w') as f:
                for i in already_parsed:
                    f.write(str(i) + '\n')
        except:
            pass
            
        for i in url:
            try:
                
                
                # Resolve relative links against the site root
                if not i.startswith('http'):
                    i = j + i
                
                #print('\n',i)
                    
                response = requests.get(i,timeout=10)
                details = newspaper(i)
                count = len(details.article)
                publish_date = details.date_publish
                cr_date = details.date_download
                description = details.description
                summary = details.summary
                category = details.category
                # Keep only pages with a substantial body and a usable summary/description
                if count > 1500 and (len(description) > 10 or len(summary) > 10):
                    asli.append(i)
            except:
                pass
    
    
    
    
    headline=[]
    timestamp=[]
    AUTHORS =[]
    SUMMARY=[]
    date_crawled = []
    news_source = [] 
    full = []
    img_url = []
    keywords=[]
    url_news=[]
    types = []
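    # Visit each candidate article with headless Chrome to grab an image URL;
    # newspaper() supplies the text, authors, keywords and dates.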
    for i in asli:
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            driver = webdriver.Chrome('/usr/bin/chromedriver',chrome_options = chrome_options)
            driver.get(i)
            details = newspaper(i)
            
            if 'bbc' in i:
                news_source.append('bbc')
            elif 'techcrunch' in i:
                news_source.append('techcrunch')
            elif 'theguardian' in i:
                news_source.append('theguardian')
            elif 'voanews' in i:
                news_source.append('voanews')
            elif 'abc.net' in i:
                news_source.append('abc')
            else:
                # Keep the column lengths aligned even for an unrecognised site
                news_source.append(None)
                
            headline.append(details.headline)
            timestamp.append(details.date_publish)
            url_news.append(i)
            types.append('newspaper')
            # Join author names into a single comma-separated string
            AUTHORS.append(', '.join(details.authors))
            
            # Join keywords into a single comma-separated string
            keywords.append(', '.join(details.keywords))
            
            if len(details.summary) > 10:
                SUMMARY.append(details.summary)
            else:
                SUMMARY.append(details.description)
            
            date_crawled.append(details.date_download)
            
            
            full.append(details.article)
            try:
                # Use the first .jpg on the page as the article image
                im = None
                for img in driver.find_elements_by_tag_name('img'):
                    src = img.get_attribute('src')
                    if src and '.jpg' in src:
                        im = src
                        break
                img_url.append(im)
            except:
                img_url.append(None)
           # print('Done inside')
            
            driver.close()
        except:
            try:
                driver.close()
            except:
                pass
            pass
    
    final = pd.DataFrame({'Title':headline,'Author':AUTHORS,'Summary':SUMMARY,
                              'full_text':full,'date_published':timestamp, 'date_crawled':date_crawled,
                              'news_source':news_source,'img':img_url,'keywords':keywords,'url_news':url_news,'Types':types})
        
    for i in final.index:
        try:
            t = final.loc[[i]].reset_index(drop=True)
            try:
                count = search(t.loc[0]['Title'],t.loc[0]['news_source'])
                #print(count)
                if count is None or count < 25:
                    test =t.loc[0].to_json()
                    send_data(test,t.loc[0]['news_source'])
                    #print('Data sent')
                else:
                    pass
                    #print('Skipped')
            except:
                test =t.loc[0].to_json()
                send_data(test,t.loc[0]['news_source'])
                
        except Exception as e:
            pass
Example #4
def grab(test, count):
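    """Scrape the Guardian article URLs in `test` with Selenium and push them to the backend.

    Handles both the older content__article-body markup and the newer css-* layout.
    Assumes search() and send_data() are defined elsewhere in this module; the
    incoming `count` argument is not read here.
    """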
    headline = []
    timestamp = []
    AUTHORS = []
    SUMMARY = []
    date_crawled = []
    news_source = []
    full = []

    for k in range(0, len(test)):
        try:
            driver = webdriver.Chrome(
                '/home/priyanshu/project 10/chromedriver')
            driver.get(test[k])
            try:
                title = driver.find_element_by_tag_name('h1').text

                author = driver.find_elements_by_class_name('tone-colour')

                # Join contributor names with ', ' (no trailing separator)
                author = ', '.join(i.text for i in author)

                time = driver.find_element_by_class_name(
                    'content__dateline').text.split('\n')[0]

                try:
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    content = soup.find(
                        'div', {
                            'class':
                            'content__article-body from-content-api js-article__body'
                        }).text

                    replace = soup.find(
                        'div', {
                            'class': 'after-article js-after-article'
                        }).next.next.text

                    replacement = soup.find_all('aside')
                    for i in replacement:
                        content = content.replace(i.text, '')

                    content = content.replace(
                        soup.find('div', {
                            'class': 'submeta'
                        }).text, '')

                    content = content.replace(replace, '')
                    summary = soup.find('div', {
                        'class': 'content__standfirst'
                    }).text
                    if len(summary) > 300:
                        summary = summary[:300]
                except:
                    summary = soup.find('div', {
                        'class': 'content__standfirst'
                    }).text
                    content = summary
                    if len(summary) > 300:
                        summary = summary[:300]

                headline.append(title)
                AUTHORS.append(author)
                timestamp.append(time)
                SUMMARY.append(summary)
                full.append(content)

                date = str(datetime.datetime.today().date())
                date_crawled.append(date)

                source = 'https://www.theguardian.com/'
                news_source.append(source)
                driver.close()
            except:
                try:

                    title = driver.find_element_by_tag_name('h1').text

                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    author = soup.find_all('address',
                                           {'aria-label': 'Contributor info'})

                    # Join contributor names with ', ' (no trailing separator)
                    author = ', '.join(i.text for i in author)

                    time = soup.find('div', {
                        'class': 'css-1kkxezg'
                    }).text.replace(
                        soup.find('span', {
                            'class': 'css-nyo8hb'
                        }).text, '')

                    try:
                        soup = BeautifulSoup(driver.page_source, 'html.parser')
                        content = soup.find('div', {
                            'class':
                            'article-body-commercial-selector css-79elbk'
                        }).text

                        replace = soup.find('section', {
                            'class': 'css-q5digb'
                        }).text

                        replacement = soup.find_all('div',
                                                    {'class': 'css-wz7t6r'})
                        for i in replacement:
                            content = content.replace(i.text, '')

                        content = content.replace(
                            soup.find('div', {
                                'class': 'css-739uag'
                            }).text, '')

                        content = content.replace(replace, '')
                        summary = soup.find('div', {
                            'class': 'css-12nmdsr'
                        }).text
                        if len(summary) > 300:
                            summary = summary[:300]
                    except:
                        summary = soup.find(
                            'div', {
                                'class': 'content__standfirstcss-12nmdsr'
                            }).text
                        content = summary
                        if len(summary) > 300:
                            summary = summary[:300]

                    if title not in headline:
                        headline.append(title)
                    AUTHORS.append(author)
                    timestamp.append(time)
                    SUMMARY.append(summary)
                    full.append(content)

                    date = str(datetime.datetime.today().date())
                    date_crawled.append(date)

                    source = 'https://www.theguardian.com/'
                    news_source.append(source)
                    driver.close()
                except:
                    print('Passed: ', test[k])
                    driver.close()

        except Exception as err:
            print(err)
            driver.close()
            pass

    final = pd.DataFrame({
        'Title': headline,
        'Author': AUTHORS,
        'Summary': SUMMARY,
        'full_text': full,
        'date_published': timestamp,
        'date_crawled': date_crawled,
        'news_source': news_source
    })

    for i in final.index:
        try:
            t = final.loc[[i]].reset_index(drop=True)
            try:
                count = search(t.loc[0]['Title'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test)
                    print('Data sent')
                else:
                    print('Skipped')
            except:
                test = t.loc[0].to_json()
                send_data(test)
                print('Data sent')
        except Exception as e:
            print(e)
Example #5
def axir_data():
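    """Scrape the last day's arXiv submissions for each category in categories.txt
    and push them to the backend.

    Assumes the arxivscraper package plus the search() and send_data() helpers are
    available; each line of categories.txt is tab-separated with the arXiv category
    code in its second field.
    """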
    with open('categories.txt', 'r') as fh:
        f = fh.read().split('\n')

    # Each line is tab-separated; drop the empty fields left by the split
    for i in range(len(f)):
        f[i] = [field for field in f[i].split('\t') if field]
    #print (f)
    output = []
    for i in f:
        print(i[1])
        scraper = arxivscraper.Scraper(
            category=i[1],
            date_from=str(
                (datetime.datetime.now() - datetime.timedelta(1)).date()),
            date_until=str(datetime.datetime.now().date()))
        output.append(scraper.scrape())

    cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created',
            'updated', 'authors')
    df = pd.DataFrame([], columns=cols)
    for i in output:
        try:
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
            df = pd.concat([df, pd.DataFrame(i, columns=cols)])
        except:
            pass
    df.reset_index(drop=True, inplace=True)

    df = df.rename(columns={'abstract': 'Abstract'})
    df = df.rename(columns={'created': 'Date'})
    df = df.rename(columns={'title': 'Title'})
    df['Types'] = 'academic'
    df['Site'] = 'arxiv'
    df['Source'] = None

    # Flatten each paper's author list into a comma-separated string
    df['authors'] = df['authors'].apply(', '.join)
    df = df.rename(columns={'authors': 'Authors'})

    # Build the abstract-page, DOI and PDF links from the id / doi columns
    df['Url'] = 'https://arxiv.org/abs/' + df['id']
    df['doi'] = 'http://doi.org/' + df['doi']
    df = df.rename(columns={'doi': 'Ref'})
    df['Pdf_url'] = 'https://arxiv.org/pdf/' + df['id']
    #print (df)

    df = df.where(pd.notnull(df), np.nan)
    for i in df.index:
        try:
            t = df.loc[[i]].reset_index(drop=True)
            try:
                count = search(t.loc[0]['Title'], t.loc[0]['Site'])
                print(count)
                if count < 25:
                    test = t.loc[0].to_json()
                    send_data(test, t.loc[0]['Site'])
                    print('Data sent')
                else:
                    print('Skipped')
            except:
                test = t.loc[0].to_json()
                send_data(test, t.loc[0]['Site'])

        except Exception as e:
            print(e)
    print('info fetched')