# Load data
data = pickle.load(open(this_file, 'rb'))
progress_bar = ChargingBar("Evaluating file {} - {}/{}".format(
    this_filename, this_file_idx + 1, len(test_filepaths)),
    suffix="%(percent)d%%")

if realistic_check:
    # Split the data into fixed-size slices
    sign = [
        data['data'][i:i + slice_samples]
        for i in range(0, len(data['data']), slice_samples)
    ]
    progress_bar.max = len(sign)
    progress_bar.start()

    for this_slice_idx, this_slice in enumerate(sign):
        """
        if this_slice_idx % (round(len(sign) / 20)) == 0:
            print("{}/{} slices evaluated.".format(this_slice_idx, len(sign)))
        """
        progress_bar.next()

        # Without this check, the last slice could have a different size and fail
        if len(this_slice) == slice_samples:
            # Analysis: reshape to (batch, channel, samples) for the model
            this_data = torch.Tensor(this_slice)
            this_data = this_data.view(1, 1, slice_samples)
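# The fragment above is part of a larger evaluation script; below is a minimal,
# self-contained sketch of the same windowed-evaluation pattern: split a signal
# into fixed-size slices, reshape each full slice to (batch, channel, samples),
# and run it through a model.  DummyModel, slice_samples = 1024, and the random
# signal are illustrative placeholders, not the project's actual network or data.
import torch
import torch.nn as nn


class DummyModel(nn.Module):
    """Stand-in for the trained network (assumption, for illustration only)."""

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(1, 4, kernel_size=5),  # 1 input channel, matching view(1, 1, ...)
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(4, 2),                 # two illustrative output classes
        )

    def forward(self, x):
        return self.net(x)


slice_samples = 1024                         # assumed window length
signal = torch.randn(10_000).tolist()        # stand-in for data['data']

model = DummyModel()
model.eval()

with torch.no_grad():
    # Consecutive, non-overlapping windows over the signal
    slices = [signal[i:i + slice_samples]
              for i in range(0, len(signal), slice_samples)]
    for this_slice in slices:
        # Skip the trailing window if it is shorter than slice_samples
        if len(this_slice) != slice_samples:
            continue
        x = torch.Tensor(this_slice).view(1, 1, slice_samples)
        prediction = model(x).argmax(dim=1).item()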
def scrape(self, term):
    """
    Scrapes metadata of PubMed articles returned by search term query,
    processes abstracts, and stores relevant articles

    :param term: PubMed term query
    """
    print(
        f'Collection: {self._collection.database.name}.{self._collection.name}. Database: PubMed. Term: {term}.'
    )
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
    retmax = 10000
    no_id = 0
    unreadable = 0
    abstracts = []
    articles = []
    page = 0
    total = retmax

    # progress bar
    bar = ChargingBar('Getting metadata:',
                      max=total,
                      suffix='%(index)d of %(max)d - %(elapsed_td)s')

    while page < total:
        # gets and stores to history UIDs of query
        url = f'{base}/esearch.fcgi?db=pubmed&term={term}&retstart={page}'
        url += f'&retmax={retmax}&usehistory=y&api_key={PUBMED_API_KEY}'
        response = requests.get(url)
        if not response.ok:
            print(
                f'\nPubmedScraper could not get UIDs for \'{term}\' on page {page}.'
            )
            continue

        # gets info for retrieving UIDs from history
        soup = BeautifulSoup(response.content, 'html.parser')
        web = soup.webenv.string
        key = soup.querykey.string
        total = int(soup.count.string)
        bar.max = total

        # gets metadata for articles from UIDs
        url = f'{base}/efetch.fcgi?db=pubmed&WebEnv={web}'
        url += f'&query_key={key}&retstart={page}&retmax={retmax}'
        url += f'&retmode=xml&api_key={PUBMED_API_KEY}'
        response = requests.get(url)
        if not response.ok:
            print(
                f'\nPubmedScraper could not get metadata for \'{term}\' on page {page}.'
            )
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        entries = soup.find_all('pubmedarticle')

        for article in entries:
            # ignore abstract if doi and uid are null
            doi = self._get_string(
                article.find('elocationid', eidtype='doi'))
            uid = self._get_string(article.find('pmid'))
            if not doi and not uid:
                no_id += 1
                bar.next()
                continue

            # store abstract text for use by mat2vec below
            abstract = self._remove_html(article.abstracttext)

            # continues if paper does not have abstract
            if not abstract:
                unreadable += 1
                bar.next()
                continue

            # segments abstract by sentence
            doc = self.nlp(abstract)
            sentences = []
            is_unreadable = False
            for sent in doc.sents:
                # processes sentence text using processor from mat2vec
                try:
                    tokens, materials = self.processor.process(sent.text)
                except OverflowError:
                    is_unreadable = True
                    break
                processed_sent = ' '.join(tokens)
                sentences.append(processed_sent)

            # if processor (from above) throws an error, skip the paper
            if is_unreadable:
                bar.next()
                unreadable += 1
                continue

            processed_abstract = '\n'.join(sentences)

            article = {
                'doi': doi,
                'uid': uid,
                'title': self._remove_html(article.articletitle),
                'abstract': abstract,
                'creators': self._get_authors(article.find_all('author')),
                'publication_name': self._remove_html(article.journal.title),
                'issn': self._get_string(article.find('issn', issntype='Print')),
                'eissn': self._get_string(article.find('issn', issntype='Electronic')),
                'publication_date': self._get_date(article.articledate),
                'database': 'pubmed',
                'processed_abstract': processed_abstract,
            }
            articles.append(article)
            abstracts.append(processed_abstract)
            bar.next()

            # classify abstracts if 20000 have been stored
            if len(abstracts) == 20000:
                self._store(articles, abstracts)
                articles = []
                abstracts = []

        page += retmax
    bar.finish()

    # unreadable papers
    print(f'No DOI/UID: {no_id}')
    print(f'Unreadable papers: {unreadable}')

    # classifies and stores metadata
    if abstracts:
        self._store(articles, abstracts)
        print()
    else:
        print('No abstracts to classify.\n')
        return

    # prints classifier metrics
    for classifier in self._classifiers:
        classifier.print_metrics()
        classifier.reset_metrics()

    # prints general tag metrics
    if self._save:
        print(f'Total articles analyzed: {self._gen_total}.')
        print(
            f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
        )
        print()
        self._gen_new = 0
        self._gen_total = 0
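# A standalone sketch of the NCBI E-utilities "history server" pattern that the
# scrape() method above relies on: esearch.fcgi with usehistory=y stores the
# matching UIDs on the server and returns a WebEnv/query_key pair, and
# efetch.fcgi then pages through those UIDs with retstart/retmax.  The search
# term and page size are illustrative, and no API key is passed here (which only
# lowers the allowed request rate).
import requests
from bs4 import BeautifulSoup

base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
term = 'perovskite+solar+cell'      # illustrative query
retmax = 200                        # small page size for the example

search = requests.get(
    f'{base}/esearch.fcgi?db=pubmed&term={term}&usehistory=y&retmax=0')
search_soup = BeautifulSoup(search.content, 'html.parser')
webenv = search_soup.webenv.string
query_key = search_soup.querykey.string
count = int(search_soup.count.string)

for retstart in range(0, min(count, 600), retmax):
    fetch = requests.get(
        f'{base}/efetch.fcgi?db=pubmed&WebEnv={webenv}'
        f'&query_key={query_key}&retstart={retstart}&retmax={retmax}'
        f'&retmode=xml')
    page_soup = BeautifulSoup(fetch.content, 'html.parser')
    titles = [t.get_text() for t in page_soup.find_all('articletitle')]
    print(f'{retstart}: fetched {len(titles)} titles')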
def scrape_faster(self, query):
    """
    Note: requires institutional access by VPN, or else an error will be thrown

    Faster implementation of scrape(). Scrapes metadata of Elsevier
    (ScienceDirect) articles returned by query, processes abstracts, and
    stores relevant articles

    :param query: Elsevier database query
    """
    print(
        f'Collection: {self._collection.database.name}.{self._collection.name}. Database: Elsevier. Query: {query}.'
    )

    # create url
    url = f'https://api.elsevier.com/content/metadata/article?query=KEY({query})&apiKey={ELSEVIER_API_KEY}&httpAccept=application%2Fjson'

    articles = []
    abstracts = []
    no_doi = 0
    unreadable = 0
    item = 0
    total = 5000

    # progress bar
    bar = ChargingBar('Getting metadata:',
                      max=total,
                      suffix='%(index)d of %(max)d - %(elapsed_td)s')

    while item < total:
        response = requests.get(url)
        if response.ok:
            data = json.loads(response.content)['search-results']
            records = data['entry']

            # updates total to total number of papers in query
            if item == 0:
                total = min(5000, int(data['opensearch:totalResults']))
                bar.max = total

                # if there are no results, exit
                if total == 0:
                    print('Search returned no results.\n')
                    return

            for record in records:
                doi = record.get('prism:doi')
                if not doi:
                    no_doi += 1
                    bar.next()
                    continue

                abstract = record.get('prism:teaser')

                # if there is no abstract, skip this article
                if not abstract:
                    unreadable += 1
                    bar.next()
                    continue

                # segments abstract by sentence
                doc = self.nlp(abstract)
                sentences = []
                is_unreadable = False

                # processes sentence text using mat2vec processor
                for sent in doc.sents:
                    try:
                        tokens, materials = self.processor.process(sent.text)
                    except OverflowError:
                        is_unreadable = True
                        break
                    processed_sent = ' '.join(tokens)
                    sentences.append(processed_sent)

                # if processor (from above) throws an error, skip the paper
                if is_unreadable:
                    unreadable += 1
                    bar.next()
                    continue

                processed_abstract = '\n'.join(sentences)

                # create new document and store new article document if not in collection
                article = {
                    'doi': doi,
                    'uid': None,
                    'title': record.get('dc:title'),
                    'abstract': abstract,
                    'url': record.get('prism:url'),
                    'creators': self._get_creators(record.get('dc:creator')),
                    'publication_name': record.get('prism:publicationName'),
                    'issn': record.get('prism:issn'),
                    'publication_date': self._get_date(record.get('prism:coverDate')),
                    'database': 'elsevier',
                    'processed_abstract': processed_abstract
                }
                articles.append(article)
                abstracts.append(processed_abstract)
                bar.next()

            # sets url to next page in search
            url = data['link'][-2]['@href']

        # json file has 25 items per page, so go to the next page
        item += 25
    bar.finish()

    # papers skipped because of a missing DOI or unreadable abstract
    print(f'No DOI: {no_doi}')
    print(f'Unreadable papers: {unreadable}')

    # classifies and stores metadata
    if abstracts:
        self._store(articles, abstracts)
        print()
    else:
        print('No abstracts to classify.\n')
        return

    # prints classifier metrics
    for classifier in self._classifiers:
        classifier.print_metrics()
        classifier.reset_metrics()

    # prints general tag metrics
    if self._save:
        print(f'Total articles analyzed: {self._gen_total}.')
        print(
            f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
        )
        print()
        self._gen_new = 0
        self._gen_total = 0
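# The paging logic above (and in the DOI-collecting scrape() further below) picks
# the next-page URL as data['link'][-2]['@href'], which assumes a fixed ordering
# of the link entries in the Elsevier response.  A sketch of a slightly more
# defensive lookup, under the assumption that each link entry carries an '@ref'
# label such as 'next':
def next_page_url(search_results):
    """Return the '@href' of the link labelled 'next', or None if absent."""
    for link in search_results.get('link', []):
        if link.get('@ref') == 'next':
            return link.get('@href')
    return None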
def scrape(self, subject='', keyword=''):
    """
    Scrapes metadata of Springer Nature articles returned by subject and
    keyword query, processes abstracts, and stores relevant articles

    :param subject: subject constraint query, if empty does not include subject constraint to query
    :param keyword: keyword constraint query, if empty does not include keyword constraint to query
    """
    # prints subject and keyword used in the query
    subject_print = subject if subject else 'None'
    keyword_print = keyword if keyword else 'None'
    print(
        f'Collection: {self._collection.database.name}.{self._collection.name}. Database: Springer Nature. Subject: {subject_print}, Keyword: {keyword_print}.'
    )

    articles = []
    abstracts = []
    unreadable = 0
    no_doi = 0
    item = 0
    total = 100

    # progress bar
    bar = ChargingBar('Getting metadata:',
                      max=total,
                      suffix='%(index)d of %(max)d - %(elapsed_td)s')

    while item < total:
        # builds url and queries API
        url = self._url_builder(item, subject, keyword)
        response = requests.get(url)
        if response.ok:
            data = json.loads(response.content)
            records = data['records']

            # updates total to total number of papers in query
            if item == 0:
                total = int(data['result'][0]['total'])
                bar.max = total

            # gets metadata
            for record in records:
                # ignore abstract if doi is null
                doi = record.get('doi')
                if not doi:
                    no_doi += 1
                    bar.next()
                    continue

                # store abstract text for use by mat2vec below
                abstract = record.get('abstract')

                # continues if paper does not have abstract
                if not abstract:
                    unreadable += 1
                    bar.next()
                    continue

                # segments abstract by sentence
                doc = self.nlp(abstract)
                sentences = []
                is_unreadable = False

                # processes sentence text using mat2vec processor
                for sent in doc.sents:
                    try:
                        tokens, materials = self.processor.process(sent.text)
                    except OverflowError:
                        is_unreadable = True
                        break
                    processed_sent = ' '.join(tokens)
                    sentences.append(processed_sent)

                # if processor (from above) throws an error, skip the paper
                if is_unreadable:
                    unreadable += 1
                    bar.next()
                    continue

                processed_abstract = '\n'.join(sentences)

                # create new document and store new article document if not in collection
                article = {
                    'doi': record.get('doi'),
                    'uid': None,
                    'title': record.get('title'),
                    'abstract': abstract,
                    'url': self._get_url(record.get('url')),
                    'creators': self._get_creators(record.get('creators')),
                    'publication_name': record.get('publicationName'),
                    'issn': record.get('issn'),
                    'eissn': record.get('eIssn'),
                    'publication_date': self._get_date(record.get('publicationDate')),
                    'database': 'springer',
                    'processed_abstract': processed_abstract
                }
                articles.append(article)
                abstracts.append(processed_abstract)
                bar.next()

                # classify abstracts if 20000 have been stored
                if len(abstracts) == 20000:
                    self._store(articles, abstracts)
                    articles = []
                    abstracts = []

        # 100 items per page, so go to next page
        item += 100
    bar.finish()

    # unreadable papers
    print(f'No DOI: {no_doi}')
    print(f'Unreadable papers: {unreadable}')

    # classifies and stores metadata
    if abstracts:
        self._store(articles, abstracts)
        print()
    else:
        print('No abstracts to classify.\n')
        return

    # prints classifier metrics
    for classifier in self._classifiers:
        classifier.print_metrics()
        classifier.reset_metrics()

    # prints general tag metrics
    if self._save:
        print(f'Total articles analyzed: {self._gen_total}.')
        print(
            f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
        )
        print()
        self._gen_new = 0
        self._gen_total = 0
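# The Springer scraper above delegates URL construction to self._url_builder,
# which is not shown in this excerpt.  The sketch below is one plausible way to
# build such a URL against the public Springer Nature Meta API (v2); the
# parameter names follow that API (q, s, p, api_key), but the exact format the
# author's _url_builder produces is an assumption.
from urllib.parse import quote

SPRINGER_API_KEY = 'your-api-key'   # placeholder

def build_springer_url(item, subject='', keyword='', page_size=100):
    """Return a Meta API request URL for the given subject/keyword constraints."""
    constraints = []
    if subject:
        constraints.append(f'subject:"{subject}"')
    if keyword:
        constraints.append(f'keyword:"{keyword}"')
    query = quote(' '.join(constraints))
    # s is the 1-based index of the first record to return, p the page size
    return (f'https://api.springernature.com/meta/v2/json?q={query}'
            f'&s={item + 1}&p={page_size}&api_key={SPRINGER_API_KEY}')

# Example: first page of results about perovskites within Chemistry
url = build_springer_url(0, subject='Chemistry', keyword='perovskite')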
def scrape(self, query):
    """
    Scrapes metadata of Elsevier (ScienceDirect) articles returned by query,
    processes abstracts, and stores relevant articles

    :param query: Elsevier database query
    """
    print(
        f'Collection: {self._collection.database.name}.{self._collection.name}. Database: Elsevier. Query: {query}.'
    )

    # creates search url
    url = f'https://api.elsevier.com/content/search/sciencedirect?query={query}&apiKey={ELSEVIER_API_KEY}&httpAccept=application%2Fjson'

    # gets dois
    dois = []
    item = 0
    total = 5000

    # progress bar
    bar = ChargingBar('Getting DOIs:',
                      max=total,
                      suffix='%(index)d of %(max)d - %(elapsed_td)s')

    while item < total:
        response = requests.get(url)
        if response.ok:
            data = json.loads(response.content)['search-results']

            # updates total to total number of papers in query
            if item == 0:
                total = min(5000, int(data['opensearch:totalResults']))
                bar.max = total

            # stores dois
            for entry in data['entry']:
                doi = entry.get('prism:doi')
                if doi:
                    dois.append(doi)
                bar.next()

            # sets url to next page in search
            url = data['link'][-2]['@href']

        # json file has 25 items per page, so go to the next page
        item += 25
    bar.finish()

    # metadata
    articles = []
    abstracts = []
    unreadable = 0

    if not dois:
        print('No abstracts to classify.\n')
        return

    # progress bar
    bar = ChargingBar('Getting metadata:',
                      max=len(dois),
                      suffix='%(index)d of %(max)d - %(elapsed_td)s')

    for doi in dois:
        url = f'https://api.elsevier.com/content/article/doi/{doi}?apiKey={ELSEVIER_API_KEY}&httpAccept=application%2Fjson'
        response = requests.get(url)
        if response.ok:
            try:
                data = json.loads(
                    response.content
                )['full-text-retrieval-response']['coredata']
            except json.decoder.JSONDecodeError:
                unreadable += 1
                bar.next()
                continue

            # store abstract text for use by mat2vec below
            abstract = data.get('dc:description')

            # continues if paper does not have abstract
            if not abstract:
                unreadable += 1
                bar.next()
                continue

            # segments abstract by sentence
            doc = self.nlp(abstract)
            sentences = []
            is_unreadable = False

            # processes sentence text using processor from mat2vec
            for sent in doc.sents:
                try:
                    tokens, materials = self.processor.process(sent.text)
                except OverflowError:
                    is_unreadable = True
                    break
                processed_sent = ' '.join(tokens)
                sentences.append(processed_sent)

            # if processor (from above) throws an error, skip the paper
            if is_unreadable:
                bar.next()
                unreadable += 1
                continue

            processed_abstract = '\n'.join(sentences)

            article = {
                'doi': doi,
                'uid': None,
                'title': data.get('dc:title'),
                'abstract': abstract,
                'url': data.get('prism:url'),
                'creators': self._get_creators(data.get('dc:creator')),
                'publication_name': data.get('prism:publicationName'),
                'issn': data.get('prism:issn'),
                'publication_date': self._get_date(data.get('prism:coverDate')),
                'database': 'elsevier',
                'processed_abstract': processed_abstract,
            }
            articles.append(article)
            abstracts.append(processed_abstract)
            bar.next()
    bar.finish()

    # unreadable papers
    print(f'Unreadable papers: {unreadable}')

    # classifies and stores metadata
    if abstracts:
        self._store(articles, abstracts)
        print()
    else:
        print('No abstracts to classify.\n')
        return

    # prints classifier metrics
    for classifier in self._classifiers:
        classifier.print_metrics()
        classifier.reset_metrics()

    # prints general tag metrics
    if self._save:
        print(f'Total articles analyzed: {self._gen_total}.')
        print(
            f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
        )
        print()
        self._gen_new = 0
        self._gen_total = 0