def validate(path):
    print("Validating on the basis of data from: " + path)
    with open(path) as fp:
        entries = list(readris(fp))
    titles = [entry['primary_title'] if 'primary_title' in entry else ""
              for entry in entries]
    abstracts = [entry['abstract'] if 'abstract' in entry else ""
                 for entry in entries]
    # joining so that the keyword list becomes a sentence
    keywords = [" ".join(entry['keywords']) if 'keywords' in entry else ""
                for entry in entries]
    IDs = [entry['id'] if 'id' in entry else "" for entry in entries]
    # default as a float (the original used the string "2020", which would
    # have mixed types in the column)
    Date_Float = [float(entry['publication_year'].replace("//", ""))
                  if 'publication_year' in entry else 2020.0
                  for entry in entries]
    # make title + abstract + keywords a single searchable entity
    ti_abs_kw = [entry[0] + " " + entry[1] + " " + entry[2]
                 for entry in zip(titles, abstracts, keywords)]
    df = pd.DataFrame(list(zip(IDs, titles, Date_Float, abstracts, keywords, ti_abs_kw)),
                      columns=['ID', 'Title', 'Date_Float', 'Abstract', 'Keywords', 'tiAbsKw'])
    print("Validating {} search results...".format(len(titles)))
    search_df(df, date_max=2020.03, date_min=2005.00, field="tiAbsKw")

# validate("C:\\Users\\xf18155\\OneDrive - University of Bristol\\MyFiles-Migrated\\Documents\\SR automation review\\Search\\ris (6).ris")
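# Aside: the repeated `entry['x'] if 'x' in entry else ""` pattern above is
# equivalent to dict.get() with a default. A minimal self-contained sketch of
# the same extraction (the sample entries here are illustrative only):
entries = [{'primary_title': 'A title', 'keywords': ['k1', 'k2']}, {}]
titles = [entry.get('primary_title', "") for entry in entries]
keywords = [" ".join(entry.get('keywords', [])) for entry in entries]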
def test_parse_multiline_ris(self):
    filepath = os.path.join(CURRENT_DIR, 'multiline.ris')
    result_entry = self.nice_keys({
        'TY': 'JOUR',
        'AU': ['Shannon,Claude E.'],
        'PY': '1948/07//',
        'TI': 'A Mathematical Theory of Communication',
        'JF': 'Bell System Technical Journal',
        'N2': 'first line, then second line and at the end the last line',
        'N1': ['first line', '* second line', '* last line'],
        'SP': '379',
        'EP': '423',
        'VL': '27',
    })
    with open(filepath, 'r') as f:
        entries = list(readris(f))
    self.compare([result_entry], entries)
def read_ris(fp, labels=None):
    """RIS file reader.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.
    labels: bool
        Check for labels. If None, this is automatic.

    Returns
    -------
    list:
        List with entries.
    """
    # Build a map of the tags; copy so the module-level TAG_KEY_MAPPING is
    # not mutated across calls (the original assigned it by reference).
    mapping = dict(TAG_KEY_MAPPING)
    if labels:
        mapping[RIS_KEY_LABEL_INCLUDED] = NAME_LABEL_INCLUDED
    with open(fp, 'r') as bibliography_file:
        entries = list(readris(bibliography_file, mapping=mapping))
    return entries
def __init__(self, f):
    if isinstance(f, str):
        f = open(f, "r")
    reader = readris(f, mapping=self.get_mapping())
    contents = [content for content in reader]
    f.close()
    self.raw_references = contents
def __init__(self, fileObj):
    # the original checked `basestring`, which is Python 2 only; `str` is the
    # Python 3 equivalent here
    if isinstance(fileObj, str):
        f = open(fileObj, 'r')
    else:
        f = fileObj
    reader = readris(f, mapping=self.get_mapping())
    contents = [content for content in reader]
    f.close()
    self.raw_references = contents
def file_readable(cls, fileObj):
    # ensure that file can be successfully parsed
    try:
        reader = readris(fileObj, mapping=cls.get_mapping())
        [content for content in reader]
        fileObj.seek(0)
        return True
    except IOError:
        return False
def file_readable(cls, f):
    # ensure that file can be successfully parsed
    try:
        reader = readris(f, mapping=cls.get_mapping())
        [content for content in reader]
        f.seek(0)
        return True
    except IOError as err:
        logging.warning(err)
        return False
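# Hypothetical usage of the classmethod above; `RefImporter` is an assumed
# name for the class that defines get_mapping() / file_readable() / __init__:
with open("refs.ris") as f:
    if RefImporter.file_readable(f):
        # the file position was reset by seek(0), so parsing can start over
        importer = RefImporter(f)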
def test_parse_example_full_ris(self):
    filepath = os.path.join(CURRENT_DIR, 'example_full.ris')
    entry1 = {
        'TY': 'JOUR',
        'ID': '12345',
        'T1': 'Title of reference',
        'A1': ['Marx, Karl', 'Lindgren, Astrid'],
        'A2': ['Glattauer, Daniel'],
        'Y1': '2014//',
        'N2': 'BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.',
        'KW': ['Pippi', 'Nordwind', 'Piraten'],
        'JF': 'Lorem',
        'JA': 'lorem',
        'VL': '9',
        'IS': '3',
        'SP': 'e0815',
        'CY': 'United States',
        'PB': 'Fun Factory',
        'SN': '1932-6208',
        'M1': '1008150341',
        'L2': 'http://example.com',
        'UR': 'http://example_url.com',
    }
    entry2 = {
        'TY': 'JOUR',
        'ID': '12345',
        'T1': 'The title of the reference',
        'A1': ['Marxus, Karlus', 'Lindgren, Astrid'],
        'A2': ['Glattauer, Daniel'],
        'Y1': '2006//',
        'N2': 'BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.',
        'KW': ['Pippi Langstrumpf', 'Nordwind', 'Piraten'],
        'JF': 'Lorem',
        'JA': 'lorem',
        'VL': '6',
        'IS': '3',
        'SP': 'e0815341',
        'CY': 'Germany',
        'PB': 'Dark Factory',
        'SN': '1732-4208',
        'M1': '1228150341',
        'L2': 'http://example2.com',
        'UR': 'http://example_url.com',
    }
    results = self.nice_list([entry1, entry2])
    with open(filepath, 'r') as bibliography_file:
        entries = list(readris(bibliography_file))
    self.compare(results, entries)
def create_llrs(filename):
    curated_filename = curate_RIS_MDI_file(filename)
    # initialize the result set (missing in the original; its Python 2 twin
    # below defines it the same way)
    llrs = set()
    with open(curated_filename, 'r', encoding="utf-8") as bibliography_file:
        # using https://pypi.python.org/pypi/RISparser
        entries = readris(bibliography_file)
        for entry in entries:
            llrs.add(create_llr_from_RIS(entry))
    print("**********************************")
    print("new records (Set)= %d" % (len(llrs)))
    return llrs
def setup(self):
    catalog_a = self.doc.find('a', href=self.CATALOG)
    if catalog_a:
        catalog_num = self.CATALOG.search(catalog_a['href']).group(1)
        ris_url = 'https://catalog.hathitrust.org/Search/SearchExport?handpicked={}&method=ris'.format(
            catalog_num)
        response = requests.get(
            ris_url, headers={"User-Agent": settings.USER_AGENT})
        records = readris(response.text.splitlines()) if response.status_code == 200 else []
        # keep the first record, if any
        for record in records:
            self.record = record
            return
    self.record = {}
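# readris accepts any iterable of RIS-formatted lines, not just file objects,
# which is why the splitlines() call above works. A minimal sketch parsing an
# in-memory string (the record content is illustrative):
import io
ris_text = "TY  - JOUR\nTI  - Example title\nER  - \n"
records = list(readris(io.StringIO(ris_text)))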
def create_llrs(filename):
    curated_filename = curate_RIS_MDI_file(filename)
    llrs = set()
    with open(curated_filename, 'r') as bibliography_file:
        # using https://pypi.python.org/pypi/RISparser
        entries = readris(bibliography_file)
        for entry in entries:
            llrs.add(create_llr_from_RIS(entry))
    # the original used Python 2 print statements
    print("**********************************")
    print("new records (Set)= %d" % (len(llrs)))
    return llrs
def parse(self, response):
    """Parse .ris file and submit each article to the DynamoDB table."""
    filepath = 'Abstracts.ris'
    with open(filepath, 'r', encoding="utf-8") as bibliography_file:
        entries = readris(bibliography_file)
        for entry in entries:
            # Map entries into articles.
            article = Article()
            if 'accession_number' in entry:
                # strip all whitespace from the accession number (the original
                # pattern '[\s+]' also stripped literal '+' characters)
                article['id'] = re.sub(r'\s+', '', entry['accession_number'])
            # Add title
            if 'translated_title' in entry:
                article['title'] = entry['translated_title']
            else:
                article['title'] = entry['title']
            # Add authors
            if 'authors' in entry:
                article['authors'] = entry['authors']
            # Add abstract
            if 'abstract' in entry:
                article['abstract'] = entry['abstract']
            # Add year of publishing
            if 'year' in entry:
                article['release_date'] = entry['year']
            # Add type of article
            if 'type_of_reference' in entry:
                article['article_type'] = entry['type_of_reference']
            # Add keywords
            if 'keywords' in entry:
                article['keywords'] = entry['keywords']
            # Add the time last fetched by the bot.
            article['last_update'] = int(time.mktime(self.now.timetuple()))
            # This pushes the item through the pipeline.
            yield article
def test_parse_example_basic_ris(self):
    filepath = os.path.join(CURRENT_DIR, 'example_basic.ris')
    result_entry = self.nice_keys({
        'TY': 'JOUR',
        'AU': ['Shannon,Claude E.'],
        'PY': '1948/07//',
        'TI': 'A Mathematical Theory of Communication',
        'JF': 'Bell System Technical Journal',
        'SP': '379',
        'EP': '423',
        'VL': '27',
    })
    with open(filepath, 'r') as bibliography_file:
        entries = list(readris(bibliography_file))
    self.compare([result_entry], entries)
def read_ris(fp):
    """RIS file reader.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.

    Returns
    -------
    pandas.DataFrame:
        Standardized dataframe with the entries.
    """
    encodings = ['ISO-8859-1', 'utf-8', 'utf-8-sig']
    entries = None
    for encoding in encodings:
        try:
            with open(fp, 'r', encoding=encoding) as bibliography_file:
                mapping = _tag_key_mapping(reverse=False)
                entries = list(readris(bibliography_file, mapping=mapping))
                break
        except UnicodeDecodeError:
            pass
        except IOError as e:
            logging.warning(e)
    if entries is None:
        raise ValueError("Cannot find proper encoding for data file.")

    df = pd.DataFrame(entries)

    def converter(x):
        try:
            return ", ".join(x)
        except TypeError:
            return ""

    # flatten list-type tags into comma-separated strings
    for tag in LIST_TYPE_TAGS:
        key = TAG_KEY_MAPPING[tag]
        if key in df:
            df[key] = df[key].apply(converter)

    return standardize_dataframe(df)
def main(filename):
    with open('cytuj.mustache') as template_file:
        template = template_file.read()
    with open(filename) as bib_file:
        entries = readris(bib_file)
        for e in entries:
            print(e)
            # note: this loop overwrites 'name'/'surname' each pass, so only
            # the last author survives
            for i, a in enumerate(e['authors']):
                surname, name = a.split(",", 1)
                e['name'] = name
                e['surname'] = surname
            e['znaczenie'] = e.get('custom1')
            e['pismo'] = e.get('secondary_name', e.get('journal_name'))
            title = e.get('title', e.get('primary_title'))
            e['title'] = title
            print("{{" + pystache.render(template, e) + "}}")
def read_ris(fp):
    """RIS file reader.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.

    Returns
    -------
    pandas.DataFrame:
        Standardized dataframe with the entries.
    """
    encodings = ['ISO-8859-1', 'utf-8', 'utf-8-sig']
    entries = None
    for encoding in encodings:
        try:
            with open(fp, 'r', encoding=encoding) as bibliography_file:
                mapping = _tag_key_mapping(reverse=False)
                entries = list(readris(bibliography_file, mapping=mapping))
                break
        except (UnicodeDecodeError, IOError):
            pass
    if entries is None:
        raise ValueError("Cannot find proper encoding for data file.")

    df = pd.DataFrame(entries)

    if "keywords" in df:
        def converter(x):
            try:
                return ", ".join(x)
            except TypeError:
                return ""

        df["keywords"] = df["keywords"].apply(converter)

    return standardize_dataframe(df)
def test_parse_multiple_unknown_tags_ris(self):
    filepath = os.path.join(CURRENT_DIR, 'example_multi_unknown_tags.ris')
    unknowns = defaultdict(list)
    unknowns['JP'].append('CRISPR')
    unknowns['DC'].append('Direct Current')
    result_entry = self.nice_keys({
        'TY': 'JOUR',
        'AU': ['Shannon,Claude E.'],
        'PY': '1948/07//',
        'TI': 'A Mathematical Theory of Communication',
        'JF': 'Bell System Technical Journal',
        'SP': '379',
        'EP': '423',
        'VL': '27',
        # {'JP': ['CRISPR'], 'DC': ['Direct Current']}
        'UK': unknowns,
    })
    with open(filepath, 'r') as bibliography_file:
        entries = list(readris(bibliography_file))
    self.compare([result_entry], entries)
def readRIS(filename):
    with open(filename, 'r') as f:
        entries = readris(f)
        res = []
        for entry in entries:
            entry['author'] = authorListFromListOfAuthors(entry.get('authors', []))
            if 'authors' in entry:
                del entry['authors']
            new_type = 'article'
            if entry.get('type_of_reference'):
                if entry['type_of_reference'] in reverse_type_mapping:
                    new_type = reverse_type_mapping[entry['type_of_reference']]
            entry['ENTRYTYPE'] = new_type
            entry = fixBibData(entry, 0)
            res.append(entry)
    return res
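# `reverse_type_mapping` above is defined elsewhere in the source; a
# hypothetical minimal stand-in mapping RIS reference types (TY values) to
# BibTeX entry types, for illustration only:
reverse_type_mapping = {
    'JOUR': 'article',
    'BOOK': 'book',
    'CHAP': 'inbook',
    'CONF': 'inproceedings',
}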
def test_parse_example_basic_ris(self):
    mapping = TAG_KEY_MAPPING
    filedirpath = os.path.dirname(os.path.realpath(__file__))
    filepath = filedirpath + '/example_basic.ris'
    ristags = [
        {'TY': 'JOUR'},
        {'AU': ['Shannon,Claude E.']},
        {'PY': '1948/07//'},
        {'TI': 'A Mathematical Theory of Communication'},
        {'JF': 'Bell System Technical Journal'},
        {'SP': '379'},
        {'EP': '423'},
        {'VL': '27'},
    ]
    with open(filepath, 'r') as bibliography_file:
        entries = list(readris(bibliography_file))
    assert len(entries)
    for ristag in ristags:
        k, v = ristag.popitem()
        k = mapping[k]
        if isinstance(entries[0][k], list):
            assert ''.join(v) == ''.join(entries[0][k])
        else:
            assert v == entries[0][k].strip()
def read_ris(fp):
    """RIS file reader.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.

    Returns
    -------
    list:
        List with entries.
    """
    with open(fp, 'r') as bibliography_file:
        mapping = _tag_key_mapping(reverse=False)
        entries = list(readris(bibliography_file, mapping=mapping))
    return entries
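# Minimal usage sketch for the reader above; the path is an assumption and
# the keys available on each entry depend on _tag_key_mapping:
entries = read_ris("references.ris")
for entry in entries:
    print(entry.get("title", "<no title>"))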
def test_parse_example_full_ris(self):
    mapping = TAG_KEY_MAPPING
    filedirpath = os.path.dirname(os.path.realpath(__file__))
    filepath = filedirpath + '/example_full.ris'
    ristags = [
        {'TY': 'JOUR'},
        {'ID': '12345'},
        {'T1': 'Title of reference'},
        {'A1': ['Marx, Karl', 'Lindgren, Astrid']},
        {'A2': ['Glattauer, Daniel']},
        {'Y1': '2014//'},
        {'N2': 'BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.'},
        {'KW': ['Pippi', 'Nordwind', 'Piraten']},
        {'JF': 'Lorem'},
        {'JA': 'lorem'},
        {'VL': '9'},
        {'IS': '3'},
        {'SP': 'e0815'},
        {'CY': 'United States'},
        {'PB': 'Fun Factory'},
        {'SN': '1932-6208'},
        {'M1': '1008150341'},
        {'L2': 'http://example.com'},
        {'UR': 'http://example_url.com'},
    ]
    with open(filepath, 'r') as bibliography_file:
        entries = list(readris(bibliography_file))
    assert len(entries) == 2
    for ristag in ristags:
        k, v = ristag.popitem()
        k = mapping[k]
        assert k in entries[0]
        # the original checked `types.ListType`, which is Python 2 only;
        # `list` is the Python 3 equivalent
        if isinstance(entries[0][k], list):
            assert ''.join(v) == ''.join(entries[0][k])
        else:
            assert v == entries[0][k].strip()
def test_parse_example_basic_ris(self):
    mapping = TAG_KEY_MAPPING
    filedirpath = os.path.dirname(os.path.realpath(__file__))
    filepath = filedirpath + '/example_basic.ris'
    ristags = [
        {'TY': 'JOUR'},
        {'AU': ['Shannon,Claude E.']},
        {'PY': '1948/07//'},
        {'TI': 'A Mathematical Theory of Communication'},
        {'JF': 'Bell System Technical Journal'},
        {'SP': '379'},
        {'EP': '423'},
        {'VL': '27'},
    ]
    with open(filepath, 'r') as bibliography_file:
        entries = list(readris(bibliography_file))
    assert len(entries)
    for ristag in ristags:
        k, v = ristag.popitem()
        k = mapping[k]
        # the original checked `types.ListType`, which is Python 2 only;
        # `list` is the Python 3 equivalent
        if isinstance(entries[0][k], list):
            assert ''.join(v) == ''.join(entries[0][k])
        else:
            assert v == entries[0][k].strip()
def convert():
    parser = BibTexParser(common_strings=False)
    parser.ignore_nonstandard_types = False
    parser.homogenize_fields = False
    if request.method == 'POST':
        file = request.files['filer_input']
        if file.filename == '':
            flash('no selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            if file and file_bib(file.filename):
                bibtex_file = file.stream
                bibtex_str = bibtex_file.read()
                bib_database = bibtexparser.loads(bibtex_str, parser)
                df = pd.DataFrame(bib_database.entries)
                df.index += 1
                return render_template_string(
                    html, filenm=file.filename,
                    table=df.to_html(index='Nomor', header='true', table_id='example',
                                     classes='table table-striped table-bordered'))
            elif file and file_ris(file.filename):
                # wrap the uploaded byte stream so readris sees text lines
                ris_file = file.stream.read()
                reader = io.BytesIO(ris_file)
                wrapper = io.TextIOWrapper(reader, encoding='utf-8')
                entries = readris(wrapper)
                df = pd.DataFrame(entries)
                df['publication_year'] = df['publication_year'].apply(remove_punch)
                df.index += 1
                return render_template_string(
                    html, filenm=file.filename,
                    table=df.to_html(header='true', table_id='example',
                                     classes='table table-striped table-bordered'))
            else:
                data = xmltodict.parse(file)
                json_data = json.dumps(data)
                wanda = pd.read_json(StringIO(json_data))
                vision = wanda["xml"]['records']['record']
                axel = pd.DataFrame(vision)
                # both branches of the original index check did the same
                # thing: drop the 'database' column; checking membership
                # avoids a KeyError when the column is absent
                if 'database' in axel:
                    del axel['database']
                # get the type value
                type_data = []
                for i, each in enumerate(axel['ref-type']):
                    if '@name' in axel['ref-type'][i].keys():
                        try:
                            data_type = axel['ref-type'][i]['@name']
                            type_data.append(data_type)
                        except Exception as e:
                            print(e)
                axel['ref-type'] = type_data
                # get the author value
                authors = []
                for i, each in enumerate(axel['contributors']):
                    if 'authors' in axel['contributors'][i].keys():
                        try:
                            data_author = axel['contributors'][i]['authors']['author']
                            authors.append(data_author)
                        except Exception as e:
                            print(e)
                axel['contributors'] = authors
                # get the title
                titles = []
                for i, each in enumerate(axel['titles']):
                    title_data = axel['titles'][i].values()
                    titles.append(title_data)
                axel['titles'] = titles
                # get the full title
                periodical = []
                for i, each in enumerate(axel['periodical']):
                    hulk = axel['periodical'][i]
                    if hulk is not None:
                        periodical_data = hulk.values()
                        periodical.append(periodical_data)
                axel['periodical'] = periodical
                # get keywords (disabled in the original)
                # keyw = []
                # for i, each in enumerate(axel['keywords']):
                #     hawk_eye = axel['keywords'][i]
                #     if hawk_eye is not None:
                #         keyw_data = hawk_eye.values()
                #         keyw.append(keyw_data)
                # axel['keywords'] = keyw
                axel.index += 1
                return render_template_string(
                    html, filenm=file.filename,
                    table=axel.to_html(header='true', table_id='example',
                                       classes='table table-striped table-bordered'))
        else:
            return render_template('output1.html')
    return render_template('index.html')
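# Design note: the BytesIO/TextIOWrapper round-trip above only exists to turn
# the uploaded byte stream back into text lines for readris; an equivalent,
# arguably simpler sketch under the same utf-8 assumption:
wrapper = io.StringIO(file.stream.read().decode('utf-8'))
entries = readris(wrapper)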
from RISparser import readris, config

path_old = 'databases2015.ris'
path_new = 'databases2017.ris'

overlap_entries = 0
new_entries = []

with open(path_old, 'r') as bibfile_old, open(path_new, 'r') as bibfile_new:
    entries_old = list(readris(bibfile_old))
    entries_new = list(readris(bibfile_new))

print('Entries in old file: %d' % len(entries_old))
print('Entries in new file: %d' % len(entries_new))

for entry in entries_new:
    if len([x for x in entries_old if x == entry]):
        overlap_entries = overlap_entries + 1
    else:
        new_entries.append(entry)

print('Overlapping entries: %d' % overlap_entries)
print('New entries: %d' % len(new_entries))

# invert the tag-to-key mapping so entry keys map back to RIS tags
tag_key_mapping = dict(
    zip(config.TAG_KEY_MAPPING.values(), config.TAG_KEY_MAPPING.keys()))

with open('new_entries.ris', 'w') as out:
    # the original writes only a sample slice of the new entries
    for entry in new_entries[1:5]:
        # the original used Python 2's iteritems(); items() is the
        # Python 3 spelling
        for key, value in entry.items():
            if key in tag_key_mapping:
                # the original snippet breaks off here; a plausible
                # completion writes each field back as an RIS
                # "TAG  - value" line:
                out.write('%s  - %s\n' % (tag_key_mapping[key], value))
# the original imported Set from the Python 2 only `sets` module; the
# built-in set covers it. readris is imported as in the companion script.
from RISparser import readris

path_old = 'databases2015.ris'
path_new = 'databases2017.ris'

overlap_entries = 0
new_entries = []


class hashabledict(dict):
    def __hash__(self):
        return hash(frozenset(self))


with open(path_old, 'r') as bibfile_old, open(path_new, 'r') as bibfile_new:
    entries_old = set(hashabledict(x) for x in readris(bibfile_old))
    entries_new = [hashabledict(x) for x in readris(bibfile_new)]

print('Entries in old file: %d' % len(entries_old))
print('Entries in new file: %d' % len(entries_new))

for entry in entries_new:
    if entry in entries_old:
        overlap_entries = overlap_entries + 1
    else:
        new_entries.append(entry)

print('Overlapping entries: %d' % overlap_entries)
print('New entries: %d' % len(new_entries))

# The original breaks off here; the companion script above completes it as:
# tag_key_mapping = dict(
#     zip(config.TAG_KEY_MAPPING.values(), config.TAG_KEY_MAPPING.keys()))
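# Design note: hash(frozenset(self)) hashes only the keys, so entries sharing
# tags but differing in values collide (set membership still works, because
# equality is checked after hashing). A hedged alternative that also folds in
# the values, tuple-izing lists so they are hashable:
class hashabledict_v2(dict):
    def __hash__(self):
        return hash(tuple(sorted(
            (k, tuple(v) if isinstance(v, list) else v)
            for k, v in self.items()
        )))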
def import_naive_results(self, path, save_dataset=False, save_directory=None,
                         clean_dataset=False):
    """
    This method imports the search results from a specified path.

    :param clean_dataset: if TRUE, de-duplicates search results after importing
    :param save_dataset: if TRUE, saves the full search results to a .csv
    :param save_directory: the path to a directory where search results will be
        saved if save_dataset is set to TRUE
    :param path: path containing the naive search results files
    :return: a pandas data frame consisting of assembled search results
    """
    wos_li = []
    scopus_li = []
    jstor_li = []

    # import all files in the directory, automatically detecting the source
    # database from the file-name extension [supports WoS .txt, Scopus .csv
    # and JSTOR .ris exports]
    all_files = glob.glob(path + "*")
    for filename in all_files:
        if filename.endswith('.txt'):
            df = pd.read_csv(filename, index_col=False, delimiter="\t",
                             quotechar=None, quoting=3, encoding='utf-8')
            wos_li.append(df)
        elif filename.endswith('.csv'):
            df = pd.read_csv(filename)
            scopus_li.append(df)
        elif filename.endswith('.ris'):
            # strip provider header lines that break the RIS parser
            with open(filename, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            with open(filename, "w", encoding='utf-8') as f:
                strings = ("Provider", "Database", "Content")
                for line in lines:
                    if not any(s in line for s in strings):
                        f.write(line)
            with open(filename, 'r', encoding='utf-8') as bibliography_file:
                entries = list(readris(bibliography_file))
                for entry in entries:
                    jstor_li.append(entry)

    if len(wos_li) != 0:  # check whether any Web of Science results were imported
        # concatenate files from the same database into one DataFrame
        wos_dataset = pd.concat(wos_li, axis=0, ignore_index=True)
        # merge article titles and abstracts to form the text from which
        # keywords will be extracted
        text = list(wos_dataset[['AB', 'TI']].apply(
            lambda x: '{}{}'.format(x[0], x[1]), axis=1))
        wos_text = [self.remove_punctuations(t) for t in text]
        self.search_results['id'] = list(wos_dataset['UT'])
        self.search_results['text'] = wos_text
        self.search_results['title'] = list(wos_dataset['TI'])
        self.search_results['abstract'] = list(wos_dataset['AB'])
        self.search_results['keywords'] = list(wos_dataset['DE'])
        self.search_results['type'] = list(wos_dataset['DT'])
        self.search_results['authors'] = list(wos_dataset['AU'])
        self.search_results['affiliation'] = list(wos_dataset['C1'])
        self.search_results['source'] = list(wos_dataset['SO'])
        self.search_results['year'] = list(wos_dataset['PY'])
        self.search_results['volume'] = list(wos_dataset['VL'])
        self.search_results['issue'] = list(wos_dataset['IS'])
        self.search_results['startpage'] = list(wos_dataset['BP'])
        self.search_results['endpage'] = list(wos_dataset['EP'])
        self.search_results['doi'] = list(wos_dataset['DI'])
        self.search_results['language'] = list(wos_dataset['LA'])
        self.search_results['database'] = ['WOS'] * len(wos_dataset.index)

    if len(scopus_li) != 0:  # check whether any Scopus results were imported
        # concatenate files from the same database into one DataFrame
        scopus_dataset = pd.concat(scopus_li, axis=0, ignore_index=True)
        text = list(scopus_dataset[['Abstract', 'Title']].apply(
            lambda x: '{}{}'.format(x[0], x[1]), axis=1))
        scopus_text = [self.remove_punctuations(t) for t in text]
        self.search_results['id'] = self.search_results['id'] + list(
            scopus_dataset['EID'])
        self.search_results['text'] = self.search_results['text'] + scopus_text
        self.search_results['title'] = self.search_results['title'] + list(
            scopus_dataset['Title'])
        self.search_results['abstract'] = self.search_results['abstract'] + list(
            scopus_dataset['Abstract'])
        self.search_results['keywords'] = self.search_results['keywords'] + list(
            scopus_dataset['Author Keywords'])
        self.search_results['type'] = self.search_results['type'] + list(
            scopus_dataset['Document Type'])
        self.search_results['authors'] = self.search_results['authors'] + list(
            scopus_dataset['Authors'])
        self.search_results['affiliation'] = self.search_results['affiliation'] + list(
            scopus_dataset['Affiliations'])
        self.search_results['source'] = self.search_results['source'] + list(
            scopus_dataset['Source'])
        self.search_results['year'] = self.search_results['year'] + list(
            scopus_dataset['Year'])
        self.search_results['volume'] = self.search_results['volume'] + list(
            scopus_dataset['Volume'])
        self.search_results['issue'] = self.search_results['issue'] + list(
            scopus_dataset['Issue'])
        self.search_results['startpage'] = self.search_results['startpage'] + list(
            scopus_dataset['Page start'])
        self.search_results['endpage'] = self.search_results['endpage'] + list(
            scopus_dataset['Page end'])
        self.search_results['doi'] = self.search_results['doi'] + list(
            scopus_dataset['DOI'])
        self.search_results['language'] = self.search_results['language'] + (
            ['English'] * len(scopus_dataset.index))
        self.search_results['database'] = self.search_results['database'] + list(
            scopus_dataset['Source'])

    if len(jstor_li) != 0:  # check whether any JSTOR results were imported
        # concatenate entries from the same database into one DataFrame
        jstor_dataset = pd.DataFrame(jstor_li)
        i = 0
        for a, b in zip(jstor_dataset.abstract, jstor_dataset.authors):
            jstor_dataset.at[i, 'abstract'] = str(a).translate(
                {ord(c): '' for c in self.bad_chars})
            jstor_dataset.at[i, 'authors'] = str(b).translate(
                {ord(c): '' for c in self.bad_chars})
            i = i + 1
        text = list(jstor_dataset[['abstract', 'title']].apply(
            lambda x: '{}{}'.format(x[0], x[1]), axis=1))
        jstor_text = [self.remove_punctuations(t) for t in text]
        self.search_results['id'] = self.search_results['id'] + list(
            jstor_dataset['issn'])
        self.search_results['text'] = self.search_results['text'] + jstor_text
        self.search_results['title'] = self.search_results['title'] + list(
            jstor_dataset['title'])
        self.search_results['abstract'] = self.search_results['abstract'] + list(
            jstor_dataset['abstract'])
        self.search_results['keywords'] = self.search_results['keywords'] + (
            ['NaN'] * len(jstor_dataset.index))
        self.search_results['type'] = self.search_results['type'] + list(
            jstor_dataset['type_of_reference'])
        # note: the original fills 'authors' from 'type_of_reference' here,
        # which looks like a copy-paste slip for jstor_dataset['authors']
        self.search_results['authors'] = self.search_results['authors'] + list(
            jstor_dataset['type_of_reference'])
        self.search_results['affiliation'] = self.search_results['affiliation'] + (
            ['NaN'] * len(jstor_dataset.index))
        self.search_results['source'] = self.search_results['source'] + list(
            jstor_dataset['name_of_database'])
        self.search_results['year'] = self.search_results['year'] + list(
            jstor_dataset['year'])
        self.search_results['volume'] = self.search_results['volume'] + list(
            jstor_dataset['volume'])
        self.search_results['issue'] = self.search_results['issue'] + list(
            jstor_dataset['number'])
        self.search_results['startpage'] = self.search_results['startpage'] + list(
            jstor_dataset['start_page'])
        self.search_results['endpage'] = self.search_results['endpage'] + list(
            jstor_dataset['end_page'])
        self.search_results['doi'] = self.search_results['doi'] + list(
            jstor_dataset['doi'])
        self.search_results['language'] = self.search_results['language'] + (
            ['English'] * len(jstor_dataset.index))
        self.search_results['database'] = self.search_results['database'] + list(
            jstor_dataset['name_of_database'])

    # merge all search database results into a dataframe
    data_frame = pd.DataFrame.from_dict(self.search_results)
    if save_dataset is True:
        if save_directory is not None:
            data_frame.to_csv(r"{}".format(save_directory + 'imported_results.csv'),
                              index=False)
        else:
            data_frame.to_csv('data/IMPORTED/imported_results.csv', index=False)
    if clean_dataset is True:
        data_frame = self.deduplicate_dataframe(data_frame, self.columns)
    return data_frame
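# Hypothetical call, assuming the containing class is instantiated as
# `search` and the directory holds WoS .txt, Scopus .csv and JSTOR .ris files:
# results = search.import_naive_results("data/RAW/", save_dataset=True,
#                                       save_directory="data/PROCESSED/")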
def test_starting_newline(self):
    fn = os.path.join(CURRENT_DIR, 'example_starting_newlines.ris')
    with open(fn, 'r') as f:
        entries = list(readris(f))
    assert len(entries) == 1
def test_parse_wos_ris(self):
    fn = os.path.join(CURRENT_DIR, 'example_wos.ris')
    with open(fn, 'r') as f:
        # wok=True switches readris to the Web of Science / WoK tag format
        entries = list(readris(f, wok=True))
    assert len(entries) == 2
def load_entries_from_ris(ris_fpath):
    ris_fpath = fnmstr(ris_fpath)
    with open(ris_fpath, "r") as read:
        # materialize the generator before the file closes; returning the
        # lazy readris iterator (as the original did) would fail once
        # `read` is closed
        entries = list(readris(read))
    return entries
def test_parse_example_full_ris(self):
    mapping = TAG_KEY_MAPPING
    filedirpath = os.path.dirname(os.path.realpath(__file__))
    filepath = filedirpath + '/example_full.ris'
    ristags = [
        {'TY': 'JOUR'},
        {'ID': '12345'},
        {'T1': 'Title of reference'},
        {'A1': ['Marx, Karl', 'Lindgren, Astrid']},
        {'A2': ['Glattauer, Daniel']},
        {'Y1': '2014//'},
        {'N2': 'BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.'},
        {'KW': ['Pippi', 'Nordwind', 'Piraten']},
        {'JF': 'Lorem'},
        {'JA': 'lorem'},
        {'VL': '9'},
        {'IS': '3'},
        {'SP': 'e0815'},
        {'CY': 'United States'},
        {'PB': 'Fun Factory'},
        {'SN': '1932-6208'},
        {'M1': '1008150341'},
        {'L2': 'http://example.com'},
        {'UR': 'http://example_url.com'},
    ]
    with open(filepath, 'r') as bibliography_file:
        entries = list(readris(bibliography_file))
    assert len(entries) == 2
    for ristag in ristags:
        k, v = ristag.popitem()
        k = mapping[k]
        assert k in entries[0]
        if isinstance(entries[0][k], list):
            assert ''.join(v) == ''.join(entries[0][k])
        else:
            assert v == entries[0][k].strip()
def read_ris(q, update):
    r_count = 0
    changed = False
    encoding = None
    # Pre-clean the uploaded file: strip BOM characters, drop the WoS "EF"
    # marker and Ovid footer lines, and pad bare "XX -" tags so the parser
    # accepts them.
    with open("{}/{}".format(settings.MEDIA_ROOT, q.query_file.name), 'r') as f:
        with open("{}/{}_tmp".format(settings.MEDIA_ROOT, q.query_file.name), "w") as ftmp:
            for l in f:
                if "\\ufeff" in repr(l) or "\ufeff" in repr(l):
                    # encoding = 'utf-8-sig'
                    changed = True
                    ftmp.write(l.replace('\\ufeff', '').replace('\ufeff', ''))
                elif l == "EF":
                    changed = True
                elif "Link to the Ovid Full Text or citation:" in l:
                    changed = True
                elif re.compile('^[A-Z][A-Z0-9] -\n').match(l):
                    changed = True
                    ftmp.write(l.replace('-\n', '- \n'))
                else:
                    ftmp.write(l)
    if changed:
        print("opening edited file")
        fpath = "{}/{}_tmp".format(settings.MEDIA_ROOT, q.query_file.name)
    else:
        fpath = "{}/{}".format(settings.MEDIA_ROOT, q.query_file.name)
    with open(fpath, "r", encoding=encoding) as f:
        entries = readris(f, mapping=RIS_KEY_MAPPING)
        try:
            for e in entries:
                if "py" in e:
                    # the original tested `type(e["py"] is str)`, which is
                    # always truthy; isinstance is what was meant
                    if isinstance(e["py"], str):
                        e["py"] = int(e["py"][:4])
                if "unknown_tag" in e:
                    del e["unknown_tag"]
                try:
                    add_scopus_doc(e, q, update)
                    r_count += 1
                except:
                    print(f"couldn't add {e}")
                    # bail out, returning the entry that failed
                    return e
        except:
            # parsing failed part-way through; retry with utf-8-sig
            r_count = 0
            with open(fpath, 'r', encoding='utf-8-sig') as f:
                entries = readris(f, mapping=RIS_KEY_MAPPING)
                for e in entries:
                    if "py" in e:
                        if isinstance(e["py"], str):
                            e["py"] = int(e["py"][:4])
                    if "tc" in e:
                        if isinstance(e["tc"], str):
                            digits = re.findall(r"\d+", e["tc"])
                            if len(digits) > 0:
                                e["tc"] = int(digits[0])
                            else:
                                e["tc"] = None
                    if "unknown_tag" in e:
                        del e["unknown_tag"]
                    try:
                        add_scopus_doc(e, q, update)
                        r_count += 1
                    except:
                        print(f"couldn't add {e}")
                        break
    return r_count
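# The utf-8-sig retry above mirrors the encoding loop in read_ris further up;
# a minimal standalone helper under the same assumptions (the name and the
# encoding order are made up for illustration):
def open_with_fallback(path, encodings=('utf-8', 'utf-8-sig', 'ISO-8859-1')):
    for enc in encodings:
        try:
            with open(path, encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raise ValueError("no suitable encoding found for %s" % path)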