def validate(path):
    """Validate search results read from a RIS file.

    Parameters
    ----------
    path: str
        Path to the RIS file containing the search results.

    Notes
    -----
    Builds a DataFrame with ID/Title/Date_Float/Abstract/Keywords plus a
    combined "tiAbsKw" column, then delegates to ``search_df``.
    """
    print("Validating on the basis of data from: " + path)
    with open(path) as fp:
        entries = list(readris(fp))

    titles = [entry.get('primary_title', "") for entry in entries]
    abstracts = [entry.get('abstract', "") for entry in entries]
    # Keywords come back as a list; join so each record is one searchable
    # sentence.
    keywords = [" ".join(entry['keywords']) if 'keywords' in entry else ""
                for entry in entries]
    IDs = [entry.get('id', "") for entry in entries]
    # Publication years look like "2019//"; strip the slashes and parse.
    # BUG FIX: the fallback used to be the *string* "2020", which made the
    # column mixed str/float and broke numeric date filtering downstream.
    Date_Float = [float(entry['publication_year'].replace("//", ""))
                  if 'publication_year' in entry else 2020.0
                  for entry in entries]
    # Title + abstract + keywords combined into a single searchable entity.
    ti_abs_kw = [" ".join(parts) for parts in zip(titles, abstracts, keywords)]

    df = pd.DataFrame(
        list(zip(IDs, titles, Date_Float, abstracts, keywords, ti_abs_kw)),
        columns=['ID', 'Title', 'Date_Float', 'Abstract', 'Keywords', 'tiAbsKw'])

    print("Validating {} search results...".format(len(titles)))
    search_df(df, date_max=2020.03, date_min=2005.00, field="tiAbsKw")



#validate("C:\\Users\\xf18155\\OneDrive - University of Bristol\\MyFiles-Migrated\\Documents\\SR automation review\\Search\\ris (6).ris")    
 def test_parse_multiline_ris(self):
     """Multiline RIS fields (N2/N1) are folded into a single entry."""
     path = os.path.join(CURRENT_DIR, 'multiline.ris')
     expected = self.nice_keys({
         'TY': 'JOUR',
         'AU': ['Shannon,Claude E.'],
         'PY': '1948/07//',
         'TI': 'A Mathematical Theory of Communication',
         'JF': 'Bell System Technical Journal',
         'N2': 'first line, then second line and at the end the last line',
         'N1': ['first line', '* second line', '* last line'],
         'SP': '379',
         'EP': '423',
         'VL': '27',
     })
     with open(path, 'r') as ris_file:
         parsed = list(readris(ris_file))
         self.compare([expected], parsed)
def read_ris(fp, labels=None):
    """RIS file reader.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.
    labels: bool
        If truthy, also map the inclusion-label RIS tag so screening
        labels are read from the file.

    Returns
    -------
    list:
        List with entries.

    """

    # BUG FIX: copy the shared tag map — the old code added the label key
    # to the module-level TAG_KEY_MAPPING in place, so one labelled call
    # silently changed the mapping for every later caller.
    mapping = dict(TAG_KEY_MAPPING)

    if labels:
        mapping[RIS_KEY_LABEL_INCLUDED] = NAME_LABEL_INCLUDED

    with open(fp, 'r') as bibliography_file:
        entries = list(readris(bibliography_file, mapping=mapping))

    return entries
Example #4
0
 def __init__(self, f):
     """Load raw RIS references from *f* (a path or an open file object)."""
     if isinstance(f, str):
         f = open(f, "r")
     records = list(readris(f, mapping=self.get_mapping()))
     f.close()
     self.raw_references = records
Example #5
0
File: ris.py  Project: JoshAddington/hawc
 def __init__(self, fileObj):
     # Accept either a path or an already-open file object.
     # NOTE: `basestring` means this snippet targets Python 2.
     if isinstance(fileObj, basestring):
         f = open(fileObj, 'r')
     else:
         f = fileObj
     # Parse every RIS record eagerly; the tag mapping comes from the class.
     reader = readris(f, mapping=self.get_mapping())
     contents = [content for content in reader]
     # NOTE(review): this also closes a caller-supplied file object —
     # confirm that is intended.
     f.close()
     self.raw_references = contents
示例#6
0
文件: ris.py 项目: JoshAddington/hawc
 def file_readable(cls, fileObj):
     """Return True when *fileObj* parses as RIS; False on IOError."""
     try:
         # Consume the parser fully so read errors surface here.
         for _ in readris(fileObj, mapping=cls.get_mapping()):
             pass
         # Rewind so the caller can re-read from the start.
         fileObj.seek(0)
         return True
     except IOError:
         return False
示例#7
0
 def file_readable(cls, f):
     # ensure that file can be successfully parsed
     try:
         reader = readris(f, mapping=cls.get_mapping())
         # Consume the generator so parse/read errors surface here.
         [content for content in reader]
         # Rewind so the caller can re-read the file from the start.
         f.seek(0)
         return True
     except IOError as err:
         # NOTE(review): parse errors that are not IOError still
         # propagate to the caller — confirm that is intended.
         logging.warning(err)
         return False
    def test_parse_example_full_ris(self):
        """Both full journal records in example_full.ris parse with every tag."""
        filepath = os.path.join(CURRENT_DIR, 'example_full.ris')
        # Expected first record of the fixture.
        entry1 = {
            'TY': 'JOUR',
            'ID': '12345',
            'T1': 'Title of reference',
            'A1': ['Marx, Karl', 'Lindgren, Astrid'],
            'A2': ['Glattauer, Daniel'],
            'Y1': '2014//',
            'N2':
            'BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.  RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.',
            'KW': ['Pippi', 'Nordwind', 'Piraten'],
            'JF': 'Lorem',
            'JA': 'lorem',
            'VL': '9',
            'IS': '3',
            'SP': 'e0815',
            'CY': 'United States',
            'PB': 'Fun Factory',
            'SN': '1932-6208',
            'M1': '1008150341',
            'L2': 'http://example.com',
            'UR': 'http://example_url.com',
        }

        # Expected second record of the fixture.
        entry2 = {
            'TY': 'JOUR',
            'ID': '12345',
            'T1': 'The title of the reference',
            'A1': ['Marxus, Karlus', 'Lindgren, Astrid'],
            'A2': ['Glattauer, Daniel'],
            'Y1': '2006//',
            'N2':
            'BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.  RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.',
            'KW': ['Pippi Langstrumpf', 'Nordwind', 'Piraten'],
            'JF': 'Lorem',
            'JA': 'lorem',
            'VL': '6',
            'IS': '3',
            'SP': 'e0815341',
            'CY': 'Germany',
            'PB': 'Dark Factory',
            'SN': '1732-4208',
            'M1': '1228150341',
            'L2': 'http://example2.com',
            'UR': 'http://example_url.com',
        }

        results = self.nice_list([entry1, entry2])
        with open(filepath, 'r') as bibliography_file:
            entries = list(readris(bibliography_file))
            self.compare(results, entries)
示例#9
0
def create_llrs(filename):
    """Read a curated RIS file and return the set of LLR records.

    Parameters
    ----------
    filename: str
        Path to the raw RIS/MDI export to curate and parse.

    Returns
    -------
    set
        Set of LLR objects built from the RIS entries.
    """
    curated_filename = curate_RIS_MDI_file(filename)

    # BUG FIX: `llrs` was never initialised in this version, which raises
    # NameError at `llrs.add(...)` unless a module-level global happened
    # to exist.
    llrs = set()
    with open(curated_filename, 'r', encoding="utf-8") as bibliography_file:
        # using https://pypi.python.org/pypi/RISparser
        entries = readris(bibliography_file)
        for entry in entries:
            llrs.add(create_llr_from_RIS(entry))

    print("**********************************")
    print("new records (Set)= %d" % (len(llrs)))
    return llrs
示例#10
0
 def setup(self):
     """Fetch the HathiTrust catalog RIS export and keep its first record."""
     catalog_a = self.doc.find('a', href=self.CATALOG)
     if catalog_a:
         # NOTE(review): self.CATALOG is used as an href matcher above
         # and as a regex with .search() here — presumably a compiled
         # pattern; confirm against the class definition.
         catalog_num = self.CATALOG.search(catalog_a['href']).group(1)
         ris_url = 'https://catalog.hathitrust.org/Search/SearchExport?handpicked={}&method=ris'.format(
             catalog_num)
         response = requests.get(
             ris_url, headers={"User-Agent": settings.USER_AGENT})
         records = readris(response.text.splitlines()
                           ) if response.status_code == 200 else []
         # Keep only the first record; the early return skips the empty
         # fallback below.
         for record in records:
             self.record = record
             return
         self.record = {}
示例#11
0
def create_llrs(filename):
    # Curate the raw export, then build the set of LLR records from it.
    # NOTE: the bare `print` statements mean this version targets Python 2.
    curated_filename = curate_RIS_MDI_file(filename)
    
    llrs = set()
    with open(curated_filename, 'r') as bibliography_file:
        # using https://pypi.python.org/pypi/RISparser 
        entries = readris(bibliography_file)
        for entry in entries:
            llrs.add(create_llr_from_RIS(entry))

    print "**********************************"
    print "new records (Set)= %d" %(len(llrs))
    return llrs
示例#12
0
    def test_parse_example_full_ris(self):
        """Both full journal records in example_full.ris parse with every tag."""
        filepath = os.path.join(CURRENT_DIR, 'example_full.ris')
        # Expected first record of the fixture.
        entry1 = {
            'TY': 'JOUR',
            'ID': '12345',
            'T1': 'Title of reference',
            'A1': ['Marx, Karl', 'Lindgren, Astrid'],
            'A2': ['Glattauer, Daniel'],
            'Y1': '2014//',
            'N2': 'BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.  RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.',
            'KW': ['Pippi', 'Nordwind', 'Piraten'],
            'JF': 'Lorem',
            'JA': 'lorem',
            'VL': '9',
            'IS': '3',
            'SP': 'e0815',
            'CY': 'United States',
            'PB': 'Fun Factory',
            'SN': '1932-6208',
            'M1': '1008150341',
            'L2': 'http://example.com',
            'UR': 'http://example_url.com',
        }

        # Expected second record of the fixture.
        entry2 = {
            'TY': 'JOUR',
            'ID': '12345',
            'T1': 'The title of the reference',
            'A1': ['Marxus, Karlus', 'Lindgren, Astrid'],
            'A2': ['Glattauer, Daniel'],
            'Y1': '2006//',
            'N2': 'BACKGROUND: Lorem dammed ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.  RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.',
            'KW': ['Pippi Langstrumpf', 'Nordwind', 'Piraten'],
            'JF': 'Lorem',
            'JA': 'lorem',
            'VL': '6',
            'IS': '3',
            'SP': 'e0815341',
            'CY': 'Germany',
            'PB': 'Dark Factory',
            'SN': '1732-4208',
            'M1': '1228150341',
            'L2': 'http://example2.com',
            'UR': 'http://example_url.com',
        }

        results = self.nice_list([entry1, entry2])
        with open(filepath, 'r') as bibliography_file:
            entries = list(readris(bibliography_file))
            self.compare(results, entries)
示例#13
0
	def parse(self, response):
		"""
		Parse .ris file and submit article to DynamoDB table.

		Reads entries from Abstracts.ris, maps the RIS fields onto an
		Article item and yields each article into the pipeline.
		"""
		filepath = 'Abstracts.ris'

		with open(filepath, 'r', encoding="utf-8") as bibliography_file:
			entries = readris(bibliography_file)

			for entry in entries:

				# Map entries into an Article item.
				article = Article()

				# BUG FIX: the old pattern '[\s+]' is a character class
				# matching whitespace OR a literal '+'; r'\s+' strips
				# whitespace only, as intended.
				if 'accession_number' in entry:
					article['id'] = re.sub(r'\s+', '', entry['accession_number'])

				# Add title (prefer the translated title when present).
				# NOTE(review): assumes 'title' exists when there is no
				# translated title — a missing key raises KeyError.
				if 'translated_title' in entry:
					article['title'] = entry['translated_title']
				else:
					article['title'] = entry['title']

				# Add authors
				if 'authors' in entry:
					article['authors'] = entry['authors']

				# Add abstract
				if 'abstract' in entry:
					article['abstract'] = entry['abstract']

				# Add year of publishing
				if 'year' in entry:
					article['release_date'] = entry['year']

				# Add type of article
				if 'type_of_reference' in entry:
					article['article_type'] = entry['type_of_reference']

				# Add keywords
				if 'keywords' in entry:
					article['keywords'] = entry['keywords']

				# Record when the bot last fetched this article.
				article['last_update'] = int(time.mktime(self.now.timetuple()))

				# Yielding pushes the item through the pipeline.
				yield article
示例#14
0
    def test_parse_example_basic_ris(self):
        """A minimal single-entry RIS file round-trips through readris."""
        path = os.path.join(CURRENT_DIR, 'example_basic.ris')
        expected = self.nice_keys({
            'TY': 'JOUR',
            'AU': ['Shannon,Claude E.'],
            'PY': '1948/07//',
            'TI': 'A Mathematical Theory of Communication',
            'JF': 'Bell System Technical Journal',
            'SP': '379',
            'EP': '423',
            'VL': '27',
        })
        with open(path, 'r') as ris_file:
            parsed = list(readris(ris_file))
            self.compare([expected], parsed)
示例#15
0
    def test_parse_example_basic_ris(self):
        """A minimal single-entry RIS file parses into the expected record."""
        filepath = os.path.join(CURRENT_DIR, 'example_basic.ris')
        result_entry = self.nice_keys({
            'TY': 'JOUR',
            'AU': ['Shannon,Claude E.'],
            'PY': '1948/07//',
            'TI': 'A Mathematical Theory of Communication',
            'JF': 'Bell System Technical Journal',
            'SP': '379',
            'EP': '423',
            'VL': '27',
        })

        with open(filepath, 'r') as bibliography_file:
            entries = list(readris(bibliography_file))
            self.compare([result_entry], entries)
示例#16
0
 def test_parse_multiline_ris(self):
     """Multiline RIS fields (N2/N1) are folded into a single entry."""
     filepath = os.path.join(CURRENT_DIR, 'multiline.ris')
     result_entry = self.nice_keys({
         'TY': 'JOUR',
         'AU': ['Shannon,Claude E.'],
         'PY': '1948/07//',
         'TI': 'A Mathematical Theory of Communication',
         'JF': 'Bell System Technical Journal',
         'N2': 'first line, then second line and at the end the last line',
         'N1': ['first line', '* second line', '* last line'],
         'SP': '379',
         'EP': '423',
         'VL': '27',
     })
     with open(filepath, 'r') as f:
         entries = list(readris(f))
         self.compare([result_entry], entries)
示例#17
0
def read_ris(fp):
    """RIS file reader.

    Tries a small list of encodings until one decodes the file, then
    converts the parsed entries into a standardized DataFrame.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.

    Returns
    -------
    pandas.DataFrame:
        Result of ``standardize_dataframe`` over the parsed entries.

    """

    # Try encodings in order; the first one that decodes cleanly wins.
    encodings = ['ISO-8859-1', 'utf-8', 'utf-8-sig']
    entries = None
    for encoding in encodings:
        try:
            with open(fp, 'r', encoding=encoding) as bibliography_file:
                mapping = _tag_key_mapping(reverse=False)
                entries = list(readris(bibliography_file, mapping=mapping))
                break
        except UnicodeDecodeError:
            pass
        except IOError as e:
            # NOTE(review): an IO failure (e.g. missing file) is logged
            # but then reported below as an encoding problem — confirm
            # that is intended.
            logging.warning(e)

    if entries is None:
        raise ValueError("Cannot find proper encoding for data file.")

    df = pd.DataFrame(entries)

    def converter(x):
        # Join list-valued fields into one comma-separated string;
        # non-iterable values (e.g. NaN) become an empty string.
        try:
            return ", ".join(x)
        except TypeError:
            return ""

    # Flatten every list-typed RIS column so the frame holds plain strings.
    for tag in LIST_TYPE_TAGS:
        key = TAG_KEY_MAPPING[tag]
        if key in df:
            df[key] = df[key].apply(converter)
    return standardize_dataframe(df)
示例#18
0
def main(filename):
    """Render each RIS entry from *filename* through the cytuj template.

    Parameters
    ----------
    filename: str
        Path to the RIS bibliography file.
    """
    with open('cytuj.mustache') as template_file:
        template = template_file.read()

    with open(filename) as bib_file:
        entries = readris(bib_file)
        for e in entries:
            print(e)
            # Split the last listed author into surname/name (the template
            # only uses one author, as in the original).
            # BUG FIX: with an empty or missing author list the old code
            # raised NameError/KeyError; default to empty strings instead.
            surname, name = "", ""
            for a in e.get('authors', []):
                surname, name = a.split(",", 1)
            e['name'] = name
            e['surname'] = surname
            e['znaczenie'] = e.get('custom1')
            e['pismo'] = e.get('secondary_name', e.get('journal_name'))
            e['title'] = e.get('title', e.get('primary_title'))

            print("{{" + pystache.render(template, e) + "}}")
示例#19
0
def read_ris(fp):
    """RIS file reader.

    Tries several encodings until one decodes the file, then returns a
    standardized DataFrame of the parsed entries.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.

    Returns
    -------
    pandas.DataFrame:
        Result of ``standardize_dataframe`` over the parsed entries.

    """

    # Try encodings in order; the first one that decodes cleanly wins.
    encodings = ['ISO-8859-1', 'utf-8', 'utf-8-sig']
    entries = None
    for encoding in encodings:
        try:
            with open(fp, 'r', encoding=encoding) as bibliography_file:
                mapping = _tag_key_mapping(reverse=False)
                entries = list(readris(bibliography_file, mapping=mapping))
                break
        except (UnicodeDecodeError, IOError):
            # NOTE(review): IOError (e.g. missing file) is swallowed and
            # reported below as an encoding problem — confirm intended.
            pass

    if entries is None:
        raise ValueError("Cannot find proper encoding for data file.")

    df = pd.DataFrame(entries)
    if "keywords" in df:

        def converter(x):
            # Join the keyword list; non-iterables (e.g. NaN) become "".
            try:
                return ", ".join(x)
            except TypeError:
                return ""

        df["keywords"] = df["keywords"].apply(converter)
    return standardize_dataframe(df)
示例#20
0
    def test_parse_multiple_unknown_tags_ris(self):
        """Unknown RIS tags are collected under the 'UK' key."""
        path = os.path.join(CURRENT_DIR, 'example_multi_unknown_tags.ris')
        unknowns = defaultdict(list)
        unknowns['JP'].append('CRISPR')
        unknowns['DC'].append('Direct Current')
        expected = self.nice_keys({
            'TY': 'JOUR',
            'AU': ['Shannon,Claude E.'],
            'PY': '1948/07//',
            'TI': 'A Mathematical Theory of Communication',
            'JF': 'Bell System Technical Journal',
            'SP': '379',
            'EP': '423',
            'VL': '27',
            # i.e. {'JP': ['CRISPR'], 'DC': ['Direct Current']}
            'UK': unknowns,
        })
        with open(path, 'r') as ris_file:
            parsed = list(readris(ris_file))
            self.compare([expected], parsed)
示例#21
0
def readRIS(filename):
    """Read a RIS file and convert its entries to bib-style dicts.

    Parameters
    ----------
    filename: str
        Path to the RIS file.

    Returns
    -------
    list of dict
        Entries with 'author' and 'ENTRYTYPE' normalized and passed
        through ``fixBibData``.
    """
    # BUG FIX: readris returns a lazy generator; materialize it while the
    # file is still open — the old code iterated it after the `with`
    # block had already closed the file.
    with open(filename, 'r') as f:
        entries = list(readris(f))

    res = []

    for entry in entries:
        # Normalize the author list into a single 'author' field.
        entry['author'] = authorListFromListOfAuthors(entry.get('authors', []))
        if 'authors' in entry:
            del entry['authors']

        # Map the RIS reference type onto a bib entry type, defaulting
        # to 'article' when unknown.
        new_type = 'article'
        if entry.get('type_of_reference'):
            if entry['type_of_reference'] in reverse_type_mapping:
                new_type = reverse_type_mapping[entry['type_of_reference']]

        entry['ENTRYTYPE'] = new_type
        entry = fixBibData(entry, 0)
        res.append(entry)

    return res
示例#22
0
 def test_parse_example_basic_ris(self):
     """Each expected tag/value pair appears in the first parsed entry."""
     mapping = TAG_KEY_MAPPING
     filedirpath = os.path.dirname(os.path.realpath(__file__))
     filepath = filedirpath + '/example_basic.ris'
     # One {TAG: expected value} dict per RIS tag under test.
     ristags = [
         {
             'TY': 'JOUR'
         },
         {
             'AU': ['Shannon,Claude E.']
         },
         {
             'PY': '1948/07//'
         },
         {
             'TI': 'A Mathematical Theory of Communication'
         },
         {
             'JF': 'Bell System Technical Journal'
         },
         {
             'SP': '379'
         },
         {
             'EP': '423'
         },
         {
             'VL': '27'
         },
     ]
     with open(filepath, 'r') as bibliography_file:
         entries = list(readris(bibliography_file))
         assert len(entries)
         for ristag in ristags:
             k, v = ristag.popitem()
             # Translate the raw RIS tag to the parser's mapped key.
             k = mapping[k]
             if isinstance(entries[0][k], list):
                 assert ''.join(v) == ''.join(entries[0][k])
             else:
                 assert v == entries[0][k].strip()
示例#23
0
def read_ris(fp):
    """RIS file reader.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.

    Returns
    -------
    list:
        List with entries.

    """

    # Parse the whole file eagerly using the project's tag->key mapping.
    with open(fp, 'r') as bibliography_file:
        mapping = _tag_key_mapping(reverse=False)
        entries = list(readris(bibliography_file, mapping=mapping))

    return entries
示例#24
0
 def test_parse_example_full_ris(self):
     """Every expected tag of the first full record parses correctly.

     NOTE: `types.ListType` means this snippet targets Python 2.
     """
     mapping = TAG_KEY_MAPPING
     filedirpath = os.path.dirname(os.path.realpath(__file__))
     filepath = filedirpath + '/example_full.ris'
     # One {TAG: expected value} dict per RIS tag under test.
     ristags = [
         {'TY': 'JOUR'},
         {'ID': '12345'},
         {'T1': 'Title of reference'},
         {'A1': ['Marx, Karl', 'Lindgren, Astrid']},
         {'A2': ['Glattauer, Daniel']},
         {'Y1': '2014//'},
         {'N2': 'BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.  RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.'},
         {'KW': ['Pippi', 'Nordwind', 'Piraten']},
         {'JF': 'Lorem'},
         {'JA': 'lorem'},
         {'VL': '9'},
         {'IS': '3'},
         {'SP': 'e0815'},
         {'CY': 'United States'},
         {'PB': 'Fun Factory'},
         {'SN': '1932-6208'},
         {'M1': '1008150341'},
         {'L2': 'http://example.com'},
         {'UR': 'http://example_url.com'},
     ]
     with open(filepath, 'r') as bibliography_file:
         entries = list(readris(bibliography_file))
         assert len(entries) == 2
         for ristag in ristags:
             k, v = ristag.popitem()
             # Translate the raw RIS tag to the parser's mapped key.
             k = mapping[k]
             assert k in entries[0]
             if isinstance(entries[0][k], types.ListType):
                 assert ''.join(v) == ''.join(entries[0][k])
             else:
                 assert v == entries[0][k].strip()
示例#25
0
 def test_parse_example_basic_ris(self):
     """Each expected tag/value pair appears in the first parsed entry.

     NOTE: `types.ListType` means this snippet targets Python 2.
     """
     mapping = TAG_KEY_MAPPING
     filedirpath = os.path.dirname(os.path.realpath(__file__))
     filepath = filedirpath + '/example_basic.ris'
     # One {TAG: expected value} dict per RIS tag under test.
     ristags = [
         {'TY': 'JOUR'},
         {'AU': ['Shannon,Claude E.']},
         {'PY': '1948/07//'},
         {'TI': 'A Mathematical Theory of Communication'},
         {'JF': 'Bell System Technical Journal'},
         {'SP': '379'},
         {'EP': '423'},
         {'VL': '27'},
     ]
     with open(filepath, 'r') as bibliography_file:
         entries = list(readris(bibliography_file))
         assert len(entries)
         for ristag in ristags:
             k, v = ristag.popitem()
             # Translate the raw RIS tag to the parser's mapped key.
             k = mapping[k]
             if isinstance(entries[0][k], types.ListType):
                 assert ''.join(v) == ''.join(entries[0][k])
             else:
                 assert v == entries[0][k].strip()
示例#26
0
def convert():
    """Flask view: convert an uploaded .bib/.ris/.xml file to an HTML table."""
    parser = BibTexParser(common_strings=False)
    parser.ignore_nonstandard_types=False
    parser.homogenize_fields= False

    if request.method== 'POST':
        file = request.files['filer_input']
        if file.filename == '':
            flash('no selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            if file and file_bib(file.filename):
                # BibTeX branch: parse the raw stream and tabulate it.
                bibtex_file = file.stream
                bibtex_str = bibtex_file.read()
                bib_database = bibtexparser.loads(bibtex_str, parser)
                df = pd.DataFrame(bib_database.entries)
                df.index += 1
                return render_template_string(html, filenm = file.filename, 
                                                table = df.to_html(index='Nomor', header='true', table_id='example', 
                                                classes='table table-striped table-bordered'))
            elif file and file_ris(file.filename):
                # RIS branch: wrap the byte stream as UTF-8 text for readris.
                ris_file = file.stream.read()
                reader = io.BytesIO(ris_file)
                wrapper = io.TextIOWrapper(reader, encoding='utf-8')
                entries = readris(wrapper) 
                df = pd.DataFrame(entries)
                df['publication_year'] = df['publication_year'].apply(remove_punch)
                df.index += 1
                return render_template_string(html, filenm = file.filename, 
                                                table = df.to_html(header='true', table_id='example', classes='table table-striped table-bordered'))
            else:
                # Fallback branch: treat the upload as an XML export and
                # flatten its record list into a DataFrame.
                data = xmltodict.parse(file)
                json_data = json.dumps(data)
                wanda = pd.read_json(StringIO(json_data))
                vision = wanda["xml"]['records']['record']
                axel = pd.DataFrame(vision)
                if axel.index[0] == '@name':
                    if axel['database'] is not None:
                        del axel['database']
                        # axel.index +=1
                    else:
                        None
                else:
                    if axel['database'] is not None:
                        del axel['database']
                    else:
                        None
                    #for get type value
                    type_data = []
                    for i, each in enumerate(axel['ref-type']):
                        if '@name' in axel['ref-type'][i].keys():
                            try:
                                data_type = axel['ref-type'][i]['@name']
                                type_data.append(data_type)
                            except Exception as e:
                                print(e)
                    axel['ref-type'] = type_data
                    #for get author value
                    authors = []
                    for i, each in enumerate(axel['contributors']):
                        if 'authors' in axel['contributors'][i].keys():
                            try:
                                data_author = axel['contributors'][i]['authors']['author']
                                authors.append(data_author)
                            except Exception as e:
                                print(e)
                    axel['contributors'] = authors
                    # for get title
                    titles=[]
                    for i, each in enumerate(axel['titles']):
                        title_data = axel['titles'][i].values()
                        titles.append(title_data)
                    axel['titles']= titles
                    # for get full-tittle
                    periodical=[]
                    for i, each in enumerate(axel['periodical']):
                        hulk = axel['periodical'][i]
                        # NOTE(review): if the FIRST 'periodical' is None,
                        # periodical_data is unbound here and this raises
                        # NameError; later Nones silently reuse the previous
                        # row's value — confirm this is intended.
                        if hulk is not None: # else hulk = 'NaN' 
                            periodical_data = hulk.values()
                        periodical.append(periodical_data)
                    axel['periodical']= periodical
                    # FOR GET keywords
                    # keyw=[]
                    # for i, each in enumerate(axel['keywords']):
                    #     hawk_eye = axel['keywords'][i]
                    #     if hawk_eye is not None: # else hulk = 'NaN' 
                    #         keyw_data = hawk_eye.values()
                    # keyw.append(keyw_data)
                    # axel['keywords']= keyw
                    axel.index += 1
                return render_template_string(html, filenm = file.filename, 
                                                table = axel.to_html(header='true', table_id='example', 
                                                classes='table table-striped table-bordered'))
        else:
            return render_template('output1.html')
    return render_template('index.html')
示例#27
0
from RISparser import readris, config

# Compare two RIS exports and collect the entries that are new in 2017.
path_old = 'databases2015.ris'
path_new = 'databases2017.ris'

overlap_entries = 0
new_entries = []

with open(path_old, 'r') as bibfile_old, open(path_new, 'r') as bibfile_new:
    entries_old = list(readris(bibfile_old))
    entries_new = list(readris(bibfile_new))

    print('Entries in old file: %d' % len(entries_old))
    print('Entries in new file: %d' % len(entries_new))

    # An entry counts as overlapping when an equal dict exists in the
    # old export; otherwise it is new. (Linear scan per entry: O(n*m).)
    for entry in entries_new:
        if len([x for x in entries_old if x == entry]):
            overlap_entries = overlap_entries + 1
        else:
            new_entries.append(entry)

print('Overlapping entries: %d' % overlap_entries)
print('New entries: %d' % len(new_entries))

# Reverse the tag->key mapping so parsed keys can be written back as RIS tags.
tag_key_mapping = dict(
    zip(config.TAG_KEY_MAPPING.values(), config.TAG_KEY_MAPPING.keys()))

with open('new_entries.ris', 'w') as out:
    for entry in new_entries[1:5]:
        for key, value in entry.iteritems():
            if key in tag_key_mapping:
示例#28
0
# Python 2 variant of the RIS diff script: `sets.Set` was removed in Python 3.
from sets import Set

path_old = 'databases2015.ris'
path_new = 'databases2017.ris'

overlap_entries = 0
new_entries = []


# Dict subclass usable inside a Set; hashes on the key set only.
# NOTE(review): hash(frozenset(self)) ignores values, so entries with equal
# keys but different values collide — equality still disambiguates, but
# confirm this is intended.
class hashabledict(dict):
    def __hash__(self):
        return hash(frozenset(self))


with open(path_old, 'r') as bibfile_old, open(path_new, 'r') as bibfile_new:
    # Set membership makes the overlap test O(1) per entry.
    entries_old = Set([hashabledict(x) for x in readris(bibfile_old)])
    entries_new = [hashabledict(x) for x in list(readris(bibfile_new))]

    print('Entries in old file: %d' % len(entries_old))
    print('Entries in new file: %d' % len(entries_new))

    for entry in entries_new:
        if entry in entries_old:
            overlap_entries = overlap_entries + 1
        else:
            new_entries.append(entry)

print('Overlapping entries: %d' % overlap_entries)
print('New entries: %d' % len(new_entries))

tag_key_mapping = dict(
示例#29
0
文件: ananse.py 项目: baasare/ananse
    def import_naive_results(self,
                             path,
                             save_dataset=False,
                             save_directory=None,
                             clean_dataset=False):
        """
        This method imports the search results from a specified path

        Files are dispatched on extension: ``.txt`` is treated as a Web of
        Science export, ``.csv`` as Scopus, and ``.ris`` as JSTOR.  Each
        source's columns are appended to ``self.search_results`` and the
        combined dict is returned as a single data frame.

        :param clean_dataset: if TRUE, de-duplicates search results after importing
        :param save_dataset: if TRUE, saves the full search results to a .csv
        :param save_directory: the path to a directory where search results will be saved if save_dataset is set to TRUE
        :param path: path containing the naive search results files
        :return: a pandas data frame consisting of assembled search results
        """

        wos_li = []
        scopus_li = []
        jstor_li = []

        # import all files in the directory, automatically detect on file name extension [supports WOS and Scopus]

        all_files = glob.glob(path + "*")
        for filename in all_files:
            if filename.endswith('.txt'):
                # WOS exports are tab-separated; quoting is disabled
                # (quoting=3 == csv.QUOTE_NONE) because free-text fields may
                # contain stray quote characters.
                df = pd.read_csv(filename,
                                 index_col=False,
                                 delimiter="\t",
                                 quotechar=None,
                                 quoting=3,
                                 encoding='utf-8')
                wos_li.append(df)
            elif filename.endswith('.csv'):
                df = pd.read_csv(filename)
                scopus_li.append(df)
            elif filename.endswith('.ris'):
                # Rewrite the .ris file in place, dropping provider banner
                # lines that confuse the RIS parser, then parse it.
                with open(filename, 'r', encoding='utf-8') as f:
                    lines = f.readlines()

                with open(filename, "w", encoding='utf-8') as f:
                    strings = ("Provider", "Database", "Content")
                    for line in lines:
                        if not any(s in line for s in strings):
                            f.write(line)

                with open(filename, 'r',
                          encoding='utf-8') as bibliography_file:
                    jstor_li.extend(readris(bibliography_file))

        if wos_li:  # check to see if any web of science results was imported

            # concatenate files from same database into DataFrame
            wos_dataset = pd.concat(wos_li, axis=0, ignore_index=True)

            # merge article titles and abstract to form text from which keywords will be extracted
            text = list(wos_dataset[['AB', 'TI']].apply(
                lambda x: '{}{}'.format(x[0], x[1]), axis=1))
            wos_text = [self.remove_punctuations(t) for t in text]

            # Map the two-letter WOS column codes onto our canonical fields.
            self.search_results['id'] = list(wos_dataset['UT'])
            self.search_results['text'] = wos_text
            self.search_results['title'] = list(wos_dataset['TI'])
            self.search_results['abstract'] = list(wos_dataset['AB'])
            self.search_results['keywords'] = list(wos_dataset['DE'])
            self.search_results['type'] = list(wos_dataset['DT'])
            self.search_results['authors'] = list(wos_dataset['AU'])
            self.search_results['affiliation'] = list(wos_dataset['C1'])
            self.search_results['source'] = list(wos_dataset['SO'])
            self.search_results['year'] = list(wos_dataset['PY'])
            self.search_results['volume'] = list(wos_dataset['VL'])
            self.search_results['issue'] = list(wos_dataset['IS'])
            self.search_results['startpage'] = list(wos_dataset['BP'])
            self.search_results['endpage'] = list(wos_dataset['EP'])
            self.search_results['doi'] = list(wos_dataset['DI'])
            self.search_results['language'] = list(wos_dataset['LA'])
            self.search_results['database'] = (['WOS'] *
                                               len(wos_dataset.index))

        if scopus_li:  # check to see if any scopus results was imported

            # concatenate files from same database into DataFrame
            scopus_dataset = pd.concat(scopus_li, axis=0, ignore_index=True)

            text = list(scopus_dataset[['Abstract', 'Title']].apply(
                lambda x: '{}{}'.format(x[0], x[1]), axis=1))
            scopus_text = [self.remove_punctuations(t) for t in text]

            # Scopus columns are appended after any WOS rows already present.
            self.search_results['id'] = self.search_results['id'] + list(
                scopus_dataset['EID'])
            self.search_results[
                'text'] = self.search_results['text'] + scopus_text
            self.search_results['title'] = self.search_results['title'] + list(
                scopus_dataset['Title'])
            self.search_results['abstract'] = self.search_results[
                'abstract'] + list(scopus_dataset['Abstract'])
            self.search_results['keywords'] = self.search_results[
                'keywords'] + list(scopus_dataset['Author Keywords'])
            self.search_results['type'] = self.search_results['type'] + list(
                scopus_dataset['Document Type'])
            self.search_results['authors'] = self.search_results[
                'authors'] + list(scopus_dataset['Authors'])
            self.search_results['affiliation'] = self.search_results[
                'affiliation'] + list(scopus_dataset['Affiliations'])
            self.search_results['source'] = self.search_results[
                'source'] + list(scopus_dataset['Source'])
            self.search_results['year'] = self.search_results['year'] + list(
                scopus_dataset['Year'])
            self.search_results['volume'] = self.search_results[
                'volume'] + list(scopus_dataset['Volume'])
            self.search_results['issue'] = self.search_results['issue'] + list(
                scopus_dataset['Issue'])
            self.search_results['startpage'] = self.search_results[
                'startpage'] + list(scopus_dataset['Page start'])
            self.search_results['endpage'] = self.search_results[
                'endpage'] + list(scopus_dataset['Page end'])
            self.search_results['doi'] = self.search_results['doi'] + list(
                scopus_dataset['DOI'])
            # Scopus export carries no language column; assume English.
            self.search_results['language'] = self.search_results[
                'language'] + (['English'] * len(scopus_dataset.index))
            self.search_results['database'] = self.search_results[
                'database'] + list(scopus_dataset['Source'])

        if jstor_li:  # check to see if any jstor results was imported
            # concatenate files from same database into DataFrame
            jstor_dataset = pd.DataFrame(jstor_li)

            # Scrub self.bad_chars from abstract/author strings.
            # NOTE(review): addressing rows with .at[i, ...] assumes the
            # default RangeIndex produced by pd.DataFrame(jstor_li).
            for i, (a, b) in enumerate(
                    zip(jstor_dataset.abstract, jstor_dataset.authors)):
                jstor_dataset.at[i, 'abstract'] = str(a).translate(
                    {ord(c): ''
                     for c in self.bad_chars})
                jstor_dataset.at[i, 'authors'] = str(b).translate(
                    {ord(c): ''
                     for c in self.bad_chars})

            text = list(jstor_dataset[['abstract', 'title']].apply(
                lambda x: '{}{}'.format(x[0], x[1]), axis=1))
            jstor_text = [self.remove_punctuations(t) for t in text]

            self.search_results['id'] = self.search_results['id'] + list(
                jstor_dataset['issn'])
            self.search_results[
                'text'] = self.search_results['text'] + jstor_text
            self.search_results['title'] = self.search_results['title'] + list(
                jstor_dataset['title'])
            self.search_results['abstract'] = self.search_results[
                'abstract'] + list(jstor_dataset['abstract'])
            # RIS entries carry no keyword column here; pad with 'NaN'.
            self.search_results['keywords'] = self.search_results[
                'keywords'] + (['NaN'] * len(jstor_dataset.index))
            self.search_results['type'] = self.search_results['type'] + list(
                jstor_dataset['type_of_reference'])
            # BUG FIX: the original appended jstor_dataset['type_of_reference']
            # here (copy-paste from the line above); authors must come from
            # the 'authors' column that was cleaned in the loop above.
            self.search_results['authors'] = self.search_results[
                'authors'] + list(jstor_dataset['authors'])
            self.search_results['affiliation'] = self.search_results[
                'affiliation'] + (['NaN'] * len(jstor_dataset.index))
            self.search_results['source'] = self.search_results[
                'source'] + list(jstor_dataset['name_of_database'])
            self.search_results['year'] = self.search_results['year'] + list(
                jstor_dataset['year'])
            self.search_results['volume'] = self.search_results[
                'volume'] + list(jstor_dataset['volume'])
            self.search_results['issue'] = self.search_results['issue'] + list(
                jstor_dataset['number'])
            self.search_results['startpage'] = self.search_results[
                'startpage'] + list(jstor_dataset['start_page'])
            self.search_results['endpage'] = self.search_results[
                'endpage'] + list(jstor_dataset['end_page'])
            self.search_results['doi'] = self.search_results['doi'] + list(
                jstor_dataset['doi'])
            self.search_results['language'] = self.search_results[
                'language'] + (['English'] * len(jstor_dataset.index))
            self.search_results['database'] = self.search_results[
                'database'] + list(jstor_dataset['name_of_database'])

        # merge all search database results into a dataframe
        data_frame = pd.DataFrame.from_dict(self.search_results)

        if save_dataset is True:
            if save_directory is not None:
                # the r"{}".format(...) wrapper in the original was a no-op;
                # plain concatenation yields the identical path string.
                data_frame.to_csv(save_directory + 'imported_results.csv',
                                  index=False)
            else:
                data_frame.to_csv('data/IMPORTED/imported_results.csv',
                                  index=False)

        if clean_dataset is True:
            data_frame = self.deduplicate_dataframe(data_frame, self.columns)

        return data_frame
示例#30
0
 def test_starting_newline(self):
     """A file that begins with blank lines must still parse to one entry."""
     ris_path = os.path.join(CURRENT_DIR, 'example_starting_newlines.ris')
     with open(ris_path, 'r') as ris_file:
         parsed = list(readris(ris_file))
     assert len(parsed) == 1
示例#31
0
 def test_parse_wos_ris(self):
     """Web-of-Knowledge flavoured RIS (wok=True) parses to two entries."""
     ris_path = os.path.join(CURRENT_DIR, 'example_wos.ris')
     with open(ris_path, 'r') as ris_file:
         parsed = list(readris(ris_file, wok=True))
     assert len(parsed) == 2
示例#32
0
def load_entries_from_ris(ris_fpath):
    """Parse the RIS file at *ris_fpath* and return its reference entries.

    :param ris_fpath: path accepted by ``fnmstr``
    :return: list of entry dicts produced by ``readris``
    """
    ris_fpath = fnmstr(ris_fpath)
    with open(ris_fpath, "r") as read:
        # BUG FIX: readris() returns a lazy generator; the original returned
        # it unconsumed, so iterating it after the `with` block raised
        # "I/O operation on closed file". Materialize while the file is open.
        entries = list(readris(read))
    return entries
示例#33
0
 def test_parse_example_full_ris(self):
     """Parse example_full.ris and verify that every expected RIS tag is
     mapped (via TAG_KEY_MAPPING) to the right key and value in the first
     of the two parsed entries.
     """
     mapping = TAG_KEY_MAPPING
     filedirpath = os.path.dirname(os.path.realpath(__file__))
     filepath = filedirpath + '/example_full.ris'
     # Expected data as a list of single-item {RIS tag: value} dicts; list
     # values (authors, keywords) are compared after joining, scalar values
     # after stripping surrounding whitespace.
     ristags = [
         {
             'TY': 'JOUR'
         },
         {
             'ID': '12345'
         },
         {
             'T1': 'Title of reference'
         },
         {
             'A1': ['Marx, Karl', 'Lindgren, Astrid']
         },
         {
             'A2': ['Glattauer, Daniel']
         },
         {
             'Y1': '2014//'
         },
         {
             'N2':
             'BACKGROUND: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.  RESULTS: Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. CONCLUSIONS: Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium.'
         },
         {
             'KW': ['Pippi', 'Nordwind', 'Piraten']
         },
         {
             'JF': 'Lorem'
         },
         {
             'JA': 'lorem'
         },
         {
             'VL': '9'
         },
         {
             'IS': '3'
         },
         {
             'SP': 'e0815'
         },
         {
             'CY': 'United States'
         },
         {
             'PB': 'Fun Factory'
         },
         {
             'SN': '1932-6208'
         },
         {
             'M1': '1008150341'
         },
         {
             'L2': 'http://example.com'
         },
         {
             'UR': 'http://example_url.com'
         },
     ]
     with open(filepath, 'r') as bibliography_file:
         entries = list(readris(bibliography_file))
         assert len(entries) == 2
         for ristag in ristags:
             # Each dict holds exactly one pair; popitem() extracts it.
             k, v = ristag.popitem()
             # Translate the two-letter RIS tag to the parser's key name.
             k = mapping[k]
             assert k in entries[0]
             if isinstance(entries[0][k], list):
                 # List-valued fields: compare the joined strings.
                 assert ''.join(v) == ''.join(entries[0][k])
             else:
                 assert v == entries[0][k].strip()
示例#34
0
def read_ris(q, update):
    """Import the RIS query file attached to *q*, adding each parsed record
    via ``add_scopus_doc``.

    The uploaded file is first sanitised into a ``_tmp`` copy: BOM
    characters (literal or escaped), stray "EF" terminator lines, Ovid
    full-text link lines, and value-less tag lines ("XX  -") all confuse
    the parser.  Parsing is attempted once with the platform default
    encoding and retried with utf-8-sig if anything in the first pass
    raises.

    :param q: object with a ``query_file`` upload under MEDIA_ROOT
    :param update: forwarded to ``add_scopus_doc``
    :return: number of records added, or — on the first pass only — the
             entry that could not be added
    """
    r_count = 0
    changed = False
    encoding = None
    # Matches a RIS tag line with no value, e.g. "TI  -\n"; the rewrite
    # below inserts a space before the newline so the parser sees an empty
    # value instead of a malformed line. Compiled once, not per line.
    empty_tag_re = re.compile('^[A-Z][A-Z0-9]  -\n')
    with open("{}/{}".format(settings.MEDIA_ROOT, q.query_file.name),
              'r') as f:
        with open("{}/{}_tmp".format(settings.MEDIA_ROOT, q.query_file.name),
                  "w") as ftmp:
            for l in f:
                if "\\ufeff" in repr(l) or "\ufeff" in repr(l):
                    # Strip byte-order marks, whether literal or escaped.
                    #encoding = 'utf-8-sig'
                    changed = True
                    ftmp.write(l.replace('\\ufeff', '').replace('\ufeff', ''))
                elif l == "EF":
                    # Drop stray end-of-file marker lines entirely.
                    changed = True
                elif "Link to the Ovid Full Text or citation:" in l:
                    changed = True
                elif empty_tag_re.match(l):
                    changed = True
                    ftmp.write(l.replace('-\n', '- \n'))
                else:
                    ftmp.write(l)
    if changed:
        print("opening edited file")
        fpath = "{}/{}_tmp".format(settings.MEDIA_ROOT, q.query_file.name)
    else:
        fpath = "{}/{}".format(settings.MEDIA_ROOT, q.query_file.name)

    with open(fpath, "r", encoding=encoding) as f:
        entries = readris(f, mapping=RIS_KEY_MAPPING)
        try:
            for e in entries:
                if "py" in e:
                    # BUG FIX: the original wrote `type(e["py"] is str)`,
                    # which is type() of a bool and therefore always truthy;
                    # isinstance() is what was intended.
                    if isinstance(e["py"], str):
                        e["py"] = int(e["py"][:4])
                if "unknown_tag" in e:
                    del e["unknown_tag"]
                try:
                    add_scopus_doc(e, q, update)
                    r_count += 1
                except Exception:
                    print(f"couldn't add {e}")
                    return e

        except Exception:
            # First pass failed somewhere — retry the whole file as
            # utf-8-sig and keep going past individual bad entries.
            r_count = 0
            with open(fpath, 'r', encoding='utf-8-sig') as f:
                entries = readris(f, mapping=RIS_KEY_MAPPING)
                for e in entries:
                    if "py" in e:
                        # Same isinstance fix as the first pass above.
                        if isinstance(e["py"], str):
                            e["py"] = int(e["py"][:4])
                    if "tc" in e:
                        if isinstance(e["tc"], str):
                            # Times-cited: keep the first run of digits,
                            # or None when there are none.
                            digits = re.findall(r"\d+", e["tc"])
                            if len(digits) > 0:
                                e["tc"] = int(digits[0])
                            else:
                                e["tc"] = None
                    if "unknown_tag" in e:
                        del e["unknown_tag"]

                    try:
                        add_scopus_doc(e, q, update)
                        r_count += 1
                    except Exception:
                        print(f"couldn't add {e}")
                        break
    return r_count