def get_wos_si_source_data(path_wos_si_source, ignore_extra_cols=True):
    """
    Read a pipe-separated WoS SI source file and collect cited journal forms
    together with their metadata.

    :param path_wos_si_source: path to the pipe-separated source file
        (7 columns expected per line)
    :param ignore_extra_cols: when False, also collect the alternative cited
        form stored in the fifth column
    :return: a set of 'issn|cited_form|year|volume|' strings
    """
    cited_forms_with_metadata = set()
    # `with` guarantees the file is closed even if a line raises
    # (the previous version leaked the handle on any exception).
    with open(path_wos_si_source) as file_wos_si_source:
        for line in file_wos_si_source:
            els = line.split('|')
            if len(els) != 7:
                print('line is invalid', line, sep="-->")
                continue
            issn = els[0]
            year = els[2]
            volume = els[3]
            # lines without the minimal metadata are skipped entirely
            if issn == '' or year == '' or volume == '':
                continue
            cited_form_1 = StringProcessor.preprocess_journal_title(
                els[1].strip()).upper()
            if cited_form_1 != '':
                cited_forms_with_metadata.add(
                    '|'.join([issn, cited_form_1, year, volume, '']))
            if not ignore_extra_cols:
                cited_form_2 = StringProcessor.preprocess_journal_title(
                    els[4].strip()).upper()
                if cited_form_2 != '':
                    cited_forms_with_metadata.add(
                        '|'.join([issn, cited_form_2, year, volume, '']))
    return cited_forms_with_metadata
Пример #2
0
def extract_keys(citation: Citation):
    """
    Build lookup keys from a citation's fields.

    :param citation: a Citation object
    :return: a tuple (major_key, minor_key) of comma-separated strings, where
        major_key additionally contains the first page; (None, None) when the
        first author is missing or has no usable surname
    """
    if not citation.first_author:
        return None, None

    raw_given = citation.first_author.get('given_names', '').replace('.', ' ')
    given_name = StringProcessor.preprocess_name(raw_given.split(' ')[0]).lower()
    given_initial = given_name[0].lower() if given_name else ''

    raw_surname = citation.first_author.get('surname', '')
    raw_surname = raw_surname.replace('.', ' ').replace(';', ' ')
    surname = StringProcessor.preprocess_name(raw_surname.split(' ')[-1]).lower()
    if surname == '':
        return None, None

    # coalesce every optional field to an empty string
    pub_date = citation.publication_date or ''
    journal = (StringProcessor.preprocess_journal_title(citation.source).lower()
               if citation.source else '')
    number = citation.issue or ''
    volume = citation.volume or ''
    page = citation.first_page or ''

    minor_key = ','.join(
        [given_initial, surname, pub_date, journal, number, volume])
    major_key = ','.join(
        [given_initial, surname, pub_date, journal, number, volume, page])

    return major_key, minor_key
Пример #3
0
def parse_html(html: str):
    """
    Parse a journal HTML page into sorted pipe-separated value lines.

    :param html: content in html format
    :return: a sorted list of unique 'issn|title|year-volume' strings
    """
    soup = bs4.BeautifulSoup(html, features='html.parser')

    # titles and title abbreviations, both preprocessed
    titles = {StringProcessor.preprocess_journal_title(t)
              for t in _search_attribute('title', soup)}
    titles |= {StringProcessor.preprocess_journal_title(t)
               for t in _search_attribute('title abbreviation', soup)}

    # cleaned availability entries
    cleaned_availability = _clean_availability(
        _search_attribute('availability', soup))

    # normalized, de-duplicated ISSNs
    issns = [_normalize_issn(i.strip().upper())
             for i in set(_search_attribute('issn', soup))]

    # cleaned recent issues
    cleaned_recent_issues = _clean_recent_issues(
        _search_attribute('recent issues', soup))

    # cross every title with every (year, volume, number) and every ISSN
    years_volumes_numbers = set(cleaned_recent_issues) | set(cleaned_availability)

    csv_lines = set()
    for title in titles:
        for yvn in years_volumes_numbers:
            for issn in issns:
                if issn != '':
                    csv_lines.add(issn + '|' + title.upper() + '|' + yvn.upper())
    return sorted(csv_lines)
def save_char_freq(c2freq: dict):
    """
    Save the char2freq dictionary to disk as a tab-separated file.

    Rows are sorted by decreasing frequency; each row holds the char, its
    preprocessed version, and its number of occurrences.

    :param c2freq: a dictionary mapping each char to its number of
        occurrences (formatted with %d, so values are expected to be ints)
    """
    # `with` closes the file even on error (the previous version leaked the
    # handle); the bound method is a simpler sort key than a lambda.
    with open(DEFAULT_DIR_INDEXES + '../char_freq.csv', 'w') as final_c2freq:
        for k in sorted(c2freq, key=c2freq.get, reverse=True):
            final_c2freq.write('%s\t%s\t%d' % (
                k, StringProcessor.preprocess_journal_title(k), c2freq.get(k)) + '\n')
Пример #5
0
def extract_citation_data(citation_json: str):
    """
    Extract (preprocessed title, year, volume) from an article citation.

    Also updates the global dictionary of cited titles via update_titles.

    :param citation_json: a raw citation document (passed to Citation)
    :return: (cit_title_preprocessed, cit_year, cit_volume) when the citation
        has a source and is an article; None otherwise
    """
    cit = Citation(citation_json)

    # we compare only article citations with a non-empty source.
    # BUG FIX: the previous version raised UnboundLocalError when the
    # citation had a source but was not an article, because the return
    # statement referenced variables assigned only in the article branch.
    if cit.source and cit.publication_type == 'article':
        # preprocess cited journal title
        cit_title_preprocessed = StringProcessor.preprocess_journal_title(
            cit.source).upper()

        # update dictionary of cited titles
        update_titles(cit_title_preprocessed)

        # collect year/volume for using in year volume base (if needed)
        cit_year = cit.publication_date
        cit_volume = cit.volume

        return cit_title_preprocessed, cit_year, cit_volume

    return None
def get_doi2cited_form_dict(path_refs_wos_doi):
    """
    Read a pipe-separated references file and map each DOI to its cited forms.

    :param path_refs_wos_doi: path to a file with 6 pipe-separated columns
        per line (cited form in column 3, DOI in column 6)
    :return: a dict mapping each DOI to a list of unique preprocessed
        cited forms
    """
    doi2cited_form = {}
    # errors='replace' prevents a UnicodeDecodeError mid-file.
    # BUG FIX: the previous try/except around readline() never advanced
    # `line` when the exception fired, so a single bad byte caused an
    # infinite loop. `with` also guarantees the file is closed.
    with open(path_refs_wos_doi, errors='replace') as file_refs_wos_doi:
        for line in file_refs_wos_doi:
            rels = line.split('|')
            if len(rels) != 6:
                print('line is invalid', line, sep='-->')
                continue
            cited_form = StringProcessor.preprocess_journal_title(
                rels[2].strip()).upper()
            doi = rels[5].strip()
            forms = doi2cited_form.setdefault(doi, [])
            if cited_form not in forms:
                forms.append(cited_form)
    return doi2cited_form
Пример #7
0
    # access local references' database
    refdb = MongoClient()[db_name]

    for col in refdb.list_collection_names():
        print('\nStart %s' % col)
        num_articles = 0
        num_all = 0
        for cjson in refdb[col].find({}):
            cit = Citation(cjson)
            if cit.source:
                if cit.publication_type == 'article':
                    print('\r%d' % num_articles, end='')
                    num_articles += 1

                    cit_title_preprocessed = StringProcessor.preprocess_journal_title(cit.source).upper()
                    cit_year = cit.publication_date
                    cit_volume = cit.volume

                    if cit_title_preprocessed not in TITLES:
                        TITLES[cit_title_preprocessed] = 1
                    else:
                        TITLES[cit_title_preprocessed] += 1

                    # exact match
                    if cit_title_preprocessed in title2issnl:
                        res_issns = title2issnl.get(cit_title_preprocessed)
                        res_line = [col, cit.data.get('_id'), cit_title_preprocessed, res_issns, str(len(res_issns.split('#')))]
                        results.write('\t'.join(res_line) + '\n')

                        res_issns_els = res_issns.split('#')
                    numero = ''
            except UnavailableMetadataException as ume:
                logging.error('ERROR %s' % ume)

            issns = set()
            issns.add(article.journal.electronic_issn)
            issns.add(article.journal.print_issn)
            issns.add(article.journal.scielo_issn)
            issns = [
                i.strip().upper() for i in issns
                if i is not None and i.upper() not in ['', 'ISSN']
            ]

            titles = set()
            titles.add(
                StringProcessor.preprocess_journal_title(
                    article.journal.abbreviated_iso_title))
            titles.add(
                StringProcessor.preprocess_journal_title(
                    article.journal.abbreviated_title))
            titles.add(
                StringProcessor.preprocess_journal_title(
                    article.journal.title))
            titles = [t for t in titles if t is not None and t != '']

            for t in sorted(titles):
                for i in issns:
                    if year != '' and volume != '':
                        row = '|'.join([
                            i,
                            t.upper(),
                            year.strip(),
    def get_doc_attrs(document):
        """
        Return a list of the document's attributes.

        It is useful for creating/updating dictionaries of metadata2pid.

        :param document: a raw document dict (as stored in the database)
        :return: [pid, document_type, first_author_given_names,
                  first_author_surname, publication_date, journal_title,
                  journal_abbrev_title, journal_issn_ppub, journal_issn_epub,
                  issue_number, issue_order, issue_volume, start_page,
                  collection]
        """
        pid = document.get('_id')
        xydoc = Article(document)
        document_type = xydoc.document_type.lower()
        first_author = xydoc.first_author

        if first_author is None:
            first_author = {}

        if 'given_names' in first_author:
            first_author_given_names = StringProcessor.preprocess_name(
                first_author.get('given_names', '').lower())
        else:
            first_author_given_names = ''

        if 'surname' in first_author:
            first_author_surname = StringProcessor.preprocess_name(
                first_author.get('surname', '').lower())
        else:
            first_author_surname = ''

        publication_date = xydoc.document_publication_date
        journal_title = StringProcessor.preprocess_journal_title(
            xydoc.journal.title.lower())
        journal_abbrev_title = StringProcessor.preprocess_journal_title(
            xydoc.journal.abbreviated_title.lower())

        journal_issn_ppub = xydoc.journal.print_issn
        if journal_issn_ppub is None:
            journal_issn_ppub = ''

        journal_issn_epub = xydoc.journal.electronic_issn
        if journal_issn_epub is None:
            journal_issn_epub = ''

        try:
            issue_number = xydoc.issue.number
            issue_order = xydoc.issue.order
            issue_volume = xydoc.issue.volume
        except AttributeError:
            # xydoc.issue may be None or lack these fields.
            # BUG FIX: the previous bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit and hid unrelated errors.
            issue_number = ''
            issue_order = ''
            issue_volume = ''

        if issue_number is None:
            issue_number = ''

        if issue_order is None:
            issue_order = ''

        if issue_volume is None:
            issue_volume = ''

        start_page = xydoc.start_page
        if start_page is None:
            start_page = ''

        # release the parsed article before building the result
        del xydoc

        return [
            pid, document_type, first_author_given_names, first_author_surname,
            publication_date, journal_title, journal_abbrev_title,
            journal_issn_ppub, journal_issn_epub, issue_number, issue_order,
            issue_volume, start_page,
            document.get('collection')
        ]
def _pick_single_issn(issns, issn_type):
    """Return the single ISSN of `issn_type` from a crossref issn-type list ('' if absent)."""
    values = [i.get('value', '')
              for i in issns if i.get('type', '') == issn_type]
    if len(values) == 0:
        return ''
    if len(values) > 1:
        # BUG FIX: the previous code left the whole list behind here, which
        # made the later '|'.join raise TypeError, and the print branch
        # wrongly reported "online" issns.
        print('there are multiple %s issns %s' % (issn_type, str(values)))
    return values[0]


def _extract_issue_year(message, issue_key):
    """Return message['journal-issue'][issue_key]['date-parts'][0][0] as str ('' if absent)."""
    return str(message.get('journal-issue', {}).get(issue_key, {})
               .get('date-parts', [['', '']])[0][0])


def get_cited_forms_with_metadata(path_crossref, doi2cited_form: dict):
    """
    Combine crossref metadata with previously collected cited forms.

    :param path_crossref: path to a file with one crossref JSON response per line
    :param doi2cited_form: dict mapping a DOI to its list of cited forms
    :return: a set of 'issn|cited_form|year|volume|issue' strings
    """
    cited_forms_with_metadata = set()
    not_collected = 0
    # errors='replace' avoids a mid-file UnicodeDecodeError.
    # BUG FIX: the previous try/except around readline() never advanced
    # `line` when the exception fired, so a single bad byte caused an
    # infinite loop. `with` also guarantees the file is closed.
    with open(path_crossref, errors='replace') as file_crossref:
        for line in file_crossref:
            json_line = json.loads(line)
            doi = json_line.get('url_searched').replace(
                'https://api.crossref.org/works/', '')

            cited_forms = doi2cited_form.get(doi, [])
            if len(cited_forms) == 0:
                not_collected += 1
                continue

            message = json_line.get('message', {})
            if not isinstance(message, dict):
                continue

            volume = message.get('volume', '')
            # in some cases the volume value is composed of two numbers
            # separated by a hyphen; keep only the first (now applied to the
            # online branch too, for consistency)
            if '-' in volume:
                volume = volume.split('-')[0]

            issue = StringProcessor.preprocess_journal_title(
                message.get('issue', '')).upper()
            print_year = _extract_issue_year(message, 'published-print')
            online_year = _extract_issue_year(message, 'published-online')

            issns = message.get('issn-type', [{}])
            print_issn = _pick_single_issn(issns, 'print')
            online_issn = _pick_single_issn(issns, 'electronic')

            for cit in cited_forms:
                if print_issn != '' and cit != '' and print_year != '' and volume != '':
                    cited_forms_with_metadata.add('|'.join(
                        [print_issn, cit, print_year, volume, issue]))
                if online_issn != '' and cit != '' and online_year != '' and volume != '':
                    cited_forms_with_metadata.add('|'.join(
                        [online_issn, cit, online_year, volume, issue]))
    return cited_forms_with_metadata
def read_base(base_name: str, issn2issnl: dict, mode='create'):
    """
    Read the attributes of an index base.

    :param base_name: the name of the index base
    :param issn2issnl: a dict where each key is an ISSN and each value is its ISSN-L
    :param mode: execution mode: 'create' builds the base dict; 'count' and
        'count-char' additionally collect the original (raw) titles, and
        'count-char' returns them so char occurrences can be counted
    :return: a dict where each key is an ISSN-L and each value is
        [issns, main_titles, main_abbrev_titles, titles, countries, years]
        (or the list of original titles when mode == 'count-char')
    """
    dict_base = {}
    num_ignored_lines = 0

    base_cols = BASE2COLUMN_INDEXES.get(base_name)
    cols_issn = base_cols.get('issn')
    cols_title = base_cols.get('title')
    col_country = base_cols.get('country')
    base_sep = base_cols.get('sep')

    # BUG FIX: all_original_titles used to be created only for mode == 'count'
    # but returned for mode == 'count-char', which raised NameError. Collect
    # raw titles for both counting modes.
    count_mode = mode in ('count', 'count-char')
    all_original_titles = []

    print('reading base %s' % base_name)

    # `with` guarantees the file is closed (the previous handle leaked)
    with open(DEFAULT_DIR_INDEXES + base_name + '.csv') as base_data:
        # ignore the header line
        base_data.readline()

        for line in base_data:
            i = line.split(base_sep)

            issns = [i[j].strip().upper() for j in cols_issn
                     if i[j].strip() != '' and is_valid_issn(i[j].strip())]
            issns = list(set([x.replace('-', '') for x in issns if x != '****-****']))

            if not has_valid_issn(issns):
                num_ignored_lines += 1
                continue

            if len(issns) == 0:
                continue

            issnl = get_issnl_from_dict(issns, issn2issnl)
            if issnl is None:
                num_ignored_lines += 1
                continue

            # preprocessed titles, with and without parenthesis info
            titles = list(set([StringProcessor.preprocess_journal_title(
                i[j].strip(), remove_parenthesis_info=False) for j in cols_title]))
            titles.extend(list(set([StringProcessor.preprocess_journal_title(
                i[j].strip()) for j in cols_title])))
            titles = list(set([t.upper() for t in titles if is_valid_title(t)]))

            main_title = ''
            main_abbrev_title = ''

            if base_name == 'portal_issn':
                col_main_title = base_cols.get('main_title')
                col_main_title_alternative = base_cols.get('main_title_alternative')
                main_title = StringProcessor.preprocess_journal_title(
                    i[col_main_title].strip()).upper()
                if main_title == '':
                    main_title = StringProcessor.preprocess_journal_title(
                        i[col_main_title_alternative].strip()).upper()

                col_main_abbrev_title = base_cols.get('main_abbrev_title')
                main_abbrev_title = StringProcessor.preprocess_journal_title(
                    i[col_main_abbrev_title].strip()).upper()

            # BUG FIX: the column-derived countries used to be clobbered with
            # an empty set right before being stored, making their extraction
            # dead code; keep them when the country dict is not used.
            if DEFAULT_USE_COUNTRY_FROM_DICT:
                countries = issnl2country.get(issnl, set())
            elif col_country is not None:
                country_name = StringProcessor.preprocess_name(
                    i[col_country].strip().upper())
                countries = {country_name} if len(country_name) != 0 else set()
            else:
                countries = set()

            if count_mode:
                # counting modes work on the raw (unpreprocessed) titles
                titles = list(set([i[j].strip() for j in cols_title
                                   if is_valid_title(i[j].strip())]))
                all_original_titles.extend(titles)

            if issnl != '' and len(titles) > 0:
                years = issnl2years.get(issnl, set())

                if issnl not in dict_base:
                    dict_base[issnl] = [issns, [main_title], [main_abbrev_title],
                                        titles, countries, years]
                else:
                    entry = dict_base[issnl]
                    entry[0] = list(set(entry[0] + issns))

                    if main_title not in entry[1]:
                        entry[1].append(main_title)
                    if main_abbrev_title not in entry[2]:
                        entry[2].append(main_abbrev_title)

                    entry[3] = list(set(entry[3] + titles))
                    entry[4] = entry[4].union(countries)
                    entry[5] = entry[5].union(years)

    if mode == 'count-char':
        return all_original_titles

    print('\tlines ignored %d' % num_ignored_lines)
    return dict_base