# NOTE: these example scripts assume the imports below plus project helper
# functions (get_files, get_depts, remove_accent_mark) defined elsewhere in
# the repository. Paths are read from ../config.json, whose keys include
# 'people', 'depts', 'publons_info', 'publons_data' and 'publons_results'.
import csv
import json
import time
from os import makedirs
from os.path import exists, join

import requests
from bs4 import BeautifulSoup


def main():
    print('Iniciando...')

    with open('../config.json') as f:
        config = json.load(f)

    publons_data_path = config['publons_data']
    if not exists(publons_data_path):
        makedirs(publons_data_path)

    # Look for previously recovered data
    data_recovered_files = get_files(publons_data_path, 'json')
    data_recovered = []
    if data_recovered_files:
        for data_recovered_file in data_recovered_files:
            usp_id = data_recovered_file.split('_')[0]
            data_recovered.append(usp_id)
    print('Qtd. de dados recuperados previamente: ', len(data_recovered))

    # Write the file with recovered data
    data_analysed_file = join(publons_data_path, 'data_analysed.txt')
    with open(data_analysed_file, 'w', newline='') as f:
        f.write(','.join(set(data_recovered)))

    # Reset (empty) the missing-data file
    data_missing_file = join(publons_data_path, 'data_missing.txt')
    with open(data_missing_file, 'w', newline='') as f:
        f.write('')

    print('Qtd. de dados no novo arquivo: ', len(data_recovered))
def main():
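    # Codes of the units whose departments will be fetched (hard-coded list)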
    all_units = [
        30, 64, 86, 27, 39, 90, 7, 22, 88, 18, 3, 11, 16, 9, 60, 2, 89, 12, 81,
        48, 59, 8, 5, 17, 10, 23, 25, 58, 6, 74, 93, 14, 41, 42, 55, 4, 31, 43,
        76, 44, 45, 83, 47, 46, 75, 87, 21, 71, 32, 38, 33, 1
    ]

    all_units.sort()

    with open('../config.json') as f:
        config = json.load(f)

    if not exists(config['depts']):
        makedirs(config['depts'])

    files = get_files(config['depts'], 'json')
    saved_units = []

    if len(files) > 0:
        for file in files:
            unit = int(file.split('_').pop()[:-5])
            if unit not in saved_units:
                saved_units.append(unit)
        saved_units.sort()
    else:
        print('Nenhum departamento recuperado previamente.')

    print('Iniciando...')
    units = list(set(all_units) - set(saved_units))
    while len(units) > 0:
        units = get_depts(units, config['depts'])
    else:
        print('Dados dos departamentos recuperados.')
def main():
    print('Iniciando...')
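    # Delays (in seconds) rotated between requests, plus a shared HTTP session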
    wait_time = [3, 5, 7, 9]
    s = requests.Session()

    with open('../config.json') as f:
        config = json.load(f)

    if not exists(config['people']):
        print('Nenhum dado a ser recuperado.')
        return 0

    files = get_files(config['people'], 'csv')

    if len(files) == 0:
        print('Nenhum dado a ser recuperado.')
        return 0

    if not exists(config['publons_info']):
        makedirs(config['publons_info'])

    # Look for previously recovered departments
    depts_recovered_file = join(config['publons_info'], 'depts_recovered.txt')
    if exists(depts_recovered_file):
        with open(depts_recovered_file, 'r') as f:
            depts = f.read().split(',')
        print('Departamentos recuperados previamente: ', len(depts))
    else:
        depts = []

    publons_file = join(config['publons_info'], 'publons_info.csv')
    if not exists(publons_file):
        print('Novo arquivo criado.')
        with open(publons_file, 'w', newline='') as f:
            csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\')
            csv_writer.writerow([
                'usp_id', 'usp_name', 'usp_unit', 'usp_dept', 'publons_id',
                'publons_name'
            ])

    i = 0
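    # For each department CSV, resolve each researcher's Publons id and name
    # from the og:url meta tag of their Web of Science profile page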
    for file in files:
        file_name = file.split('_')
        unit = file_name[0]
        dept = file_name[1][:-4]
        unit_dept = unit + '_' + dept
        researchers_info = []

        if unit_dept in depts:
            print('Departamento recuperado previamente: ',
                  unit_dept,
                  flush=True)
            continue
        else:
            print('Recuperando dados do departamento: ', unit_dept, flush=True)

        with open(join(config['people'], file),
                  'r',
                  newline='',
                  encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader)
            for row in csv_reader:
                # Element 5 of the row contains the Web of Science URL,
                # which is used to obtain the id for the Publons API
                url_wos = row[5]
                # Get the id used to rebuild the URL
                usp_id = row[7]
                usp_name = remove_accent_mark(
                    (row[9].lower().replace(' ', '-')))
                if url_wos != '':
                    if i == 4:
                        i = 0
                    time.sleep(wait_time[i])
                    i = i + 1
                    try:
                        content = s.get(url_wos).text
                    except requests.exceptions.ConnectionError:
                        time.sleep(60)
                        content = s.get(url_wos).text
                    page = BeautifulSoup(content, 'lxml')

                    og_url = page.find('meta', attrs={'property': 'og:url'})
                    if og_url is not None:
                        publons_id = og_url['content'].split('/')[4]
                        publons_name = og_url['content'].split('/')[5]

                        researchers_info.append({
                            'usp_id': usp_id,
                            'usp_name': usp_name,
                            'usp_unit': unit,
                            'usp_dept': dept,
                            'publons_id': publons_id,
                            'publons_name': publons_name
                        })
                    else:
                        researchers_info.append({
                            'usp_id': usp_id,
                            'usp_name': usp_name,
                            'usp_unit': unit,
                            'usp_dept': dept,
                            'publons_id': 'missing_id',
                            'publons_name': ''
                        })
                else:
                    researchers_info.append({
                        'usp_id': usp_id,
                        'usp_name': usp_name,
                        'usp_unit': unit,
                        'usp_dept': dept,
                        'publons_id': 'missing_id',
                        'publons_name': ''
                    })

        if researchers_info:
            with open(publons_file, 'a', newline='') as f:
                csv_writer = csv.writer(f,
                                        quoting=csv.QUOTE_NONE,
                                        escapechar='\\')
                for researcher_info in researchers_info:
                    csv_writer.writerow([
                        researcher_info['usp_id'], researcher_info['usp_name'],
                        researcher_info['usp_unit'],
                        researcher_info['usp_dept'],
                        researcher_info['publons_id'],
                        researcher_info['publons_name']
                    ])

        print('Dados recuperados do departamento: ', unit_dept, flush=True)
        depts.append(unit_dept)

        with open(depts_recovered_file, 'w', newline='') as f:
            print('Escrevendo arquivo com departamento recuperado...')
            f.write(','.join(depts))

    print('Departamentos recuperados: ', len(depts))
    print('Fim')
def main():
    print('Iniciando...')
    wait_time = [3, 5, 7, 9]
    i = 0
    s = requests.Session()

    with open('../config.json') as f:
        config = json.load(f)

    publons_info_path = config['publons_info']
    if not exists(publons_info_path):
        print('Nenhum dado a ser recuperado.')
        return 0

    publons_file = join(publons_info_path, 'publons_info_unique_filtered.csv')
    if not exists(publons_file):
        print('Nenhum dado a ser recuperado.')
        return 0

    publons_data_path = config['publons_data']
    if not exists(publons_data_path):
        makedirs(publons_data_path)

    # Look for previously analysed data
    data_analysed_file = join(publons_data_path, 'data_analysed.txt')
    if exists(data_analysed_file):
        with open(data_analysed_file, 'r') as f:
            data_analysed = f.read().split(',')
    else:
        data_analysed = []
    print('Qtd. de dados analisados previamente: ', len(data_analysed))

    # Look for previously recovered data
    data_recovered_files = get_files(publons_data_path, 'json')
    data_recovered = []
    if data_recovered_files:
        for data_recovered_file in data_recovered_files:
            usp_id = data_recovered_file.split('_')[0]
            data_recovered.append(usp_id)
    print('Qtd. de dados recuperados previamente: ', len(data_recovered))

    # Look for previously analysed data that had problems
    data_missing_file = join(publons_data_path, 'data_missing.txt')
    if exists(data_missing_file):
        with open(data_missing_file, 'r') as f:
            data_missing = f.read().split(',')
        if data_missing == ['']:
            data_missing = []
    else:
        data_missing = []
    print('Qtd. de dados com problema previamente: ', len(data_missing))

    count = 0
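    # Request each researcher's individual stats from the Publons API,
    # skipping ids already analysed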
    with open(publons_file, 'r', newline='') as f:
        csv_reader = csv.reader(f, delimiter=',')
        next(csv_reader)
        # At most 2000 requests are made per run; rerun the script after 24 hours
        for row in csv_reader:
            if count < 2000:
                usp_id = row[0]
                publons_id = row[4]

                if usp_id in data_analysed:
                    continue
                else:
                    print('Recuperando dados de: ',
                          usp_id + '_' + publons_id,
                          flush=True)

                if i == 4:
                    i = 0
                time.sleep(wait_time[i])
                i = i + 1

                url = 'https://publons.com/researcher/api/' + publons_id + '/metrics/individualStats/'
                try:
                    response = s.get(url)
                except requests.exceptions.ConnectionError:
                    time.sleep(60)
                    response = s.get(url)

                count = count + 1
                if response:
                    r = response.json()
                    if 'ready' in r and len(r.keys()) == 1:
                        print('Sem dados.')
                        if usp_id not in data_missing:
                            data_missing.append(usp_id)
                        with open(data_missing_file, 'w', newline='') as f:
                            print('Escrevendo arquivo sem resposta...')
                            f.write(','.join(data_missing))
                    else:
                        with open(join(publons_data_path,
                                       usp_id + '_' + publons_id + '.json'),
                                  'w',
                                  encoding='utf-8') as f:
                            json.dump(response.json(),
                                      f,
                                      ensure_ascii=False,
                                      indent=4)
                        print('Dado recuperado com sucesso.')
                else:
                    print('Sem dados.')
                    if usp_id not in data_missing:
                        data_missing.append(usp_id)
                    with open(data_missing_file, 'w', newline='') as f:
                        print('Escrevendo arquivo sem resposta...')
                        f.write(','.join(data_missing))

                if usp_id not in data_analysed:
                    data_analysed.append(usp_id)
                # Write the file with the analysed ids
                with open(data_analysed_file, 'w', newline='') as f:
                    f.write(','.join(data_analysed))
            else:
                print(
                    'Limite atingido. Por favor, execute esse script novamente em 24 horas.'
                )
                break

    print('------')
    print('Requisitados: ', count)
    print('Qtd. de dados analisados: ', len(data_analysed))
    print('Qtd. de dados recuperados: ',
          len(data_analysed) - len(data_missing))
    print('Sem dados: ', len(data_missing))
    print('Fim')
def main():
    print('Iniciando...')

    with open('../config.json') as f:
        config = json.load(f)

    # Check whether the data is available
    publons_data_path = config['publons_data']
    if not exists(publons_data_path):
        print('Nenhum dado a ser processado.')
        return 0

    files = get_files(publons_data_path, 'json')
    if len(files) == 0:
        print('Nenhum dado a ser processado.')
        return 0

    # Check whether the results path has been created
    publons_results_path = config['publons_results']
    if not exists(publons_results_path):
        makedirs(publons_results_path)

    # Check previously recovered data
    publons_result_file = join(publons_results_path, 'results_publons.csv')
    heading = [
        'usp_id', 'wos_publications', 'citations', 'citations_per_item',
        'citations_per_year', 'h_index'
    ]
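    # Rows already written to the results file, kept as comma-joined strings
    # for de-duplication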
    data_analysed = set()
    if not exists(publons_result_file):
        with open(publons_result_file, 'w', newline='') as f:
            csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE, escapechar='\\')
            csv_writer.writerow(heading)
    else:
        with open(publons_result_file, 'r', newline='') as f:
            csv_reader = csv.reader(f, delimiter=',')
            next(csv_reader)
            for row in csv_reader:
                data_analysed.add(','.join(row))
        print('Qtd. de dados recuperados previamente: ', len(data_analysed))

    print('Processando dados')
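    # Append one result row per researcher JSON file, skipping rows already
    # present in the results CSV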
    for file in files:
        publons_file = join(publons_data_path, file)
        usp_id = file.split('_')[0]

        # Read the Publons file
        with open(publons_file, 'r', encoding='utf-8') as f:
            head = f.read(2)
            if head != '[]' and head != '':
                f.seek(0)
                data = json.load(f)
            else:
                print('Sem dados de usp_id: ', usp_id, flush=True)
                continue

            # Fill in empty strings for any metrics missing from the response
            for key in ('numPublicationsInWos', 'timesCited', 'hIndex',
                        'averagePerItem', 'averagePerYear'):
                data.setdefault(key, '')

            information = ','.join([
                usp_id,
                str(data['numPublicationsInWos']),
                str(data['timesCited']),
                str(data['averagePerItem']),
                str(data['averagePerYear']),
                str(data['hIndex'])
            ])
            if information not in data_analysed:
                with open(publons_result_file, 'a', newline='') as f:
                    csv_writer = csv.writer(f,
                                            quoting=csv.QUOTE_NONE,
                                            escapechar='\\')
                    csv_writer.writerow([
                        usp_id, data['numPublicationsInWos'],
                        data['timesCited'], data['averagePerItem'],
                        data['averagePerYear'], data['hIndex']
                    ])

    print('Nro de ids processados: ', len(files))