Example #1
import logging
from datetime import datetime
from os import close, getpid
from tempfile import mkstemp
from time import sleep

from pymongo import MongoClient

# PID_FN, FILTER, WAIT_SEC, MAX_ITERATION_NUM, PLAY_AUDIO_COMMAND and the
# myexec() helper are assumed to be defined elsewhere in the enclosing module.


def player():
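    """Queue worker: write a PID file, then loop forever playing the oldest unplayed track."""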
    logging.basicConfig(level=logging.INFO)
    with open(PID_FN, "w") as f:
        pid = getpid()
        f.write(str(pid))
    coll = MongoClient()["for-music-player"].queue

    i = 0
    while True:
        i += 1
        if not coll.count_documents(filter=FILTER):
            print(f"> queue empty. wait {WAIT_SEC} sec...")
            # exit()
            sleep(WAIT_SEC)
        elif MAX_ITERATION_NUM >= 0 and i >= MAX_ITERATION_NUM:
            exit()
        else:
            obj = coll.find_one(filter=FILTER, sort=[("date", 1)])
            f, fn = mkstemp(suffix=".mp3")
            coll.update_one({"_id": obj["_id"]},
                            {"$set": {
                                "start": datetime.now()
                            }})
            myexec(f"wget \"{obj['path']}\" -O \"{fn}\"")
            myexec(f"{PLAY_AUDIO_COMMAND} \"{fn}\"")
            close(f)
            coll.update_one({"_id": obj["_id"]},
                            {"$set": {
                                "played": True,
                                "end": datetime.now()
                            }})
Example #2
def main():
    tags = ['gabby', 'dataset1', 'dataset2', 'matthew', 'food science']

    collection = MongoClient(DATABASE_URL).abstracts.all

    for tag in tags:
        count = collection.count_documents({'tags': tag})
        print(f'{tag}: {count}')
Example #3
 def subscribe(self, msg, args):
     """
     Subscribe to repository notifications.
     It takes only one mandatory argument: the repository URL (must be public HTTP or HTTPS).
     """
     # Validate the arguments: only the first one is used, and it must be a URL in a valid format
     url = self.validURL(args)
     if url:
         collection = MongoClient(host='mongodb').chat0ps.subscriptions
         # First check: if repository exists
         repository = {"repository": url}
         if collection.count_documents(repository) >= 1:
             # Second check: if user is subscribed to this repository
             subscription = {
                 "repository": url,
                 "subscribers": msg.frm.person
             }
             subscriptions = collection.count_documents(subscription)
             if subscriptions >= 1:
                 # No need to update collection.
                 yield "Repository already subscribed."
             else:
                 # Add user to subscribers list
                 collection.update_one(
                     repository, {"$push": {
                         "subscribers": msg.frm.person
                     }})
                 yield "Done. You may now set repository webhook to: http://35.198.17.35/publish"
         else:
             # If the repository doesn't exist, it's time to create it
             # and subscribe the user to it in a single command.
             # Note a small difference here: "subscribers" must be a list.
             subscription = {
                 "repository": url,
                 "subscribers": [msg.frm.person]
             }
             collection.insert_one(subscription)
             yield "Done."
     else:
         yield "Please inform a valid URL."
Example #4
def update_service_list(services_list):
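    # Upsert each service into kerberus_services, then mirror the updated list to kerberus.json.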
    kerberus_services = MongoClient()['iacon']['kerberus_services']
    for service in services_list:
        if 'mongo' in service and kerberus_services.count_documents(
            {"_id": ObjectId(service['mongo'])}, limit=1) > 0:
            kerberus_services.update_one({"_id": ObjectId(service['mongo'])},
                                         {"$set": service})
        else:
            kerberus_services.insert_one(service)
            service['mongo'] = str(service['_id'])
            del service['_id']
    with codecs.open("kerberus.json", "w+", "utf8") as file:
        file.write(json.dumps(services_list, indent=4, ensure_ascii=False))
Example #5
 def subscriptions(self, msg, args):
     """
     List all repository subscriptions.
     """
     collection = MongoClient(host='mongodb').chat0ps.subscriptions
     document = {"subscribers": msg.frm.person}
     subscriptions = collection.count_documents(document)
     # Check if there is at least one subscription
     if subscriptions >= 1:
         # Yes, there is. Time to list them all.
         for subscription in collection.find(document):
             yield subscription["repository"]
     else:
         yield "Sorry, no subscribed repository."
Example #6
class CardManager(metaclass=Singleton):
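    """Singleton flashcard store backed by MongoDB: one document per question/answer card."""
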
    def __init__(self):
        username = getenv('MONGO_USERNAME')
        password = getenv('MONGO_PASSWORD')
        self.cards = MongoClient(
            f'mongodb://{username}:{password}@mongo:27017/').db.cards

    def add(self, chat_id: int, question, answer) -> None:
        card = {
            'chat_id': chat_id,
            'question': {
                'from_chat_id': question['chat']['id'],
                'message_id': question['message_id']
            },
            'answer': {
                'from_chat_id': answer['chat']['id'],
                'message_id': answer['message_id']
            },
            'level': 0,
            'deadline': datetime.now().timestamp()
        }
        self.cards.insert_one(card)

    def count(self):
        return self.cards.count_documents({})

    def update_level(self, card, value):
        card['level'] = max(
            0, min(card['level'] + value,
                   len(LEVEL_DEADLINES) - 1))
        card['deadline'] = (datetime.now() +
                            LEVEL_DEADLINES[card['level']]).timestamp()
        self.cards.update_one({'_id': card['_id']}, {'$set': card},
                              upsert=False)

    def current_top_for(self, chat_id: int) -> Dict:
        # find_one with a sort avoids indexing into a Cursor (removed in PyMongo 4)
        return self.cards.find_one({'chat_id': chat_id},
                                   sort=[('deadline', 1)])
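
A minimal usage sketch for the class above; question_msg and answer_msg are hypothetical Telegram-style message dicts carrying 'chat' and 'message_id' keys, the shape that add() reads:

manager = CardManager()
manager.add(chat_id=42, question=question_msg, answer=answer_msg)
print(manager.count())  # number of stored cards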
Example #7
 def unsubscribe(self, msg, args):
     """
     Unsubscribe from repository notifications.
     It takes only one mandatory argument: the repository URL.
     """
     url = self.validURL(args)
     if url:
         collection = MongoClient(host='mongodb').chat0ps.subscriptions
         # Check: if subscription exists
         repository = {"repository": url}
         document = {"repository": url, "subscribers": msg.frm.person}
         subscriptions = collection.count_documents(document)
         if subscriptions >= 1:
             collection.update_one(repository,
                                   {"$pull": {
                                       "subscribers": msg.frm.person
                                   }})
             yield "Done"
         else:
             yield "Sorry, you're not subscribedto this repository."
     else:
         yield "Please inform a valid URL."
Example #8
class CrossrefAsyncCollector(object):
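    """Asynchronously collect Crossref metadata for cited references and persist it to MongoDB or JSON."""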

    logging.basicConfig(level=logging.INFO)

    def __init__(self, email: str = None, mongo_uri_std_cits=None):
        self.email = email

        if mongo_uri_std_cits:
            try:
                self.persist_mode = 'mongo'
                mongo_col = uri_parser.parse_uri(mongo_uri_std_cits).get(
                    'collection')
                if not mongo_col:
                    mongo_col = MONGO_STDCITS_COLLECTION
                self.standardizer = MongoClient(
                    mongo_uri_std_cits).get_database().get_collection(
                        mongo_col)

                total_docs = self.standardizer.count_documents({})
                logging.info(
                    'There are {0} documents in the collection {1}'.format(
                        total_docs, mongo_col))
            except ConnectionError as e:
                logging.error('ConnectionError %s' % mongo_uri_std_cits)
                logging.error(e)

        else:
            self.persist_mode = 'json'
            file_name_results = 'crossref-results-' + str(
                time.time()) + '.json'
            self.path_results = os.path.join(DIR_DATA, file_name_results)

    def extract_attrs(self, article: Article):
        """
        Extrai os atributos de todas as referências citadas de um documento.

        :param article: documento do qual serão extraídos os atributos das referências citadas
        :return: dicionário de ids de citações e respectivos atributos
        """
        cit_id_to_attrs = {}

        if article.citations:
            for cit in article.citations:
                if cit.publication_type == 'article':
                    cit_id = self.mount_id(cit, article.collection_acronym)
                    cit_attrs = {}

                    if self.persist_mode == 'json':
                        cit_attrs = self._extract_cit_attrs(cit)
                    elif self.persist_mode == 'mongo':
                        cit_data = self.standardizer.find_one({'_id': cit_id})
                        if not cit_data or not cit_data.get('crossref'):
                            cit_attrs = self._extract_cit_attrs(cit)

                    if cit_attrs:
                        cit_id_to_attrs[cit_id] = cit_attrs

        return cit_id_to_attrs

    def _extract_cit_attrs(self, cit: Citation):
        """
        Extrai os atributos de uma referência citada necessários para requisitar metadados CrossRef.

        :param cit: referência citada
        :return: dicionário de atributos para consulta no serviço CrossRef
        """
        if cit.doi:
            valid_doi = preprocess_doi(cit.doi)
            if valid_doi:
                return {'doi': valid_doi}

        attrs = {}

        if cit.first_author:
            first_author_surname = cit.first_author.get('surname', '')
            cleaned_author_surname = preprocess_author_name(
                first_author_surname)
            if cleaned_author_surname:
                attrs.update({'aulast': cleaned_author_surname})

        journal_title = cit.source
        if journal_title:
            cleaned_journal_title = preprocess_journal_title(journal_title)
            if cleaned_journal_title:
                attrs.update({'title': cleaned_journal_title})

        publication_date = html.unescape(
            cit.publication_date) if cit.publication_date else None
        if publication_date and len(publication_date) >= 4:
            publication_year = publication_date[:4]
            if publication_year.isdigit():
                attrs.update({'data': publication_year})

        volume = html.unescape(cit.volume) if cit.volume else None
        if volume:
            attrs.update({'volume': volume})

        issue = html.unescape(cit.issue) if cit.issue else None
        if issue:
            attrs.update({'issue': issue})

        first_page = html.unescape(cit.first_page) if cit.first_page else None
        if first_page:
            attrs.update({'spage': first_page})

        if attrs:
            return attrs

    def parse_crossref_openurl_result(self, text):
        """
        Converte response.text para JSON com metadados obtidos do endpoint OPENURL.

        :param response: resposta de requisição em formato de texto
        :return: JSON com metadados obtidos do serviço CrossRef
        """
        try:
            raw = xmltodict.parse(text)

            for v in raw.get('doi_records', {}).values():
                metadata = v.get('crossref')
                if metadata and 'error' not in metadata.keys():

                    owner = v.get('@owner')
                    if owner:
                        metadata.update({'owner': owner})

                    timestamp = v.get('@timestamp')
                    if timestamp:
                        metadata.update({'timestamp': timestamp})

                    journal_article = metadata.get('journal', {}).get(
                        'journal_article', {})

                    if 'citation_list' in journal_article:
                        del journal_article['citation_list']

                    return metadata

        except ExpatError as e:
            logging.warning("ExpatError {0}".format(text))
            logging.warning(e)

    def parse_crossref_works_result(self, raw_metadata):
        """
        Limpa dicionário de metadados obtidos do endpoint WORKS.
        Remove campo de referências

        :param raw_metadata: resposta de requisição em formato de dicionário
        :return: JSON com metadados obtidos do serviço Crossref
        """
        raw_status = raw_metadata.get('status', '')
        if raw_status == 'ok':
            metadata = raw_metadata.get('message')
            if metadata:
                if 'reference' in metadata:
                    del metadata['reference']
                return metadata

    def mount_id(self, cit: Citation, collection: str):
        """
        Monta o identificador de uma referência citada.

        :param cit: referência citada
        :param collection: coleção em que a referência foi citada
        :return: código identificador da citação
        """
        cit_id = cit.data['v880'][0]['_']
        return '{0}-{1}'.format(cit_id, collection)

    def save_crossref_metadata(self, id_to_metadata: dict):
        """
        Persiste os metadados da referência citada.

        :param id_to_metadata: dicionário com id da referência citada e seus respectivos metadados Crossref
        """
        if self.persist_mode == 'json':
            with open(self.path_results, 'a') as f:
                json.dump(id_to_metadata, f)
                f.write('\n')

        elif self.persist_mode == 'mongo':
            self.standardizer.update_one(
                filter={'_id': id_to_metadata['_id']},
                update={
                    '$set': {
                        'crossref': id_to_metadata['crossref'],
                        'update-date': datetime.now().strftime('%Y-%m-%d')
                    }
                },
                upsert=True)

    async def run(self, citations_attrs: dict):
        sem = asyncio.Semaphore(CROSSREF_SEMAPHORE_LIMIT)
        tasks = []

        async with ClientSession(headers={'mailto:': self.email}) as session:
            for cit_id, attrs in citations_attrs.items():
                if 'doi' in attrs:
                    url = CROSSREF_URL_WORKS.format(attrs['doi'])
                    mode = 'doi'

                else:
                    url = CROSSREF_URL_OPENURL
                    for k, v in attrs.items():
                        if k != 'doi':
                            url += '&' + k + '=' + v
                    url += '&pid=' + self.email
                    url += '&format=unixref'
                    url += '&multihit=false'
                    mode = 'attrs'

                task = asyncio.ensure_future(
                    self.bound_fetch(cit_id, url, sem, session, mode))
                tasks.append(task)
            responses = asyncio.gather(*tasks)
            await responses

    async def bound_fetch(self, cit_id, url, semaphore, session, mode):
        async with semaphore:
            await self.fetch(cit_id, url, session, mode)

    async def fetch(self, cit_id, url, session, mode):
        metadata = None  # avoid UnboundLocalError when the response yields no metadata
        try:
            async with session.get(url) as response:
                try:
                    logging.info('Collecting metadata for %s' % cit_id)

                    if mode == 'doi':
                        raw_metadata = await response.json(content_type=None)
                        if raw_metadata:
                            metadata = self.parse_crossref_works_result(
                                raw_metadata)

                    else:
                        raw_metadata = await response.text()
                        if raw_metadata:
                            metadata = self.parse_crossref_openurl_result(
                                raw_metadata)

                    if metadata:
                        id_to_metadata = {'_id': cit_id, 'crossref': metadata}
                        self.save_crossref_metadata(id_to_metadata)
                except JSONDecodeError as e:
                    logging.warning('JSONDecodeError: %s' % cit_id)
                    logging.warning(e)
                except TimeoutError as e:
                    logging.warning('TimeoutError [INNER]: %s' % cit_id)
                    logging.warning(e)
        except ContentTypeError as e:
            logging.warning('ContentTypeError: %s' % cit_id)
            logging.warning(e)
        except ServerDisconnectedError as e:
            logging.warning('ServerDisconnectedError: %s' % cit_id)
            logging.warning(e)
        except TimeoutError as e:
            logging.warning('TimeoutError [OUTER]: %s' % cit_id)
            logging.warning(e)
        except ClientConnectorError as e:
            logging.warning('ClientConnectorError: %s' % cit_id)
            logging.warning(e)
Example #9
class Standardizer:
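    """Standardize cited journal titles to ISSN-L records via exact and fuzzy title matching."""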

    logging.basicConfig(level=logging.INFO)

    def __init__(self,
                 path_db,
                 use_exact=False,
                 use_fuzzy=False,
                 mongo_uri_std_cits=None):

        self.use_exact = use_exact
        self.use_fuzzy = use_fuzzy

        if mongo_uri_std_cits:
            try:
                self.persist_mode = 'mongo'
                mongo_col = uri_parser.parse_uri(mongo_uri_std_cits).get(
                    'collection')
                if not mongo_col:
                    mongo_col = MONGO_STDCITS_COLLECTION
                self.standardizer = MongoClient(
                    mongo_uri_std_cits).get_database().get_collection(
                        mongo_col)

                total_docs = self.standardizer.count_documents({})
                logging.info(
                    'There are {0} documents in the collection {1}'.format(
                        total_docs, mongo_col))
            except ConnectionError as e:
                logging.error('ConnectionError %s' % mongo_uri_std_cits)
                logging.error(e)

        else:
            self.persist_mode = 'json'
            file_name_results = 'std-results-' + str(time.time()) + '.json'
            self.path_results = os.path.join(DIR_DATA, file_name_results)

        if path_db:
            logging.info('Loading %s' % path_db)
            self.db = self.load_database(path_db)

    def add_hifen_issn(self, issn: str):
        """
        Insere hífen no ISSN.

        :param issn: ISSN sem hífen
        :return: ISSN com hífen
        """
        if issn:
            return issn[:4] + '-' + issn[4:]

    def load_database(self, path_db: str):
        """
        Carrega na memória o arquivo binário das bases de correção e validação.

        :param path_db: caminho do arquivo binário
        :return: base carregada em formato de dicionário
        """
        try:
            with open(path_db, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            logging.error('File {0} does not exist'.format(path_db))

    def extract_issnl_from_valid_match(self, valid_match: str):
        """
        Extrai ISSN-L a partir de uma chave ISSN-ANO-VOLUME.
        Caso o ISSN não exista no dicionário issn-to-issnl, considera o próprio ISSN como ISSN-L.

        :param valid_match: chave validada no formato ISSN-ANO-VOLUME
        :return: ISSN-L
        """
        issn, year, volume = valid_match.split('-')

        issnl = self.db['issn-to-issnl'].get(issn, '')

        if not issnl:
            issnl = issn

        return issnl

    def extract_issn_year_volume_keys(self, cit: Citation, issns: set):
        """
        Extrai chaves ISSN-YEAR-VOLUME para uma referência citada e lista de ISSNs.

        :param cit: referência citada
        :param issns: set de possíveis ISSNs
        :return: set de chaves ISSN-ANO-VOLUME
        """
        keys = set()

        cit_year = cit.publication_date

        if cit_year:
            if len(cit_year) > 4:
                cit_year = cit_year[:4]

            if len(cit_year) == 4 and cit_year.isdigit():
                cit_vol = cit.volume

                if cit_vol and cit_vol.isdigit():
                    for i in issns:
                        keys.add('-'.join([i, cit_year, cit_vol]))
                    return keys, VOLUME_IS_ORIGINAL
                else:
                    for i in issns:
                        cit_vol_inferred = self.infer_volume(i, cit_year)
                        if cit_vol_inferred:
                            keys.add('-'.join([i, cit_year, cit_vol_inferred]))
                    return keys, VOLUME_IS_INFERRED

        return keys, VOLUME_NOT_USED

    def get_issns(self, matched_issnls: set):
        """
        Obtém todos os ISSNs associados a um set de ISSN-Ls.

        :param matched_issnls: ISSN-Ls casados para uma dada referência citada
        :return: set de ISSNs vinculados aos ISSNL-s
        """
        possible_issns = set()

        for mi in matched_issnls:
            possible_issns.update(
                self.db['issnl-to-data'].get(mi, {}).get('issns', []))

        return possible_issns

    def get_status(self, match_mode: str, mount_mode: int, db_used: str):
        """
        Obtém o status com base no modo de casamento, de volume utilizado e de base de validação utilizada.

        :param match_mode: modo de casamento ['exact', 'fuzzy']
        :param mount_mode: modo de obtenção da chave de validação ['VOLUME_IS_ORIGINAL', VOLUME_IS_INFERRED']
        :param db_used: base de validação utilizada ['lr', 'lr-ml1', 'default']
        :return: código de status conforme método utilizado
        """
        if mount_mode == VOLUME_IS_ORIGINAL:
            if match_mode == 'exact':
                if db_used == 'lr':
                    return STATUS_EXACT_VALIDATED_LR
                elif db_used == 'lr-ml1':
                    return STATUS_EXACT_VALIDATED_LR_ML1
                elif db_used == 'default':
                    return STATUS_EXACT_VALIDATED
            else:
                if db_used == 'lr':
                    return STATUS_FUZZY_VALIDATED_LR
                elif db_used == 'lr-ml1':
                    return STATUS_FUZZY_VALIDATED_LR_ML1
                elif db_used == 'default':
                    return STATUS_FUZZY_VALIDATED
        elif mount_mode == VOLUME_IS_INFERRED:
            if match_mode == 'exact':
                if db_used == 'lr':
                    return STATUS_EXACT_VOLUME_INFERRED_VALIDATED_LR
                elif db_used == 'lr-ml1':
                    return STATUS_EXACT_VOLUME_INFERRED_VALIDATED_LR_ML1
                elif db_used == 'default':
                    return STATUS_EXACT_VOLUME_INFERRED_VALIDATED
            else:
                if db_used == 'lr':
                    return STATUS_FUZZY_VOLUME_INFERRED_VALIDATED_LR
                elif db_used == 'lr-ml1':
                    return STATUS_FUZZY_VOLUME_INFERRED_VALIDATED_LR_ML1
                elif db_used == 'default':
                    return STATUS_FUZZY_VOLUME_INFERRED_VALIDATED

    def infer_volume(self, issn: str, year: str):
        """
        Infere o volume de um periódico a partir de issn-to-equation.

        :param issn: issn para o qual o volume será inferido
        :return: str do volume inferido arredondado para valor inteiro (se volume inferido for maior que 0)
        """
        equation = self.db['issn-to-equation'].get(issn)

        if equation:
            a, b, r2 = equation
            volume = a + (b * int(year))

            if volume > 0:
                return str(round(volume))

    def match_exact(self, journal_title: str):
        """
        Procura journal_title de forma exata no dicionário title-to-issnl.

        :param journal_title: título do periódico citado
        :return: set de ISSN-Ls associados de modo exato ao título do periódico citado
        """
        return self.db['title-to-issnl'].get(journal_title, set())

    def match_fuzzy(self, journal_title: str):
        """
        Procura journal_title de forma aproximada no dicionário title-to-issnl.

        :param journal_title: título do periódico citado
        :return: set de ISSN-Ls associados de modo aproximado ao título do periódico citado
        """
        matches = set()

        words = journal_title.split(' ')

        # The comparison is only attempted when the title has more than MIN_CHARS_LENGTH
        # characters and at least MIN_WORDS_COUNT words.
        if len(journal_title) > MIN_CHARS_LENGTH and len(
                words) >= MIN_WORDS_COUNT:
            # Every searched word must appear, in order, within the official title
            pattern = r'[\w\s]*'.join(words) + r'[\w\s]*'
            title_pattern = re.compile(pattern, re.UNICODE)

            # The official title must start with the first word of the searched title
            for official_title in [
                    ot for ot in self.db['title-to-issnl'].keys()
                    if ot.startswith(words[0])
            ]:
                if title_pattern.fullmatch(official_title):
                    matches = matches.union(
                        self.db['title-to-issnl'][official_title])
        return matches

    def mount_id(self, cit: Citation, collection: str):
        """
        Monta o identificador de uma referência citada.

        :param cit: referência citada
        :param collection: coleção em que a referência foi citada
        :return: código identificador da citação
        """
        cit_id = cit.data['v880'][0]['_']
        return '{0}-{1}'.format(cit_id, collection)

    def mount_standardized_citation_data(self,
                                         status: int,
                                         key=None,
                                         issn_l=None):
        """
        Consulta issn_l (oriundo de key ou de issn_l) no dicionário issnl-to-data para formar a estrutura normalizada da
        referencia citada. Monta estrutura normalizada da referencia citada, conforme os campos a seguir:

            cit-id: identificador da referência citada (str)

            issn-l: ISSN-Link do periódico citado (str)

            issns: ISSNs associados ao ISSN-L (list de strs)

            official-journal-title: títulos oficiais do periódico citado (str)

            official-abbreviated-journal-title: títulos abreviados oficiais do periódico citado (lista de str)

            alternative-journal-title: títulos alternativos do periódico citado (lista de str)

            status: código indicador do méetodo para normalizar

            update-date: data de normalização

        :param cit: referência citada
        :param status: código indicador do método aplicado para normalizar
        :param key: chave da qual o issn-l é extraído e buscado na base de correção
        :param issn_l: issn-l a ser buscado na base de correção
        :return: dicionário composto por pares chave-valor de dados normalizados
        """
        if not issn_l:
            issn_l = self.extract_issnl_from_valid_match(key)

        attrs = self.db['issnl-to-data'][issn_l]

        data = {
            'issn-l': self.add_hifen_issn(issn_l),
            'issn': [self.add_hifen_issn(i) for i in attrs['issns']],
            'official-journal-title': attrs['main-title'],
            'official-abbreviated-journal-title': attrs['main-abbrev-title'],
            'alternative-journal-titles': attrs['alternative-titles'],
            'status': status,
            'update-date': datetime.now().strftime('%Y-%m-%d')
        }

        return data

    def save_standardized_citations(self, std_citations: dict):
        """
        Persiste as referências citadas normalizadas.

        :param std_citations: dicionário de referências citadas normalizadas
        """
        if self.persist_mode == 'json':
            with open(self.path_results, 'a') as f:
                json.dump(std_citations, f)
                f.write('\n')

        elif self.persist_mode == 'mongo':
            for v in std_citations.values():
                self.standardizer.update_one(filter={'_id': v['_id']},
                                             update={'$set': v},
                                             upsert=True)

    def get_citation_mongo_status(self, cit_id: str):
        """
        Obtém o status atual de normalização da referência citada.

        :param cit_id: id da referência citada
        :return: status atual de normalização da referência citada
        """
        if self.persist_mode == 'mongo':
            cit_standardized = self.standardizer.find_one({'_id': cit_id})
            if cit_standardized:
                return cit_standardized.get('status', STATUS_NOT_NORMALIZED)
        return STATUS_NOT_NORMALIZED

    def validate_match(self, keys, use_lr=False, use_lr_ml1=False):
        """
        Valida chaves ISSN-ANO-VOLUME nas bases de validação
        :param keys: chaves em formato ISSN-ANO-VOLUME
        :param use_lr: valida com dados de regressão linear de ISSN-ANO-VOLUME
        :param use_lr_ml1: valida com dados de regressão linear de ISSN-ANO-VOLUME mais ou menos 1
        :return: chaves validadas
        """
        valid_matches = set()

        if use_lr:
            validating_base = self.db['issn-year-volume-lr']
        elif use_lr_ml1:
            validating_base = self.db['issn-year-volume-lr-ml1']
        else:
            validating_base = self.db['issn-year-volume']

        for k in keys:
            if k in validating_base:
                valid_matches.add(k)

        return valid_matches

    def _standardize(self, cit, cleaned_cit_journal_title, mode='exact'):
        """
        Processo auxiliar que realiza casamento de um título de periódico citado e valida casamentos, se houver
        mais de um. O processo de validação consiste em desambiguar os possíveis ISSN-Ls associados a um periódico
        citado usando dados de ano e volume da referência citada.

        :param cit: referência citada
        :param mode: mode de execução de casamento ['exact', 'fuzzy']
        :param cleaned_cit_journal_title: título limpo do periódico citado
        :return: dicionário composto por dados normalizados
        """
        if mode == 'fuzzy':
            matches = self.match_fuzzy(cleaned_cit_journal_title)
        else:
            matches = self.match_exact(cleaned_cit_journal_title)

        # Check whether exactly one ISSN-L was matched and the match mode is exact
        if len(matches) == 1 and mode == 'exact':
            return self.mount_standardized_citation_data(status=STATUS_EXACT,
                                                         issn_l=matches.pop())

        # Check whether more than one ISSN-L was matched, or the match is fuzzy with exactly one match
        elif len(matches) > 1 or (mode == 'fuzzy' and len(matches) == 1):
            # Load all possible ISSNs associated with the matched ISSN-Ls
            possible_issns = self.get_issns(matches)

            if possible_issns:
                # Build ISSN-YEAR-VOLUME keys
                keys, mount_mode = self.extract_issn_year_volume_keys(
                    cit, possible_issns)

                if keys:
                    # Validate keys against the year-volume database
                    cit_valid_matches = self.validate_match(keys)

                    if len(cit_valid_matches) == 1:
                        status = self.get_status(mode, mount_mode, 'default')
                        return self.mount_standardized_citation_data(
                            status, cit_valid_matches.pop())

                    elif len(cit_valid_matches) == 0:
                        # Validate keys against the linear regression database
                        cit_valid_matches = self.validate_match(keys,
                                                                use_lr=True)

                        if len(cit_valid_matches) == 1:
                            status = self.get_status(mode, mount_mode, 'lr')
                            return self.mount_standardized_citation_data(
                                status, cit_valid_matches.pop())

                        elif len(cit_valid_matches) == 0:
                            # Validate keys against the linear regression database with relaxed volume
                            cit_valid_matches = self.validate_match(
                                keys, use_lr_ml1=True)

                            if len(cit_valid_matches) == 1:
                                status = self.get_status(
                                    mode, mount_mode, 'lr-ml1')
                                return self.mount_standardized_citation_data(
                                    status, cit_valid_matches.pop())

    def standardize(self, document):
        """
        Normaliza referências citadas de um artigo.
        Atua de duas formas: exata e aproximada.
        Persiste resultados em arquivo JSON ou em MongoDB.

        :param document: Article dos quais as referências citadas serão normalizadas
        """
        std_citations = {}

        if document.citations:
            for cit in [
                    dc for dc in document.citations
                    if dc.publication_type == 'article'
            ]:
                cit_id = self.mount_id(cit, document.collection_acronym)
                cit_current_status = self.get_citation_mongo_status(cit_id)

                if cit_current_status == STATUS_NOT_NORMALIZED:
                    cleaned_cit_journal_title = preprocess_journal_title(
                        cit.source)

                    if cleaned_cit_journal_title:

                        if self.use_exact:
                            exact_match_result = self._standardize(
                                cit, cleaned_cit_journal_title)
                            if exact_match_result:
                                exact_match_result.update({
                                    '_id':
                                    cit_id,
                                    'cited-journal-title':
                                    cleaned_cit_journal_title
                                })
                                std_citations[cit_id] = exact_match_result
                                cit_current_status = exact_match_result[
                                    'status']

                        if self.use_fuzzy:
                            if cit_current_status == STATUS_NOT_NORMALIZED:
                                fuzzy_match_result = self._standardize(
                                    cit,
                                    cleaned_cit_journal_title,
                                    mode='fuzzy')
                                if fuzzy_match_result:
                                    fuzzy_match_result.update({
                                        '_id':
                                        cit_id,
                                        'cited-journal-title':
                                        cleaned_cit_journal_title
                                    })
                                    std_citations[cit_id] = fuzzy_match_result
                                    cit_current_status = fuzzy_match_result[
                                        'status']

                        if cit_current_status == STATUS_NOT_NORMALIZED and (
                                self.use_exact or self.use_fuzzy):
                            unmatch_result = {
                                '_id': cit_id,
                                'cited-journal-title':
                                cleaned_cit_journal_title,
                                'status': STATUS_NOT_NORMALIZED,
                                'update-date':
                                datetime.now().strftime('%Y-%m-%d')
                            }
                            std_citations[cit_id] = unmatch_result

        if std_citations:
            self.save_standardized_citations(std_citations)
Example #10
    # Create new threads
    threads = []
    threadID = 0
    for i in range(NR_OF_THREADS):
        contractCollection = MongoClient(MONGO_HOST,
                                         MONGO_PORT)[DATABASE][COLLECTION]
        thread = searchThread(threadID, contractQueue, contractCollection)
        thread.start()
        threads.append(thread)
        threadID += 1

    contractCollection = MongoClient(MONGO_HOST,
                                     MONGO_PORT)[DATABASE][COLLECTION]
    cursor = contractCollection.find()
    print("Total number of smart contracts: " +
          str(contractCollection.count_documents({})))

    uniques = set()
    contracts = []
    distinct_bytecode = {}
    distinct_deployer = {}
    for contract in cursor:
        if not contract["creator"] in distinct_deployer:
            distinct_deployer[contract["creator"]] = 1
        else:
            distinct_deployer[contract["creator"]] += 1
        if not contract["byteCode"].encode("utf-8") in uniques:
            uniques.add(contract["byteCode"].encode("utf-8"))
            contracts.append(contract)
            distinct_bytecode[contract["byteCode"].encode("utf-8")] = 1
        else:
            distinct_bytecode[contract["byteCode"].encode("utf-8")] += 1
Example #11
# The snippet begins mid-call in the source; the client's host and credentials are truncated.
oplog = MongoClient(
	authSource='admin'
).local.oplog.rs

# build a BSON Timestamp for 10 minutes ago
ts = Timestamp(datetime.utcnow() - timedelta(minutes=10), 1)

# query only update ops ('u') within the last 10 minutes
query = {
	'$and': [
		{ 'ts': { '$gte': ts } },
		{ 'op': { '$in': ['u'] } },
		{ 'ns': { '$regex': 'Cluster0.', '$options': 'i' } }
	]
}

if oplog.count_documents(query):
	cursor = oplog.find(query)
	for doc in cursor:
		pprint(doc)
		db, collection = doc['ns'].split('.')

		# connect to local server
		local_client = MongoClient(
			'127.0.0.1:27017',
			username='******',
			password='******',
			authSource='admin'
		)
		local_db = local_client[db][collection]

		updateQuery = { '_id':  doc['o2']['_id'] }
Example #12
reviewsByUser = {}
# The snippet begins mid-loop in the source; this loop head mirrors the business loop below.
with open(dataset_file) as dataset:
    next(dataset)
    for line in dataset:
        try:
            data = json.loads(line)
        except ValueError:
            print('Oops!')
            continue
        user_id = data["user_id"]
        text = data["text"]
        if user_id not in reviewsByUser:
            reviewsByUser[user_id] = []
        reviewsByUser[user_id].append(text)

for i, val in reviewsByUser.items():
    USER_D.insert_one({"USER_ID": i, "TEXT": val})

reviewsByBusiness = {}
with open(dataset_file) as dataset:
    next(dataset)
    for line in dataset:
        try:
            data = json.loads(line)

        except ValueError:
            print('Oops!')
            continue
        business_id = data["business_id"]
        text = data["text"]

        if business_id not in reviewsByBusiness:
            reviewsByBusiness[business_id] = []
        reviewsByBusiness[business_id].append(text)

for i, val in reviewsByBusiness.items():
    BUSINESS_D.insert_one({"BUSINESS_ID": i, "TEXT": val})

print("The Number of Unique users in the databse is",
      USER_D.count_documents({}))
print("The Number of Unique Businesses in the databse is",
      BUSINESS_D.count_documents({}))
Example #13
class MongoToElasticsearch():
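  """Sync pending MongoDB documents into Elasticsearch and write the sync state back to MongoDB."""
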
  def __init__(self, index=ES_INDEX, collection=MONGO_COL):
    self._es = Elasticsearch(ES_HOSTS)
    self._index = index
    self._col = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][collection]

    self._setup_index()

  def _setup_index(self):
    if not self._es.indices.exists(self._index):
      self._es.indices.create(
        index=self._index,
        body={
          'settings': {
            'index': {
              'refresh_interval': '1m'
            }
          },
          'mappings': {
            ES_TYPE: {
              'properties': {
                'meta': {
                  'properties': {
                    'location': {
                      'type': 'geo_point'
                    }
                  }
                }
              }
            }
          }
        }
      )

  def _transform(self, obj):
    action = {
      '_index': self._index,
      '_id': str(obj['_id']),
      '_type': ES_TYPE
    }
    del obj['_id']

    if obj[ES_STATE] == 'delete':
      action['_op_type'] = 'delete'
    del obj[ES_STATE]

    action['_source'] = obj

    return action

  def _insert_batch(self, batch):
    mongo_batch = []

    for ok, result in helpers.parallel_bulk(self._es, batch):
      action, result = result.popitem()
      oid = ObjectId(result['_id'])

      if ok:
        mongo_update = UpdateOne(
          {'_id': oid},
          {'$set': {ES_STATE: 'complete'}}
        )
        mongo_batch.append(mongo_update)
      else:
        mongo_update = UpdateOne(
          {'_id': oid},
          {'$set': {ES_STATE: 'error'}}
        )
        mongo_batch.append(mongo_update)
        print('Failed to %s: %s' % (action, result['_id']))

    self._col.bulk_write(mongo_batch)

  def run(self):
    batch = []
    query = {
      '$or': [
        {ES_STATE: 'insert'},
        {ES_STATE: 'update'},
        {ES_STATE: 'remove'}
      ]
    }

    with tqdm(total=self._col.count_documents(query)) as pbar:
      for obj in self._col.find(query):
        batch.append(self._transform(obj))

        if len(batch) == BATCH_SIZE:
          self._insert_batch(batch)
          batch = []
          pbar.update(BATCH_SIZE)

    # Flush the remainder; pymongo's bulk_write rejects an empty operation list
    if batch:
      self._insert_batch(batch)
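
A minimal invocation sketch, assuming the ES_*/MONGO_* constants and BATCH_SIZE from the snippet's enclosing module:

if __name__ == '__main__':
  MongoToElasticsearch().run()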