Example #1
def parse_semantic_scholar_corpus_file(path, database_path="aip.db"):
    database = DatabaseManager(location=database_path)

    file_hash, parsed = database.did_parse_file(path)
    if parsed:
        return True

    file_iterator_func = iterload_file_lines_gzip if path.endswith("gz") else iterload_file_lines
    # The JSON files contain stacked JSON objects (one per line), which is bad
    # practice; they should be wrapped in a JSON array. Libraries throw errors
    # if you attempt to load the whole file, so we lazily load each object
    # line by line.
    publication_iterator = file_iterator_func(path)
    for publication in publication_iterator:
        if publication is None:  # Possibly a corrupt JSON line. Skip it.
            continue

        if "venue" not in publication:  # Some records lack a 'venue' key.
            continue

        # Try to match the publication to a venue we are interested in.
        # Wrap in str(); the value is occasionally an int.
        venue_string = str(publication['venue'])
        if len(venue_string) == 0:
            continue

        # Check if any of the venue strings are a substring of the mentioned
        # value, and add it to that set.
        publication_title = publication['title']
        publication_abstract = publication['paperAbstract']
        publication_year = publication['year'] if 'year' in publication else -1
        # The journal volume is empty for conferences.
        publication_journal_volume = publication['journalVolume'].replace(" ", "_")
        # publication_keywords = publication['entities']
        publication_id = publication['id']

        num_citations = 0
        if "inCitations" in publication:
            num_citations = len(publication["inCitations"])

        # Sometimes the DOI field is empty but a doi.org URL is present;
        # in that case, extract the DOI from the URL.
        publication_doi = publication.get('doi')
        if publication_doi is None or len(publication_doi) == 0:
            publication_doi_url = publication['doiUrl']
            if "doi.org/" in publication_doi_url:
                publication_doi = publication_doi_url[
                    publication_doi_url.index("doi.org/") + len("doi.org/"):]

        database.update_or_insert_paper(id=publication_id, doi=publication_doi, title=publication_title,
                                        abstract=publication_abstract, raw_venue_string=venue_string,
                                        year=publication_year, volume=publication_journal_volume,
                                        num_citations=num_citations)
    # database.flush_missing_venues()
    database.add_parsed_file(file_hash)
    database.close()
    return True
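
These examples lean on two helpers, iterload_file_lines and iterload_file_lines_gzip, that are not shown on this page. A minimal sketch, assuming one JSON object per line and yielding None for lines that fail to parse (which the callers above skip):

import gzip
import json


def iterload_file_lines(path):
    # Lazily yield one parsed JSON object per line; None for corrupt lines.
    with open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                yield None


def iterload_file_lines_gzip(path):
    # Same as iterload_file_lines, but reads a gzip-compressed file.
    with gzip.open(path, "rt", encoding="utf-8") as handle:
        for line in handle:
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                yield None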
class DataHandler:

    # Creates new data handler, connects to database using username and password saved in config file.
    def __init__(self):
        config_path = os.path.join(os.path.abspath(""), "config",
                                   "database.config")
        db_storage = os.path.join(os.path.abspath(""), "db_storage")
        with open(config_path) as config:
            user = config.readline().strip()
            pwd = config.readline().strip()
        self.database = DatabaseManager(user, pwd, "nonogramDB", db_storage)

    # Handles queries according to their type. Returns response to each query.
    def process(self, data):
        try:
            # json.loads accepts UTF-8 bytes directly; its encoding kwarg
            # was removed in Python 3.9.
            query = loads(data)
            Logger.writeInfo("Query type: {}".format(query["type"]))
            if query["type"] == "saveNonogram":
                return self.createResponse(
                    self.database.saveNonogram(query["data"]))
            if query["type"] == "getNonogramPreviewInfo":
                return self.createResponse(
                    self.database.getNonogramPreviewInfo(query["data"]))
            if query["type"] == "getNonogramById":
                return self.createResponse(
                    self.database.getNonogramById(query["data"]))
            if query["type"] == "solveNonogram":
                return self.createResponse(NonogramLogic.solve(query["data"]))
            if query["type"] == "createNonogram":
                return self.createResponse(NonogramLogic.create(query["data"]))
            Logger.writeError("Method " + query["type"] + " does not exist")
            return self.createResponse({
                "response": "fail",
                "desc": "Method '{}' not implemented yet".format(query["type"])
            })

        except (JSONDecodeError, KeyError) as e:
            Logger.writeError("DataHandler.process: " + str(e))
            return self.createResponse({"response": "fail", "desc": str(e)})

    # Packs the size of the response object and prepends it to the serialized
    # response string, forming the framed message.
    def createResponse(self, obj):
        msg = dumps(obj, separators=(',', ':')).encode()
        length = pack("<i", len(msg))
        Logger.writeInfo("Created response")
        return length + msg

    def close(self):
        self.database.close()
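
createResponse frames each message with a 4-byte little-endian length prefix via pack("<i", ...). A client reading these responses from a socket could decode them as follows; this is a sketch, and the socket setup is assumed:

import json
from struct import unpack


def read_response(sock):
    # Read one length-prefixed JSON response as produced by createResponse.
    def recv_exactly(n):
        buf = b""
        while len(buf) < n:
            chunk = sock.recv(n - len(buf))
            if not chunk:
                raise ConnectionError("socket closed mid-message")
            buf += chunk
        return buf

    # First 4 bytes: little-endian int32 payload length, mirroring pack("<i", ...).
    (length,) = unpack("<i", recv_exactly(4))
    return json.loads(recv_exactly(length))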
    def test_fetchone(self):
        """Test for fetching one record."""
        spatialite_path = sg_diagrams_database
        db_manager = DatabaseManager(spatialite_path)

        query = "SELECT province FROM provinces WHERE "
        query += "Within(GeomFromText('POINT(25 -30)'), Geometry)"

        result = db_manager.fetch_one(query)
        expected_result = ('Free State',)
        message = 'Expected %s, got %s' % (expected_result, result)
        self.assertEqual(result, expected_result, message)

        query = "SELECT province FROM provinces WHERE "
        query += "Within(GeomFromText('POINT(100 100)'), Geometry)"

        result = db_manager.fetch_one(query)
        expected_result = None
        message = 'Expected %s, got %s' % (expected_result, result)
        self.assertEqual(result, expected_result, message)
        db_manager.close()
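
This test and test_connection below exercise DatabaseManager.fetch_one against a SpatiaLite file. The class itself is not shown on this page; a minimal sketch, assuming the mod_spatialite extension is installed (the real implementation likely differs):

import sqlite3


class DatabaseException(Exception):
    # Raised when a query cannot be executed (expected by test_connection).
    pass


class DatabaseManager(object):
    # Sketch: thin wrapper around a SpatiaLite database.

    def __init__(self, spatialite_path):
        self.connection = sqlite3.connect(spatialite_path)
        # Spatial functions such as Within() and GeomFromText() come from
        # the SpatiaLite extension, which must be loaded explicitly.
        self.connection.enable_load_extension(True)
        self.connection.load_extension("mod_spatialite")

    def fetch_one(self, query):
        try:
            return self.connection.execute(query).fetchone()
        except sqlite3.Error as error:
            raise DatabaseException(str(error))

    def close(self):
        self.connection.close()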
def add_semantic_scholar_cites_data(path, database_path="aip"):
    database = DatabaseManager(location=database_path)
    file_iterator_func = iterload_file_lines_gzip if path.endswith("gz") else iterload_file_lines
    publication_iterator = file_iterator_func(path)

    for publication in publication_iterator:
        if publication is None:  # Possibly a corrupt JSON line. Skip it.
            continue

        publication_id = publication['id']
        in_citations = []
        out_citations = []

        if "inCitations" in publication:
            in_citations = publication["inCitations"]

        if "outCitations" in publication:
            out_citations = publication["outCitations"]

        database.insert_cites(publication_id, in_citations, out_citations)

    # TODO: hash the file (as did_parse_file does in the parse functions above)
    #  so it doesn't re-process already-computed files after a restart.
    database.close()
    return True
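
The TODO above can be handled the same way did_parse_file works in the other examples: hash the file and record the hash once parsing succeeds. A sketch, where the parsed_files table and its schema are assumptions:

import hashlib


def compute_file_hash(path, chunk_size=1 << 20):
    # SHA-256 hex digest of the file, read in 1 MiB chunks.
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def did_parse_file(connection, path):
    # Return (hash, already_parsed) against a hypothetical parsed_files table.
    file_hash = compute_file_hash(path)
    row = connection.execute(
        "SELECT 1 FROM parsed_files WHERE hash = ?", (file_hash,)).fetchone()
    return file_hash, row is not None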
    def test_connection(self):
        """Test that a valid database connects and an invalid path raises."""
        spatialite_path = sg_diagrams_database
        db_manager = DatabaseManager(spatialite_path)

        query = 'SELECT count(*) FROM provinces'
        result = db_manager.fetch_one(query)

        expected_result = (7,)
        message = 'Expected %s, got %s' % (expected_result, result)
        self.assertEqual(result, expected_result, message)

        db_manager.close()

        spatialite_path = sg_diagrams_database + 'zero'  # Deliberately invalid.
        db_manager = DatabaseManager(spatialite_path)

        query = 'SELECT count(*) FROM provinces'

        self.assertRaises(DatabaseException, db_manager.fetch_one, query)

        db_manager.close()

        if os.path.exists(spatialite_path):
            os.remove(spatialite_path)
Example #6
def parse_aminer_corpus_file(path, database_path="aip", logger_disabled=False):
    logger.disabled = logger_disabled
    database = DatabaseManager(location=database_path)

    file_hash, parsed = database.did_parse_file(path)
    if parsed:
        return True

    # print(path)
    # The json files contain stacked json objects, which is bad practice.
    # It should be wrapped in a JSON array.
    # Libraries will throw errors if you attempt to load the file, so now we lazy load each object.
    file_iterator_func = iterload_file_lines_gzip if path.endswith("gz") else iterload_file_lines
    publication_iterator = file_iterator_func(path)
    for publication in tqdm(publication_iterator):
        if publication is None:  # Corrupt JSON line possibly. Skip it.
            continue

        # Try to match the publication to a venue we are interested in.
        # Warning: contrary to the documentation, the key is "venue" NOT "venue.raw"!
        if 'venue' not in publication:
            logger.warning("Skipping line missing venue: %s in %s.",
                           publication, path)
            continue

        if 'title' not in publication:
            logger.warning("Skipping line missing title: %s in %s.",
                           publication, path)
            continue

        venue_string = publication['venue']

        # Sometimes the venue string is yet another dict...
        if isinstance(venue_string, dict) and "raw" in venue_string:
            venue_string = venue_string["raw"]

        publication_title = str(publication['title']).rstrip(".")
        publication_abstract = publication['abstract'] if 'abstract' in publication else ""

        publication_year = publication['year'] if 'year' in publication else None
        publication_journal_volume = publication['volume'] if 'volume' in publication else None
        # publication_keywords = publication['keywords']
        publication_id = publication['id']
        # citation_count = int(publication['n_citation']) if "n_citation" in publication else None

        publication_doi = publication['doi'] if 'doi' in publication else None
        # Sometimes a doi.org link appears among the URLs; if so, extract the DOI from it.
        if publication_doi is None or len(publication_doi) == 0:
            publication_doi_urls = publication['url'] if 'url' in publication else []
            for publication_doi_url in publication_doi_urls:
                if "doi.org/" in publication_doi_url:
                    publication_doi = publication_doi_url[
                        publication_doi_url.index("doi.org/") + len("doi.org/"):]
                    break

        database.update_or_insert_paper(id=publication_id,
                                        doi=publication_doi,
                                        title=publication_title,
                                        abstract=publication_abstract,
                                        raw_venue_string=venue_string,
                                        year=publication_year,
                                        volume=publication_journal_volume)
    # database.flush_missing_venues()
    database.add_parsed_file(file_hash)
    database.close()
    return True
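
The doi.org extraction above recurs in several of these examples. Factored into a standalone helper (a hypothetical refactoring, not part of the original code):

def extract_doi(url):
    # Return the DOI portion of a doi.org URL, or None if absent.
    # extract_doi("https://doi.org/10.1000/182") -> "10.1000/182"
    marker = "doi.org/"
    if url and marker in url:
        return url[url.index(marker) + len(marker):]
    return None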
Example #7
def parse(dblp_file, database_path="aip.db"):
    database = DatabaseManager(location=database_path)

    file_hash, parsed = database.did_parse_file(dblp_file)
    if parsed:
        return True

    counter = 0  # counter for new keys.

    # dtd = etree.DTD(file="/media/lfdversluis/datastore/dblp.dtd")
    for event, element in etree.iterparse(dblp_file,
                                          load_dtd=True,
                                          dtd_validation=True):
        if element.tag not in ['article', 'inproceedings', 'proceedings']:
            continue

        if 'key' in element.attrib:
            id = str(element.attrib['key'])
        else:
            id = "id" + str(counter)
            counter += 1
        title = element.find('title')  # type: Optional[str]
        if title is not None:
            title = str(title.text).rstrip(".")
        year = element.find('year')  # type: Optional[int]
        if year is not None:
            try:
                year = int(re.search(r'\d+', str(year.text)).group())
                if 20 < year < 100:  # Weird cases like 92-93
                    year += 1900
                elif year < 20:  # Weird cases like '12
                    year += 2000
            except (AttributeError, ValueError):  # No digits found.
                year = None
        volume = element.find('volume')  # type: Optional[int]
        if volume is not None:
            try:
                volume = int(volume.text)
            except (TypeError, ValueError):  # Missing or non-numeric text.
                volume = None
        # authors = element.find('author')  # type: Optional[str]
        venue = element.find('booktitle')  # type: Optional[str]
        if venue is None and len(element.findall('journal')) > 0:
            venue = element.find('journal')

        if venue is not None and venue.text is not None:
            venue = str(venue.text)
        else:
            venue = None

        doi = None
        for ee in element.findall('ee'):
            # findall never yields None; check for the full "doi.org/" marker
            # so the index() below cannot raise ValueError.
            ee_str = str(ee.text)
            if "doi.org/" in ee_str:
                doi = ee_str[ee_str.index("doi.org/") + len("doi.org/"):]
                break

        if title is not None and year is not None and venue is not None:
            # Clean the title which may have HTML elements
            database.update_or_insert_paper(id=id,
                                            doi=doi,
                                            title=title,
                                            abstract="",
                                            raw_venue_string=venue,
                                            year=year,
                                            volume=volume,
                                            num_citations=-1)

            # Get the authors for this paper and add them to the database
            authors = []  # tuples of ID, orcid
            for author_element in element.findall('author'):
                orcid = None
                if "orcid" in author_element.attrib:
                    orcid = str(author_element.attrib['orcid'])

                authors.append((author_element.text, orcid))

            database.add_authors_for_article(authors=authors, article_id=id)

        element.clear()

        # database.flush_missing_venues()
    database.add_parsed_file(file_hash)
    database.close()
    return True
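
The year handling above normalizes DBLP's occasional two-digit years. The same logic as a standalone function (hypothetical refactoring), which may make the range checks easier to follow:

import re


def normalize_year(raw):
    # Extract the first number from raw text and map two-digit years to
    # full years: "92-93" -> 1992, "'12" -> 2012, "1997" -> 1997.
    try:
        year = int(re.search(r'\d+', str(raw)).group())
    except (AttributeError, ValueError):
        return None
    if 20 < year < 100:
        return year + 1900
    if year < 20:
        return year + 2000
    return year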
Example #9
    def canvasReleaseEvent(self, event):
        """Slot called when the mouse button is released on the canvas.

        :param event: Canvas event containing position of click, which button
            was clicked etc.
        """
        if not event.button() == Qt.LeftButton:
            return

        def progress_callback(current, maximum, message=None):
            """GUI based callback implementation for showing progress.

            :param current: Current progress.
            :type current: int

            :param maximum: Maximum range (point at which task is complete).
            :type maximum: int

            :param message: Optional message to display in the progress bar
            :type message: str, QString
            """
            if message is not None:
                self.message_bar.setText(message)
            if self.progress_bar is not None:
                self.progress_bar.setMaximum(maximum)
                self.progress_bar.setValue(current)

        self.iface.messageBar().pushMessage(
            self.tr('SG Downloader.'),
            self.tr('Preparing for download'),
            level=QgsMessageBar.INFO,
            duration=10)

        # No need to check that it is a valid, polygon layer
        # as the QAction for this map tool already does that
        layer = self.canvas.currentLayer()

        place = self.toMapCoordinates(event.pos())
        rectangle = point_to_rectangle(place)

        request = QgsFeatureRequest(QgsFeatureRequest.FilterRect)
        # Ensure only those features really intersecting the rect are returned
        request.setFlags(QgsFeatureRequest.ExactIntersect)
        request.setFilterRect(rectangle)
        polygons = layer.getFeatures(request)
        feature = QgsFeature()
        fetch_list = []
        all_fields = layer.pendingFields()
        text_fields = []
        # Ignore any columns that don't contain text data
        for field in all_fields:
            if field.typeName() == 'String' or field.typeName() == 'Text':
                text_fields.append(field)

        self.setup_message_bar()
        sg_field = None
        while polygons.nextFeature(feature):
            # geom = feature.geometry()
            # attributes = feature.attributes()
            # matched = False
            # sg_code = None
            if sg_field is None:
                for field in text_fields:
                    value = str(feature[field.name()])
                    if not is_valid_sg_code(value):
                        continue
                    sg_field = field.name()
                    fetch_list.append(value)
            else:
                # We already know which column has SG codes
                value = str(feature[sg_field])
                fetch_list.append(value)
        if len(fetch_list) == 0:
            self.iface.messageBar().pushMessage(
                self.tr('SG Downloader.'),
                self.tr('No parcels found with a valid 21 Digit code'),
                level=QgsMessageBar.WARNING,
                duration=10)
            return

        province = province_for_point(self.db_manager, place)

        report = ''
        sg_diagrams_database = os.path.join(DATA_DIR, 'sg_diagrams.sqlite')
        data_manager = DatabaseManager(sg_diagrams_database)

        for i, sg_code in enumerate(fetch_list, start=1):
            message = 'Downloading SG Code %s from %s' % (sg_code, province)
            progress_callback(i, len(fetch_list), message)
            report += download_sg_diagram(
                data_manager,
                sg_code,
                province,
                self.output_directory,
                callback=progress_callback)
        data_manager.close()

        try:
            write_log(report, self.log_file)
        except IOError as e:
            print(e)

        self.show_log(report, self.log_file)
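
The is_valid_sg_code helper used above is not shown here. Judging only from the warning message about a "21 Digit code", a sketch might look like this; the exact SG code format is an assumption:

import re

# Assumption: a valid SG code is exactly 21 alphanumeric characters.
SG_CODE_PATTERN = re.compile(r'^[0-9A-Z]{21}$')


def is_valid_sg_code(value):
    # True if value looks like a 21-character SG parcel code.
    return bool(value) and bool(SG_CODE_PATTERN.match(value.strip().upper()))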