def parse_semantic_scholar_corpus_file(path, database_path="aip.db"):
    """Parse a Semantic Scholar corpus file and store its papers in the DB.

    :param path: Path to the corpus file; gzipped when it ends in "gz",
        otherwise plain text. One JSON object per line.
    :param database_path: Location of the database to write to.
    :returns: True when the file was parsed now or in a previous run.
    """
    database = DatabaseManager(location=database_path)
    hash, parsed = database.did_parse_file(path)
    if parsed:
        # Fix: the original leaked the DB connection on this early return.
        database.close()
        return True

    file_iterator_func = iterload_file_lines_gzip if path.endswith("gz") else iterload_file_lines
    # The json files contain stacked json objects, which is bad practice. It
    # should be wrapped in a JSON array. Libraries will throw errors if you
    # attempt to load the file, so now we lazy load each object line by line.
    publication_iterator = file_iterator_func(path)
    for publication in publication_iterator:
        if publication is None:  # Corrupt JSON line possibly. Skip it.
            continue

        # While parsing we sometimes get KeyError: 'venue'...
        if "venue" not in publication:
            continue

        # Try to match the publication to a venue we are interested in.
        # Wrap in str() as it sometimes is an int (???)
        venue_string = str(publication['venue'])
        if len(venue_string) == 0:
            continue

        publication_title = publication['title']
        publication_abstract = publication['paperAbstract']
        publication_year = publication.get('year', -1)
        # Empty for conferences. Fix: guard against a missing/None value
        # before calling replace() (the original crashed with KeyError here).
        publication_journal_volume = (publication.get('journalVolume') or "").replace(" ", "_")
        # publication_keywords = publication['entities']
        publication_id = publication['id']
        num_citations = len(publication.get("inCitations", []))

        publication_doi = publication['doi']
        if publication_doi is None or len(publication_doi) == 0:
            # Fall back to extracting the DOI from the DOI URL when present.
            publication_doi_url = publication['doiUrl']
            if "doi.org/" in publication_doi_url:
                publication_doi = publication_doi_url[
                    publication_doi_url.index("doi.org/") + len("doi.org/"):]

        database.update_or_insert_paper(id=publication_id, doi=publication_doi,
                                        title=publication_title,
                                        abstract=publication_abstract,
                                        raw_venue_string=venue_string,
                                        year=publication_year,
                                        volume=publication_journal_volume,
                                        num_citations=num_citations)

    # database.flush_missing_venues()
    database.add_parsed_file(hash)
    database.close()
    return True
class DataHandler:
    """Routes client queries to the nonogram database or the solver logic.

    Credentials are read from ``config/database.config`` (first line user,
    second line password), resolved relative to the working directory.
    """

    def __init__(self):
        # Build paths relative to the current working directory.
        config_path = os.path.join(os.path.abspath(""), "config", "database.config")
        db_storage = os.path.join(os.path.abspath(""), "db_storage")
        with open(config_path) as config:
            user = config.readline().strip()
            pwd = config.readline().strip()
        self.database = DatabaseManager(user, pwd, "nonogramDB", db_storage)

    def process(self, data):
        """Dispatch a JSON query on its "type" field; return response bytes."""
        try:
            query = loads(data, encoding="UTF-8")
            query_type = query["type"]
            Logger.writeInfo("Query type: {}".format(query_type))
            # Lazy dispatch table: each handler is resolved only when invoked.
            dispatch = {
                "saveNonogram": lambda d: self.database.saveNonogram(d),
                "getNonogramPreviewInfo": lambda d: self.database.getNonogramPreviewInfo(d),
                "getNonogramById": lambda d: self.database.getNonogramById(d),
                "solveNonogram": lambda d: NonogramLogic.solve(d),
                "createNonogram": lambda d: NonogramLogic.create(d),
            }
            handler = dispatch.get(query_type)
            if handler is not None:
                return self.createResponse(handler(query["data"]))
            Logger.writeError("Method " + query_type + " does not exist")
            return self.createResponse({
                "response": "fail",
                "desc": "Method '{}' not implemented yet".format(query_type)
            })
        except (JSONDecodeError, KeyError) as e:
            Logger.writeError("DataHandler.process: " + str(e))
            return self.createResponse({"response": "fail", "desc": str(e)})

    def createResponse(self, obj):
        """Serialize *obj* compactly, prefixed with its packed byte length."""
        payload = dumps(obj, separators=(',', ':')).encode()
        size_prefix = pack("<i", len(payload))
        Logger.writeInfo("Created response")
        return size_prefix + payload

    def close(self):
        """Close the underlying database connection."""
        self.database.close()
def test_fetchone(self):
    """Test for fetching one record."""
    manager = DatabaseManager(sg_diagrams_database)
    cases = [
        ("POINT(25 -30)", ('Free State',)),  # point inside the Free State
        ("POINT(100 100)", None),            # point outside every province
    ]
    for point, expected_result in cases:
        query = ("SELECT province FROM provinces WHERE "
                 "Within(GeomFromText('%s'), Geometry)" % point)
        result = manager.fetch_one(query)
        message = 'Expected %s, got %s' % (expected_result, result)
        self.assertEqual(result, expected_result, message)
    manager.close()
def add_semantic_scholar_cites_data(path, database_path="aip"):
    """Insert citation links from a Semantic Scholar corpus file into the DB.

    :param path: Path to the corpus file; gzipped when it ends in "gz",
        otherwise plain text. One JSON object per line.
    :param database_path: Location of the database to write to.
    :returns: True when the whole file has been processed.
    """
    database = DatabaseManager(location=database_path)
    file_iterator_func = iterload_file_lines_gzip if path.endswith(
        "gz") else iterload_file_lines
    publication_iterator = file_iterator_func(path)
    for publication in publication_iterator:
        # Fix: skip corrupt JSON lines (yielded as None) like the sibling
        # corpus parsers do, instead of crashing on the 'id' lookup.
        if publication is None:
            continue
        publication_id = publication['id']
        in_citations = publication.get("inCitations", [])
        out_citations = publication.get("outCitations", [])
        database.insert_cites(publication_id, in_citations, out_citations)
    # TODO: add hashing of the file so that it doesn't re-compute already
    # computed files in case of multiple restarts??
    database.close()
    return True
def test_connection(self):
    """A valid database answers queries; a broken path raises on fetch."""
    query = 'SELECT count(*) FROM provinces'

    # Happy path: the bundled database holds exactly seven provinces.
    manager = DatabaseManager(sg_diagrams_database)
    result = manager.fetch_one(query)
    expected_result = (7,)
    message = 'Expected %s, got %s' % (expected_result, result)
    self.assertEqual(result, expected_result, message)
    manager.close()

    # A nonexistent database path must raise DatabaseException on query.
    bogus_path = sg_diagrams_database + 'zero'
    manager = DatabaseManager(bogus_path)
    self.assertRaises(DatabaseException, manager.fetch_one, query)
    manager.close()

    # Opening the manager may have created an empty file; clean it up.
    if os.path.exists(bogus_path):
        os.remove(bogus_path)
def parse_aminer_corpus_file(path, database_path="aip", logger_disabled=False):
    """Parse an AMiner corpus file and store its papers in the database.

    :param path: Path to the corpus file; gzipped when it ends in "gz",
        otherwise plain text. One JSON object per line.
    :param database_path: Location of the database to write to.
    :param logger_disabled: When True, suppress logger output while parsing.
    :returns: True when the file was parsed now or in a previous run.
    """
    logger.disabled = logger_disabled
    database = DatabaseManager(location=database_path)
    hash, parsed = database.did_parse_file(path)
    if parsed:
        # Fix: the original leaked the DB connection on this early return.
        database.close()
        return True

    # The json files contain stacked json objects, which is bad practice.
    # It should be wrapped in a JSON array. Libraries will throw errors if you
    # attempt to load the file, so now we lazy load each object.
    file_iterator_func = iterload_file_lines_gzip if path.endswith(
        "gz") else iterload_file_lines
    publication_iterator = file_iterator_func(path)
    for publication in tqdm(publication_iterator):
        if publication is None:  # Corrupt JSON line possibly. Skip it.
            continue

        # Try to match the publication to a venue we are interested in.
        # Warning: contrary to the documentation, the key is "venue" NOT "venue.raw"!
        if 'venue' not in publication:
            logger.warning("Skipping line missing venue: %s in %s.", publication, path)
            continue
        if 'title' not in publication:
            logger.warning("Skipping line missing title: %s in %s.", publication, path)
            continue

        venue_string = publication['venue']
        # Sometimes the venue string is yet another dict...
        if isinstance(venue_string, dict) and "raw" in venue_string:
            venue_string = venue_string["raw"]

        publication_title = str(publication['title']).rstrip(".")
        publication_abstract = publication.get('abstract', "")
        publication_year = publication.get('year')
        publication_journal_volume = publication.get('volume')
        # publication_keywords = publication['keywords']
        publication_id = publication['id']
        publication_doi = publication.get('doi')

        # Sometimes in the urls, a doi link is used. If there is, we attempt
        # to extract the doi from the link.
        if publication_doi is None or len(publication_doi) == 0:
            for publication_doi_url in publication.get('url', []):
                if "doi.org/" in publication_doi_url:
                    publication_doi = publication_doi_url[
                        publication_doi_url.index("doi.org/") + len("doi.org/"):]
                    break

        database.update_or_insert_paper(id=publication_id,
                                        doi=publication_doi,
                                        title=publication_title,
                                        abstract=publication_abstract,
                                        raw_venue_string=venue_string,
                                        year=publication_year,
                                        volume=publication_journal_volume)

    # database.flush_missing_venues()
    database.add_parsed_file(hash)
    database.close()
    return True
def parse(dblp_file, database_path="aip.db"):
    """Parse a DBLP XML dump and store its articles and authors in the DB.

    :param dblp_file: Path to the DBLP XML file (DTD-validated while parsing).
    :param database_path: Location of the database to write to.
    :returns: True when the file was parsed now or in a previous run.
    """
    database = DatabaseManager(location=database_path)
    hash, parsed = database.did_parse_file(dblp_file)
    if parsed:
        # Fix: the original leaked the DB connection on this early return.
        database.close()
        return True

    counter = 0  # counter for synthesizing IDs when the 'key' attr is absent
    # dtd = etree.DTD(file="/media/lfdversluis/datastore/dblp.dtd")
    for event, element in etree.iterparse(dblp_file, load_dtd=True,
                                          dtd_validation=True):
        if element.tag not in ['article', 'inproceedings', 'proceedings']:
            continue

        # Renamed from 'id' to avoid shadowing the builtin.
        if 'key' in element.attrib:
            record_id = str(element.attrib['key'])
        else:
            record_id = "id" + str(counter)
            counter += 1

        title = element.find('title')  # type: Optional[str]
        if title is not None:
            title = str(title.text).rstrip(".")

        year = element.find('year')  # type: Optional[int]
        if year is not None:
            try:
                year = int(re.search(r'\d+', str(year.text)).group())
                if 20 < year < 100:  # Weird cases like 92-93
                    year += 1900
                elif year < 20:  # weird cases like '12
                    year += 2000
            except (AttributeError, ValueError):  # fix: was a bare except
                year = None

        volume = element.find('volume')  # type: Optional[int]
        if volume is not None:
            try:
                volume = int(volume.text)
            except (TypeError, ValueError):  # fix: was a bare except
                volume = None

        # authors = element.find('author')  # type: Optional[str]
        venue = element.find('booktitle')  # type: Optional[str]
        if venue is None and len(element.findall('journal')) > 0:
            venue = element.find('journal')
        if venue is not None and venue.text is not None:
            venue = str(venue.text)
        else:
            venue = None

        doi = None
        for ee in element.findall('ee'):
            ee_str = str(ee.text)
            # Fix: the original tested "ee is not None" (always true inside
            # findall) and checked for "doi.org" but indexed "doi.org/",
            # which could raise ValueError; both guards are now consistent.
            if ee.text is not None and "doi.org/" in ee_str:
                doi = ee_str[ee_str.index("doi.org/") + len("doi.org/"):]
                break

        if title is not None and year is not None and venue is not None:
            # Clean the title which may have HTML elements
            database.update_or_insert_paper(id=record_id, doi=doi, title=title,
                                            abstract="", raw_venue_string=venue,
                                            year=year, volume=volume,
                                            num_citations=-1)

            # Get the authors for this paper and add them to the database
            authors = []  # tuples of (name, orcid)
            for author_element in element.findall('author'):
                orcid = None
                if "orcid" in author_element.attrib:
                    orcid = str(author_element.attrib['orcid'])
                authors.append((author_element.text, orcid))
            database.add_authors_for_article(authors=authors, article_id=record_id)

        element.clear()  # release the element's memory during iterparse

    # database.flush_missing_venues()
    database.add_parsed_file(hash)
    database.close()
    return True
def canvasReleaseEvent(self, event):
    """Slot called when the mouse button is released on the canvas.

    :param event: Canvas event containing position of click, which button
        was clicked etc.
    """
    # Only left-button releases trigger a download; everything else is ignored.
    if not event.button() == Qt.LeftButton:
        return

    def progress_callback(current, maximum, message=None):
        """GUI based callback implementation for showing progress.

        :param current: Current progress.
        :type current: int

        :param maximum: Maximum range (point at which task is complete.
        :type maximum: int

        :param message: Optional message to display in the progress bar
        :type message: str, QString
        """
        if message is not None:
            self.message_bar.setText(message)
        if self.progress_bar is not None:
            self.progress_bar.setMaximum(maximum)
            self.progress_bar.setValue(current)

    self.iface.messageBar().pushMessage(self.tr('SG Downloader.'),
                                        self.tr('Preparing for download'),
                                        level=QgsMessageBar.INFO,
                                        duration=10)

    # No need to check that it is a valid, polygon layer
    # as the QAction for this map tool already does that
    layer = self.canvas.currentLayer()

    # Convert the click position to map coordinates and build a tiny
    # rectangle around it to use as a spatial filter.
    place = self.toMapCoordinates(event.pos())
    rectangle = point_to_rectangle(place)

    request = QgsFeatureRequest(QgsFeatureRequest.FilterRect)
    # Ensure only those features really intersecting the rect are returned
    request.setFlags(QgsFeatureRequest.ExactIntersect)
    request.setFilterRect(rectangle)
    polygons = layer.getFeatures(request)

    feature = QgsFeature()
    fetch_list = []
    all_fields = layer.pendingFields()
    text_fields = []

    # Ignore any columns that don't contain text data
    for field in all_fields:
        if field.typeName() == 'String' or field.typeName() == 'Text':
            text_fields.append(field)

    self.setup_message_bar()
    sg_field = None
    while polygons.nextFeature(feature):
        # geom = feature.geometry()
        # attributes = feature.attributes()
        # matched = False
        # sg_code = None
        if sg_field is None:
            # First hit: probe each text column for a valid SG code and
            # remember which column held it for subsequent features.
            for field in text_fields:
                value = str(feature[field.name()])
                if not is_valid_sg_code(value):
                    continue
                sg_field = field.name()
                fetch_list.append(value)
        else:
            # We already know which column has SG codes
            value = str(feature[sg_field])
            fetch_list.append(value)

    if len(fetch_list) == 0:
        self.iface.messageBar().pushMessage(
            self.tr('SG Downloader.'),
            self.tr('No parcels found with a valid 21 Digit code'),
            level=QgsMessageBar.WARNING,
            duration=10)
        return

    # The province determines which SG office the diagrams come from.
    province = province_for_point(self.db_manager, place)

    report = ''
    sg_diagrams_database = os.path.join(DATA_DIR, 'sg_diagrams.sqlite')
    data_manager = DatabaseManager(sg_diagrams_database)

    # Download each SG diagram, accumulating a textual report and feeding
    # progress back to the GUI via progress_callback.
    i = 0
    for sg_code in fetch_list:
        i += 1
        message = 'Downloading SG Code %s from %s' % (sg_code, province)
        progress_callback(i, len(fetch_list), message)
        report += download_sg_diagram(data_manager,
                                      sg_code,
                                      province,
                                      self.output_directory,
                                      callback=progress_callback)
    data_manager.close()

    try:
        write_log(report, self.log_file)
    except IOError as e:
        # NOTE(review): Python 2 print statement — this module targets Py2.
        # Best-effort logging: a failed log write should not block the UI.
        print e
    self.show_log(report, self.log_file)
def canvasReleaseEvent(self, event):
    """Slot called when the mouse button is released on the canvas.

    :param event: Canvas event containing position of click, which button
        was clicked etc.
    """
    # Only left-button releases trigger a download; everything else is ignored.
    if not event.button() == Qt.LeftButton:
        return

    def progress_callback(current, maximum, message=None):
        """GUI based callback implementation for showing progress.

        :param current: Current progress.
        :type current: int

        :param maximum: Maximum range (point at which task is complete.
        :type maximum: int

        :param message: Optional message to display in the progress bar
        :type message: str, QString
        """
        if message is not None:
            self.message_bar.setText(message)
        if self.progress_bar is not None:
            self.progress_bar.setMaximum(maximum)
            self.progress_bar.setValue(current)

    self.iface.messageBar().pushMessage(
        self.tr('SG Downloader.'),
        self.tr('Preparing for download'),
        level=QgsMessageBar.INFO,
        duration=10)

    # No need to check that it is a valid, polygon layer
    # as the QAction for this map tool already does that
    layer = self.canvas.currentLayer()

    # Convert the click position to map coordinates and build a tiny
    # rectangle around it to use as a spatial filter.
    place = self.toMapCoordinates(event.pos())
    rectangle = point_to_rectangle(place)

    request = QgsFeatureRequest(QgsFeatureRequest.FilterRect)
    # Ensure only those features really intersecting the rect are returned
    request.setFlags(QgsFeatureRequest.ExactIntersect)
    request.setFilterRect(rectangle)
    polygons = layer.getFeatures(request)

    feature = QgsFeature()
    fetch_list = []
    all_fields = layer.pendingFields()
    text_fields = []

    # Ignore any columns that don't contain text data
    for field in all_fields:
        if field.typeName() == 'String' or field.typeName() == 'Text':
            text_fields.append(field)

    self.setup_message_bar()
    sg_field = None
    while polygons.nextFeature(feature):
        # geom = feature.geometry()
        # attributes = feature.attributes()
        # matched = False
        # sg_code = None
        if sg_field is None:
            # First hit: probe each text column for a valid SG code and
            # remember which column held it for subsequent features.
            for field in text_fields:
                value = str(feature[field.name()])
                if not is_valid_sg_code(value):
                    continue
                sg_field = field.name()
                fetch_list.append(value)
        else:
            # We already know which column has SG codes
            value = str(feature[sg_field])
            fetch_list.append(value)

    if len(fetch_list) == 0:
        self.iface.messageBar().pushMessage(
            self.tr('SG Downloader.'),
            self.tr('No parcels found with a valid 21 Digit code'),
            level=QgsMessageBar.WARNING,
            duration=10)
        return

    # The province determines which SG office the diagrams come from.
    province = province_for_point(self.db_manager, place)

    report = ''
    sg_diagrams_database = os.path.join(DATA_DIR, 'sg_diagrams.sqlite')
    data_manager = DatabaseManager(sg_diagrams_database)

    # Download each SG diagram, accumulating a textual report and feeding
    # progress back to the GUI via progress_callback.
    i = 0
    for sg_code in fetch_list:
        i += 1
        message = 'Downloading SG Code %s from %s' % (sg_code, province)
        progress_callback(i, len(fetch_list), message)
        report += download_sg_diagram(
            data_manager,
            sg_code,
            province,
            self.output_directory,
            callback=progress_callback)
    data_manager.close()

    try:
        write_log(report, self.log_file)
    except IOError as e:
        # NOTE(review): Python 2 print statement — this module targets Py2.
        # Best-effort logging: a failed log write should not block the UI.
        print e
    self.show_log(report, self.log_file)