def from_json_to_db(self):
    """Rebuild the pydblite base from the scraped-threads JSON dump.

    Reads the dump line by line, accumulates the lines that make up one
    serialized thread, parses the fields with a regex, and inserts one
    record per thread into scnscraper/abap.pydb (overriding any existing
    base). Commits after each successful insert.
    """
    # Compile the record pattern once instead of re-compiling per thread.
    thread_re = re.compile(
        r"url:\s'(.*?)',\suid:\s'(.*?)',\stype:\s'(.*?)',\sauthor:\s'(.*?)',"
        r"\stitle:\s'(.*?)',\sdate_time:\s'(.*?)',\stags:\s'(.*?)',"
        r"\sviews:\s'(.*?)',\sanswers:\s'(.*?)',\sresolve:\s'(.*?)',"
        r"\supvotes:\s'(.*?)', text:\s'((.|\n)*)'\s}")
    db = Base("scnscraper/abap.pydb", save_to_file=True)
    # create new base with field names (override discards old contents)
    db.create('url', 'uid', 'type', 'author', 'title', 'date_time', 'tags',
              'views', 'answers', 'resolve', 'upvotes', 'text',
              mode='override')
    thread = ''
    # NOTE(review): 'scnsraper' differs from the 'scnscraper' prefix used by
    # every other path in this file -- looks like a typo; confirm against the
    # actual file on disk before changing it.
    with open('scnsraper/threads.json', 'r') as file:
        for line in file:
            if line.endswith(" }\n"):
                # Closing line of a serialized thread: parse and store it.
                thread += line
                tokens = thread_re.search(str(thread))
                if tokens is not None:
                    db.insert(url=tokens.group(1), uid=tokens.group(2),
                              type=tokens.group(3), author=tokens.group(4),
                              title=tokens.group(5), date_time=tokens.group(6),
                              tags=tokens.group(7), views=tokens.group(8),
                              answers=tokens.group(9), resolve=tokens.group(10),
                              upvotes=tokens.group(11), text=tokens.group(12))
                    db.commit()
                print('\n--------------------------------------------\n')
                thread = ''
            if line.startswith(" ]"):
                # Page delimiter: reset the accumulator.
                print("new page")
                thread = ''
            if (line.endswith('\n') and (not line.startswith(" ]\n\n"))
                    and (not line.endswith(" }\n"))):
                # Ordinary interior line of a thread: keep accumulating.
                thread += line
def detectduplicates(classdir):
    """Delete duplicate .jpg images in *classdir* via dhash fingerprints.

    Builds an in-memory pydblite base of perceptual hashes; any image whose
    hash was already seen is removed from disk. Errors on individual images
    are reported and skipped.
    """
    # Create an in-memory database
    db = Base('fingerprinter', save_to_file=False)
    db.create('filename', 'hash')
    # Glob once and reuse the list (the original scanned the dir twice).
    image_paths = glob(classdir + "/*.jpg")
    filecount = len(image_paths)
    duplicatecount = 0
    print("creating image fingerprints for de-duplication ...")
    index = 0
    for imagePath in image_paths:
        index = index + 1
        try:
            if os.path.exists(imagePath):
                # Context manager closes the file handle instead of leaking it.
                with Image.open(imagePath) as image:
                    h = str(imagehash.dhash(image))
                filename = os.path.basename(imagePath)
                sys.stdout.write(
                    "fingerprint created for image # {} of {} \r".format(
                        index, filecount))
                sys.stdout.flush()
                time.sleep(0.1)
                pre = db(hash=h)
                if pre:
                    # This image is a duplicate - delete it
                    duplicatecount = duplicatecount + 1
                    os.remove(classdir + "/" + filename)
                else:
                    db.insert(filename=filename, hash=h)
        except Exception as e:  # was Python-2-only 'except Exception, e'
            print('Error in detectduplicates() function: {}'.format(e))
            continue
def inventoryshape(classdir):
    """Tally the (height, width) shapes of all .jpg images in *classdir*.

    Maintains an in-memory pydblite base with one row per distinct shape and
    a running count, so the dominant resolution can be determined later.
    Per-image failures are reported and skipped.
    """
    # Glob once and reuse the list (the original scanned the dir twice).
    image_paths = glob(classdir + "/*.jpg")
    filecount = len(image_paths)
    print('determining optimal image resolution...')
    db = Base('shape', save_to_file=False)
    db.create('filename', 'height', 'width', 'count')
    index = 0
    for imagePath in image_paths:
        index = index + 1
        try:
            img = cv2.imread(imagePath)
            filename = os.path.basename(imagePath)
            # If cv2 failed to decode, img is None and .shape raises --
            # caught below like any other per-image error.
            shape = img.shape
            h = shape[0]
            w = shape[1]
            # see if there is already an image of this shape in the DB...
            pre = db(height=h, width=w)
            if pre:
                # ...if so - update the count
                rec_id = pre[0]['__id__']
                counter = int(pre[0]['count']) + 1
                record = db[rec_id]
                db.update(record, count=counter)
            else:
                # ...if not - insert the new shape
                db.insert(filename=filename, height=h, width=w, count=1)
            sys.stdout.write("reading shape for image #{} of {} \r".format(
                index, filecount))
            sys.stdout.flush()
            time.sleep(0.1)
        except Exception as e:  # was Python-2-only 'except Exception, e'
            print('error processing image {}: {}'.format(imagePath, e))
            continue
def test_open_existing(self):
    """mode='open' keeps stored column names; mode='override' wipes the db."""
    db = Base(test_db_name, save_to_file=True)
    db.create('unique_id', 'name', "active", mode="open")
    db.insert("123", "N", True)
    db.commit()
    # Just verify that it works to open an existing db.
    # The column names are ignored, therefore they should
    # equal the old column names
    db = Base(test_db_name, save_to_file=True)
    db.create('unique_id2', 'name2', "active2", mode="open")
    db.insert("123", "N", True)  # return value was bound but never used
    db.commit()
    self.assertEqual(db.fields, ['unique_id', 'name', "active"])
    # mode="override" will overwrite existing db
    db = Base(test_db_name, save_to_file=True)
    db.create('unique_id', 'name', "active", mode="override")
    db.commit()
    self.assertEqual(len(self.filter_db), 0)
    # Equals passing mode=None
    self.assertRaises(IOError, db.create, 'unique_id', 'name', "active")
    self.assertRaises(ValueError, db.create, 'unique_id', 'name', "active",
                      mode="invalidmode")
class InMemoryDBLite(InMemoryDB):
    """Class that implements all steps from Dextra's programming challenge.

    Uses pydblite in-memory engine.
    """

    def __init__(self, name: str):
        logger.debug('Initializing DB.')
        self.connected = False
        self.name = name
        self.db = Base(name, save_to_file=False)

    def _require_connection(self):
        """Shared guard: raise unless connect() has been called."""
        if not self.connected:
            raise Exception('Not connected to db.')

    def connect(self):
        logger.debug(f'Connecting to [{self.name}].')
        # When using pydblite in-memory engine, is unnecessary
        # connect to a db, so we just set the flag to true
        self.connected = True

    def disconnect(self):
        logger.debug(f'Disconnecting from [{self.name}].')
        self._require_connection()
        # When using pydblite in-memory engine, is unnecessary
        # disconnect from a db, so we just set the flag to false
        self.connected = False

    def create_schema(self, *args):
        """Create (override) the schema from field names; returns Base.create result."""
        # Log message typo fixed: 'Crating' -> 'Creating'.
        logger.debug(f'Creating schema into [{self.name}].')
        self._require_connection()
        r = self.db.create(*args, mode='override')
        self.db.commit()
        return r

    def insert(self, item: dict):
        """Insert one record (dict of field -> value); returns the record id."""
        logger.debug(f'Inserting item into [{self.name}].')
        self._require_connection()
        r = self.db.insert(**item)
        self.db.commit()
        return r

    def insert_multiple(self, items: list):
        """Insert each dict in *items*; returns the last insert's result.

        Returns None for an empty list (the original raised
        UnboundLocalError on 'r' in that case).
        """
        logger.debug(f'Inserting multiple items into [{self.name}].')
        self._require_connection()
        r = None
        for item in items:
            r = self.db.insert(**item)
        self.db.commit()
        return r
def test_sqlite_compat(self):
    """sqlite-compat inserts of row tuples return None and grow the base."""
    compat_db = Base(test_db_name, save_to_file=False, sqlite_compat=True)
    compat_db.create('unique_id', 'name', "active", mode="open")
    self.reset_status_values_for_filter()
    # Bulk-insert the 7 prepared status rows.
    outcome = compat_db.insert(self.status)
    self.assertEqual(outcome, None)
    self.assertEqual(len(compat_db), 7)
    # One more row, wrapped in a list as sqlite-compat expects.
    extra_rows = [(8, "testname", 0)]
    outcome = compat_db.insert(extra_rows)
    self.assertEqual(outcome, None)
    self.assertEqual(len(compat_db), 8)
def from_json_to_db(self):
    """Rebuild the abap.pydb base from the scraped-threads JSON dump.

    Accumulates the text lines that make up one serialized thread, then
    parses the fields out with a regex and inserts one record per thread,
    committing after each insert.
    """
    thread = ''
    db = Base("scnscraper/abap.pydb", save_to_file=True)
    # create new base with field names
    db.create('url', 'uid', 'type', 'author', 'title', 'date_time', 'tags',
              'views', 'answers', 'resolve', 'upvotes', 'text',
              mode='override')
    i = 0  # NOTE(review): never used afterwards
    # NOTE(review): 'scnsraper' is likely a typo for 'scnscraper' -- confirm
    # against the file actually produced by the scraper.
    with open('scnsraper/threads.json', 'r') as file:
        for line in file:
            if (line.endswith(" }\n")):
                # Closing line of a thread: parse the accumulated record.
                thread += line
                tokens = re.search(
                    r"url:\s'(.*?)',\suid:\s'(.*?)',\stype:\s'(.*?)',\sauthor:\s'(.*?)',\stitle:\s'(.*?)',\sdate_time:\s'(.*?)',\stags:\s'(.*?)',\sviews:\s'(.*?)',\sanswers:\s'(.*?)',\sresolve:\s'(.*?)',\supvotes:\s'(.*?)', text:\s'((.|\n)*)'\s}",
                    str(thread))
                if tokens is not None:
                    db.insert(url=tokens.group(1), uid=tokens.group(2),
                              type=tokens.group(3), author=tokens.group(4),
                              title=tokens.group(5), date_time=tokens.group(6),
                              tags=tokens.group(7), views=tokens.group(8),
                              answers=tokens.group(9), resolve=tokens.group(10),
                              upvotes=tokens.group(11), text=tokens.group(12))
                    db.commit()
                print('\n--------------------------------------------\n')
                thread = ''
            if (line.startswith(" ]")):
                # Page delimiter: drop any partial accumulation.
                print("new page")
                thread = ''
            if (line.endswith('\n') and (not line.startswith(" ]\n\n"))
                    and (not line.endswith(" }\n"))):
                # Ordinary interior line: keep accumulating.
                thread += line
class MemoryDict(object):
    """Dict-like wrapper over an in-memory pydblite base with an indexed key."""

    def __init__(self, name="", key="", value=None):
        # None-sentinel instead of a shared mutable default; ['col'] remains
        # the effective default.
        if value is None:
            value = ['col']
        self.db = Base(name, save_to_file=False)
        self.db.create(key, *value)
        self.db.create_index(key)
        self.key = key
        self.value = value

    def give_me_elem(self, key):
        """Return the list of records whose indexed key equals *key*."""
        # pydblite exposes an index as attribute _<fieldname>; getattr gives
        # the same lookup the original built via eval(), without executing a
        # constructed code string.
        return getattr(self.db, '_' + self.key)[key]

    def is_in(self, key):
        """True if a record is stored under *key*."""
        return len(self.give_me_elem(key)) > 0

    def insert(self, key="", value=None):
        """Insert *value* under *key*, overwriting the columns if present."""
        if value is None:
            value = [""]
        record = self.give_me_elem(key)
        if len(record) > 0:
            # Key already present: update the value columns in place.
            for i in range(len(self.value)):
                record[0][self.value[i]] = value[i]
        else:
            self.db.insert(key, *value)

    def pop(self, key):
        """Delete the record stored under *key*, if any."""
        record = self.give_me_elem(key)
        if len(record) > 0:
            rec_id = record[0]['__id__']
            del self.db[rec_id]

    def iteritems(self):
        """Return all records as a list."""
        return list(self.db)

    def len(self):
        return len(self.db)

    def print_all(self):
        for r in self.db:
            print(r)
class MemoryQueue(object):
    """FIFO queue backed by an in-memory pydblite base.

    Record ids are monotonically increasing, so [st, en) is the live window:
    st is the id of the front record, en is one past the back.
    """

    def __init__(self, name="", col_names=None):
        # None-sentinel instead of a shared mutable default.
        if col_names is None:
            col_names = ['col']
        self.db = Base(name, save_to_file=False)
        self.db.create(*col_names)
        self.st = 0  # id of front record
        self.en = 0  # id one past the back record

    def pop(self):
        """Remove and return the front record, or None if the queue is empty.

        Bug fix: the original printed 'Queue Empty' but then fell through and
        raised when indexing the empty base.
        """
        if self.st == self.en:
            print('Queue Empty')
            return None
        ret = self.db[self.st]
        del self.db[self.st]
        self.st += 1
        return ret

    def top(self):
        """Return the front record without removing it, or None if empty."""
        if self.st == self.en:
            print('Queue Empty')
            return None
        return self.db[self.st]

    def push(self, arg=None):
        """Append one record; *arg* is the sequence of column values."""
        if arg is None:
            arg = ['']
        self.db.insert(*arg)
        self.en += 1

    def print_queue(self):
        for r in self.db:
            print(r)

    def is_empty(self):
        return self.st == self.en
def pydblite():
    """Walk the pydblite pure-Python engine API end to end (demo)."""
    from pydblite.pydblite import Base
    base = Base('dummy', save_to_file=False)
    # Define the schema.
    base.create('name', 'age', 'size')
    # One record; records are dicts carrying a unique integer key __id__.
    base.insert(name='homer', age=23, size=1.84)
    # Selection by exact field value ...
    homers = base(name="homer")
    # ... or by an arbitrary list-comprehension filter.
    res = [rec for rec in base if 30 > rec['age'] >= 18 and rec['size'] < 2]
    print("res:", res)
    # Delete a single record.
    base.delete(homers[0])
    # Insert a couple more and remember the stored records.
    batch = []
    for who, years, height in (('homer', 23, 1.84), ('marge', 36, 1.94)):
        rid = base.insert(name=who, age=years, size=height)
        batch.append(base[rid])
    # Selection can also be a generator expression (nothing materialized).
    for rec in (rec for rec in base if rec['name'] in ('homer', 'marge')):
        pass
    # Delete a list of records in one call.
    base.delete(batch)
    bart_id = base.insert(name='Bart', age=15, size=1.34)
    record = base[bart_id]  # direct access by id
    # Delete a record by its id.
    del base[bart_id]
    # Index a field for fast exact-match queries.
    base.create_index('age')
    # Update a record fetched by id.
    lisa_id = base.insert(name='Lisa', age=13, size=1.24)
    record = base[lisa_id]
    base.update(record, age=24)
    # Schema evolution: add and drop fields.
    base.add_field('new_field', default=0)
    base.drop_field('name')
    # Persist to disk.
    base.commit()
def pydblite():
    """Demonstrate the pydblite pure-Python engine API.

    Fix: the original used the Python 2 print statement
    (``print "res:", res``), a SyntaxError on Python 3; it is now the
    print() function, matching the rest of the file.
    """
    from pydblite.pydblite import Base
    db = Base('dummy', save_to_file=False)
    # create new base with field names
    db.create('name', 'age', 'size')
    # insert new record
    db.insert(name='homer', age=23, size=1.84)
    # records are dictionaries with a unique integer key __id__
    # simple selection by field value
    records = db(name="homer")
    # complex selection by list comprehension
    res = [r for r in db if 30 > r['age'] >= 18 and r['size'] < 2]
    print("res:", res)
    # delete a record or a list of records
    r = records[0]
    db.delete(r)
    list_of_records = []
    r = db.insert(name='homer', age=23, size=1.84)
    list_of_records.append(db[r])
    r = db.insert(name='marge', age=36, size=1.94)
    list_of_records.append(db[r])
    # or generator expression
    for r in (r for r in db if r['name'] in ('homer', 'marge')):
        # print("record:", r)
        pass
    db.delete(list_of_records)
    rec_id = db.insert(name='Bart', age=15, size=1.34)
    record = db[rec_id]  # the record such that record['__id__'] == rec_id
    # delete a record by its id
    del db[rec_id]
    # create an index on a field
    db.create_index('age')
    # update
    rec_id = db.insert(name='Lisa', age=13, size=1.24)
    # direct access by id
    record = db[rec_id]
    db.update(record, age=24)
    # add and drop fields
    db.add_field('new_field', default=0)
    db.drop_field('name')
    # save changes on disk
    db.commit()
def test_open(self):
    """Creating an in-memory base and inserting a record must not raise."""
    base = Base('dummy', save_to_file=False)
    base.create('name', 'age', 'size')
    base.insert(name='homer', age=23, size=1.84)
class PyDbLiteTestCase(Generic, unittest.TestCase):
    """Tests for the pydblite pure-Python engine (Base create/open/insert).

    Uses a fresh in-memory base per test (setUp) and removes any on-disk
    artifact named test_db_name afterwards (tearDown).
    """

    def setUp(self):  # NOQA
        # Fresh in-memory base with three columns for the filter tests.
        self.first_record_id = 0
        filter_db = Base(test_db_name, save_to_file=False)
        filter_db.create('unique_id', 'name', "active", mode="override")
        self.filter_db = filter_db

    def tearDown(self):  # NOQA
        # Some tests create a file, one creates a directory with that name.
        if os.path.isfile(test_db_name):
            os.remove(test_db_name)
        elif os.path.isdir(test_db_name):
            os.rmdir(test_db_name)

    def setup_db_for_filter(self):
        """Insert the prepared status rows; last insert must get id 6 (7 rows)."""
        self.reset_status_values_for_filter()
        for d in self.status:
            res = self.filter_db.insert(**d)
        self.assertEqual(res, 6)

    def test_open(self):
        # Creating an in-memory base and inserting must not raise.
        db = Base('dummy', save_to_file=False)
        db.create('name', 'age', 'size')
        db.insert(name='homer', age=23, size=1.84)

    def test_open_file_with_existing_dir(self):
        os.mkdir(test_db_name)
        db = Base(test_db_name, save_to_file=True)
        # A dir with that name exists
        self.assertRaises(IOError, db.create, 'unique_id', 'name', "active",
                          mode="open")

    def test_open_existing(self):
        db = Base(test_db_name, save_to_file=True)
        db.create('unique_id', 'name', "active", mode="open")
        db.insert("123", "N", True)
        db.commit()
        # Just verify that it works to open an existing db.
        # The column names are ignored, therefore they should
        # equal the old column names
        db = Base(test_db_name, save_to_file=True)
        db.create('unique_id2', 'name2', "active2", mode="open")
        rec = db.insert("123", "N", True)
        db.commit()
        self.assertEqual(db.fields, ['unique_id', 'name', "active"])
        # mode="override" will overwrite existing db
        db = Base(test_db_name, save_to_file=True)
        db.create('unique_id', 'name', "active", mode="override")
        db.commit()
        self.assertEqual(len(self.filter_db), 0)
        # Equals passing mode=None
        self.assertRaises(IOError, db.create, 'unique_id', 'name', "active")
        self.assertRaises(ValueError, db.create, 'unique_id', 'name',
                          "active", mode="invalidmode")

    def test_open_memory(self):
        # ":memory:" forces an in-memory base regardless of save_to_file.
        db = Base(":memory:")
        self.assertFalse(db.save_to_file)

    def test_open_memory_with_existing_filename(self):
        # A committed on-disk base can be re-opened read-style in memory...
        self.filter_db = Base(test_db_name, save_to_file=True)
        self.filter_db.create('unique_id', 'name', "active", mode="override")
        self.filter_db.commit()
        db = Base(test_db_name, save_to_file=False)
        db.open()
        self.assertEqual(db.fields, ['unique_id', 'name', "active"])
        # ...while override replaces the schema entirely.
        db = Base(test_db_name, save_to_file=False)
        db.create('unique_id2', 'name2', "active2", mode="override")
        self.assertEqual(db.fields, ['unique_id2', 'name2', "active2"])

    def test_insert_list(self):
        # Without sqlite_compat, a tuple is stored whole in the first column.
        status = (8, "testname", 0)
        # Insert 7 entries
        rec = self.filter_db.insert(status)
        self.assertEqual(rec, 0)
        self.assertEqual(self.filter_db[rec]["unique_id"], status)

    def test_sqlite_compat_insert_list(self):
        # With sqlite_compat, a list of tuples is unpacked column-wise.
        self.filter_db = Base(test_db_name, save_to_file=False,
                              sqlite_compat=True)
        self.filter_db.create('unique_id', 'name', "active", mode="override")
        status = [(8, "testname", 0)]
        # Insert 1 entries
        rec = self.filter_db.insert(status)
        self.assertEqual(rec, None)
        self.assertEqual(len(self.filter_db), 1)
        self.assertEqual(self.filter_db[0]["unique_id"], 8)
        self.assertEqual(self.filter_db[0]["name"], "testname")
        self.assertEqual(self.filter_db[0]["active"], 0)

    def test_sqlite_compat(self):
        db = Base(test_db_name,
                  save_to_file=False, sqlite_compat=True)
        db.create('unique_id', 'name', "active", mode="open")
        self.reset_status_values_for_filter()
        # Insert 7 entries
        res = db.insert(self.status)
        self.assertEqual(res, None)
        self.assertEqual(len(db), 7)
        status = [(8, "testname", 0)]
        res = db.insert(status)
        self.assertEqual(res, None)
        self.assertEqual(len(db), 8)
class PathSegmentDB(object):
    """Simple database for paths using PyDBLite"""

    def __init__(self, segment_ttl=None, max_res_no=None,
                 labels=None):  # pragma: no cover
        """
        :param int segment_ttl:
            The TTL for each record in the database (in s) or None to just use
            the segment's expiration time.
        :param int max_res_no: Number of results returned for a query.
        :param dict labels:
            Labels added to the exported metrics. The following labels are
            supported:
                - server_id: A unique identifier of the server that is
                  exporting
                - isd_as: The ISD_AS of where the server is running
                - type: A generic label for the type of the revocations.
        """
        self._db = None
        self._lock = threading.Lock()  # guards all access to self._db
        self._segment_ttl = segment_ttl
        self._max_res_no = max_res_no
        self._labels = labels
        if self._labels:
            self._init_metrics()
        self._setup_db()

    def _init_metrics(self):  # pragma: no cover
        # Zero out all exported gauges/counters for this label set.
        SEGS_TOTAL.labels(**self._labels).set(0)
        SEGS_BYTES.labels(**self._labels).set(0)
        SEGS_ADDED.labels(**self._labels).inc(0)
        SEGS_REMOVED.labels(**self._labels).inc(0)

    def _setup_db(self):  # pragma: no cover
        # (Re)create the in-memory base; also used by flush() to clear it.
        with self._lock:
            self._db = Base("", save_to_file=False)
            self._db.create('record', 'id', 'first_isd', 'first_as',
                            'last_isd', 'last_as', 'sibra', mode='override')
            self._db.create_index('id')
            self._db.create_index('last_isd')
            self._db.create_index('last_as')

    def __getitem__(self, seg_id):  # pragma: no cover
        """Return a path object by segment id."""
        with self._lock:
            recs = self._db(id=seg_id)
            if recs:
                return recs[0]['record'].pcb
        return None

    def __contains__(self, seg_id):  # pragma: no cover
        # Membership test by segment id.
        with self._lock:
            recs = self._db(id=seg_id)
            return len(recs) > 0

    def flush(self):  # pragma: no cover
        """Removes all records from the database."""
        if self._labels:
            SEGS_REMOVED.labels(**self._labels).inc(len(self))
            SEGS_TOTAL.labels(**self._labels).set(0)
            SEGS_BYTES.labels(**self._labels).set(0)
        self._setup_db()

    def update(self, pcb, reverse=False):
        """
        Insert path into database.
        Return the result of the operation.
        """
        first_ia = pcb.first_ia()
        last_ia = pcb.last_ia()
        if reverse:
            first_ia, last_ia = last_ia, first_ia
        if self._segment_ttl:
            # 'now' is only bound on this branch; the later use below is
            # guarded by the same condition, so it is always defined there.
            now = int(SCIONTime.get_time())
            record = PathSegmentDBRecord(pcb, now + self._segment_ttl)
        else:
            record = PathSegmentDBRecord(pcb)
        with self._lock:
            recs = self._db(id=record.id, sibra=pcb.is_sibra())
            assert len(recs) <= 1, "PathDB contains > 1 path with the same ID"
            if not recs:
                # Unknown segment: insert a fresh row.
                self._db.insert(record, record.id, first_ia[0], first_ia[1],
                                last_ia[0], last_ia[1], pcb.is_sibra())
                logging.debug("Added segment from %s to %s: %s",
                              first_ia, last_ia, pcb.short_desc())
                if self._labels:
                    SEGS_ADDED.labels(**self._labels).inc()
                    SEGS_TOTAL.labels(**self._labels).inc()
                    SEGS_BYTES.labels(**self._labels).inc(len(pcb))
                return DBResult.ENTRY_ADDED
            cur_rec = recs[0]['record']
            # Keep the stored segment if it expires later than the new one.
            if pcb.get_expiration_time() < cur_rec.pcb.get_expiration_time():
                return DBResult.NONE
            old_pcb = cur_rec.pcb
            cur_rec.pcb = pcb
            if self._segment_ttl:
                cur_rec.exp_time = now + self._segment_ttl
            else:
                cur_rec.exp_time = pcb.get_expiration_time()
            if self._labels:
                SEGS_ADDED.labels(**self._labels).inc()
                SEGS_BYTES.labels(**self._labels).inc(len(pcb) - len(old_pcb))
            return DBResult.ENTRY_UPDATED

    def delete(self, segment_id):
        """Deletes a path segment with a given ID."""
        with self._lock:
            recs = self._db(id=segment_id)
            if not recs:
                return DBResult.NONE
            self._db.delete(recs)
            assert len(recs) == 1
            if self._labels:
                SEGS_REMOVED.labels(**self._labels).inc()
                SEGS_TOTAL.labels(**self._labels).dec()
                SEGS_BYTES.labels(**self._labels).dec(
                    len(recs[0]['record'].pcb))
        return DBResult.ENTRY_DELETED

    def delete_all(self, segment_ids):
        """
        Deletes paths with the given IDs and returns the number of deletions.

        :param list segment_ids: The segment IDs to remove.
        :returns: The number of deletions.
        :rtype: int
        """
        deletions = 0
        for seg_id in segment_ids:
            if self.delete(seg_id) == DBResult.ENTRY_DELETED:
                deletions += 1
        return deletions

    def __call__(self, *args, full=False, **kwargs):
        """
        Selection by field values.

        Returns a sorted (path fidelity) list of paths according to the
        criterias specified.

        :param bool full:
            Return list of results not bounded by self._max_res_no.
        """
        kwargs = self._parse_call_kwargs(kwargs)
        with self._lock:
            recs = self._db(*args, **kwargs)
            # Prune expired rows while still holding the lock.
            valid_recs = self._exp_call_records(recs)
        return self._sort_call_pcbs(full, valid_recs)

    def _parse_call_kwargs(self, kwargs):  # pragma: no cover
        # Expand the convenience (isd, as) pair kwargs into the indexed
        # column kwargs; default to non-SIBRA segments.
        first_ia = kwargs.pop("first_ia", None)
        if first_ia:
            kwargs["first_isd"] = first_ia[0]
            kwargs["first_as"] = first_ia[1]
        last_ia = kwargs.pop("last_ia", None)
        if last_ia:
            kwargs["last_isd"] = last_ia[0]
            kwargs["last_as"] = last_ia[1]
        if "sibra" not in kwargs:
            kwargs["sibra"] = False
        return kwargs

    def _exp_call_records(self, recs):
        """Remove expired segments from the db."""
        now = int(SCIONTime.get_time())
        ret = []
        expired = []
        for r in recs:
            if r['record'].exp_time < now:
                expired.append(r)
                logging.debug("Path-Segment expired: %s",
                              r['record'].pcb.short_desc())
                continue
            ret.append(r)
        if expired:
            self._db.delete(expired)
        return ret

    def _sort_call_pcbs(self, full, valid_recs):  # pragma: no cover
        # Sort by fidelity and cap the result list unless 'full' was asked.
        seg_recs = sorted([r['record'] for r in valid_recs],
                          key=lambda x: x.fidelity)
        if self._max_res_no and not full:
            seg_recs = seg_recs[:self._max_res_no]
        return [r.pcb for r in seg_recs]

    def __len__(self):  # pragma: no cover
        with self._lock:
            return len(self._db)
# NOTE(review): fragment of a larger routine -- date_table and all the local
# values below are produced by surrounding code not visible here.
# Insert one fully-denormalized row of a calendar date dimension: the raw
# date plus every precomputed day/week/month/quarter/year attribute,
# preformatted renderings, and boundary/holiday flags.
date_table.insert(
    date=date, julian_date_num=julian_date_num, sequence=sequence,
    # day within week
    week_day_num=week_day_num, day_name=day_name,
    day_short_name=day_short_name,
    # week numbers within month / quarter / year, with begin/end dates
    month_week_num=month_week_num,
    month_week_begin_date=month_week_begin_date,
    month_week_end_date=month_week_end_date,
    quarter_week_num=quarter_week_num,
    quarter_week_begin_date=quarter_week_begin_date,
    quarter_week_end_date=quarter_week_end_date,
    year_week_num=year_week_num,
    year_week_begin_date=year_week_begin_date,
    year_week_end_date=year_week_end_date,
    # month-level attributes
    month_day_num=month_day_num, month_num=month_num,
    month_name=month_name, month_short_name=month_short_name,
    month_begin_date=month_begin_date, month_end_date=month_end_date,
    # quarter-level attributes
    quarter_day_num=quarter_day_num, quarter_num=quarter_num,
    quarter_name=quarter_name, quarter_begin_date=quarter_begin_date,
    quarter_end_date=quarter_end_date,
    # year-level attributes
    year_day_num=year_day_num, year_num=year_num,
    year_begin_date=year_begin_date, year_end_date=year_end_date,
    # preformatted string renderings of the date
    dd_mon_yyyy=dd_mon_yyyy, dd_month_yyyy=dd_month_yyyy,
    mon_dd_yyyy=mon_dd_yyyy, month_dd_yyyy=month_dd_yyyy,
    dd_mm_yyyy=dd_mm_yyyy, mm_dd_yyyy=mm_dd_yyyy, mm_dd_yy=mm_dd_yy,
    dd_mm_yy=dd_mm_yy, m_d_yy=m_d_yy, d_m_yy=d_m_yy,
    # boundary / classification flags
    weekday_flag=weekday_flag, week_first_day_flag=week_first_day_flag,
    week_last_day_flag=week_last_day_flag,
    month_first_day_flag=month_first_day_flag,
    month_last_day_flag=month_last_day_flag,
    quarter_first_day_flag=quarter_first_day_flag,
    quarter_last_day_flag=quarter_last_day_flag,
    year_first_day_flag=year_first_day_flag,
    year_last_day_flag=year_last_day_flag, leap_year_flag=leap_year_flag,
    # holiday metadata
    is_holiday=is_holiday, holiday_name=holiday_name,
    nth_weekday=nth_weekday)
class DataStoring():
    """Persistence layer for scraped SCN threads.

    Mirrors the threads into a JSON dump (scnscraper/abap.json) and a
    pydblite base (scnscraper/abap.pydb), and tracks the scraper's page
    index in scnscraper/index.txt.
    """

    #Inizialize an instantiated object by opening json file and the database
    def __init__(self):
        # Touch the JSON dump so later appends always find the file.
        self.out_file = open("scnscraper/abap.json", "a")
        self.out_file.close()
        self.db = Base("scnscraper/abap.pydb")
        if self.db.exists():
            self.db.open()
        else:
            self.db.create('url', 'uid', 'type', 'author', 'title',
                           'date_time', 'tags', 'views', 'answers',
                           'resolve', 'upvotes', 'text')

    #for each thread scraped, insert it into db
    def insert_items_into_db(self, threads):
        for thread in threads:
            item = SapItem()  # New Item instance
            # NOTE(review): immediately rebound -- the SapItem() above is
            # discarded; presumably kept only as a type hint to the reader.
            item = thread
            try:
                # Insert into db
                self.db.insert(url=str(item["url"]), uid=str(item["uid"]),
                               type=str(item["type"]),
                               author=str(item["author"]),
                               title=str(item["title"]),
                               date_time=str(item["date_time"]),
                               tags=str(item["tags"]),
                               views=str(item["views"]),
                               answers=str(item["answers"]),
                               resolve=str(item["resolve"]),
                               upvotes=str(item["upvotes"]),
                               text=str(item["text"]))
            except UnicodeEncodeError:
                print("Unicode Encode Exception!")
        #save changes on disk
        self.db.commit()

    # for each thread scraped, initialize the string to insert into json file
    def threads_to_str(self, threads):
        out_string = "[ "
        if threads.__len__() == 0:
            return ""
        for thread in threads:
            item = SapItem()
            item = thread
            try:
                # One pseudo-JSON object per thread, single-quoted values.
                out_string += "{ url: '" + str(item["url"]) + "', " + "uid: '" + str(item["uid"]) + "', "\
                    "type: '" + str(item["type"]) + "', "\
                    "author: '" + str(item["author"]) + "', " \
                    "title: '" + str(item["title"]) + "', "\
                    "date_time: '" + str(item["date_time"]) + "', " \
                    "tags: '" + str(item["tags"]) + "', " \
                    "views: '" + str(item["views"]) + "', "\
                    "answers: '" + str(item["answers"]) + "', " \
                    "resolve: '" + str(item["resolve"]) + "', " \
                    "upvotes: '" + str(item["upvotes"]) + "', "\
                    "text: '" + str(item["text"]) + "' }\n"
            except UnicodeEncodeError:
                print("Unicode Encode Exception!")
        out_string += " ]\n\n"
        return out_string

    #for each thread scraped, insert it into json file
    def insert_items_into_file(self, threads):
        try:
            self.out_file = open("scnscraper/abap.json", "a")  # open in append mode
            #convert into string and insert into file
            self.out_file.write(self.threads_to_str(threads))
            self.out_file.close()
        except:
            # NOTE(review): deliberately best-effort -- any failure is
            # reported and the handle closed.
            print('Exception in writing file')
            self.out_file.close()

    # read the web page index
    def read_index_from_file(self):
        # Returns the persisted page index, initializing the file to 2 if
        # it does not exist yet.
        if os.path.exists('scnscraper/index.txt'):
            with open('scnscraper/index.txt') as f:
                index = int(f.readline())
                f.close()  # redundant inside 'with'; kept as-is
        else:
            f = open('scnscraper/index.txt', 'w')
            index = 2
            f.write(str(index))
            f.close()
        return index

    # Write the web page index
    def write_index_into_file(self, i):
        f = open('scnscraper/index.txt', 'w')
        f.write(str(i))
        f.close()

    # Convert the content of json file into a new db
    def from_json_to_db(self):
        thread = ''
        db = Base("scnscraper/abap.pydb", save_to_file=True)
        # create new base with field names
        db.create('url', 'uid', 'type', 'author', 'title', 'date_time',
                  'tags', 'views', 'answers', 'resolve', 'upvotes', 'text',
                  mode='override')
        i = 0  # NOTE(review): never used afterwards
        # NOTE(review): 'scnsraper' is likely a typo for 'scnscraper' --
        # confirm against the file the scraper actually writes.
        with open('scnsraper/threads.json', 'r') as file:
            for line in file:
                if (line.endswith(" }\n")):
                    # Closing line of a thread: parse the accumulated record.
                    thread += line
                    tokens = re.search(
                        r"url:\s'(.*?)',\suid:\s'(.*?)',\stype:\s'(.*?)',\sauthor:\s'(.*?)',\stitle:\s'(.*?)',\sdate_time:\s'(.*?)',\stags:\s'(.*?)',\sviews:\s'(.*?)',\sanswers:\s'(.*?)',\sresolve:\s'(.*?)',\supvotes:\s'(.*?)', text:\s'((.|\n)*)'\s}",
                        str(thread))
                    if tokens is not None:
                        db.insert(url=tokens.group(1), uid=tokens.group(2),
                                  type=tokens.group(3),
                                  author=tokens.group(4),
                                  title=tokens.group(5),
                                  date_time=tokens.group(6),
                                  tags=tokens.group(7),
                                  views=tokens.group(8),
                                  answers=tokens.group(9),
                                  resolve=tokens.group(10),
                                  upvotes=tokens.group(11),
                                  text=tokens.group(12))
                        db.commit()
                    print('\n--------------------------------------------\n')
                    thread = ''
                if (line.startswith(" ]")):
                    # Page delimiter: reset the accumulator.
                    print("new page")
                    thread = ''
                if (line.endswith('\n') and (not line.startswith(" ]\n\n"))
                        and (not line.endswith(" }\n"))):
                    thread += line

    def state_extraction():
        # Print summary statistics of the scraped discussions (by resolve
        # state); no-op if the base does not exist yet.
        db = Base("scnscraper/abap.pydb")
        if db.exists():
            db.open()
            record = db(type="Question")
            print("# discussion scraped: " + str(record.__len__()))
            print("Answered: " + str(db(resolve="Answered.").__len__()))
            print("Answered with solution: " + str(db(resolve="solution").__len__()))
            print("Not Answered: " + str(db(resolve="Not Answered.").__len__()))
            print("Assumed Answered: " + str(db(resolve="Assumed Answered.").__len__()))
    # Pre-decorator staticmethod registration style.
    state_extraction = staticmethod(state_extraction)
class DataStoring():
    """Stores scraped SCN threads into a JSON dump and a pydblite base.

    Files used: scnscraper/abap.json (append-only dump),
    scnscraper/abap.pydb (pydblite base), scnscraper/index.txt (page index).
    """

    #Inizialize an instantiated object by opening json file and the database
    def __init__(self):
        # Ensure the JSON dump exists before any append.
        self.out_file = open("scnscraper/abap.json", "a")
        self.out_file.close()
        self.db = Base("scnscraper/abap.pydb")
        if self.db.exists():
            self.db.open()
        else:
            self.db.create('url', 'uid', 'type', 'author', 'title',
                           'date_time', 'tags', 'views', 'answers',
                           'resolve', 'upvotes', 'text')

    #for each thread scraped, insert it into db
    def insert_items_into_db(self, threads):
        for thread in threads:
            item = SapItem()  # New Item instance
            # NOTE(review): rebinding discards the SapItem() created above.
            item = thread
            try:
                # Insert into db
                self.db.insert(url=str(item["url"]), uid=str(item["uid"]),
                               type=str(item["type"]),
                               author=str(item["author"]),
                               title=str(item["title"]),
                               date_time=str(item["date_time"]),
                               tags=str(item["tags"]),
                               views=str(item["views"]),
                               answers=str(item["answers"]),
                               resolve=str(item["resolve"]),
                               upvotes=str(item["upvotes"]),
                               text=str(item["text"]))
            except UnicodeEncodeError:
                print("Unicode Encode Exception!")
        #save changes on disk
        self.db.commit()

    # for each thread scraped, initialize the string to insert into json file
    def threads_to_str(self, threads):
        out_string = "[ "
        if threads.__len__() == 0:
            return ""
        for thread in threads:
            item = SapItem()
            item = thread
            try:
                # One pseudo-JSON object per thread, single-quoted values.
                out_string += "{ url: '" + str(item["url"]) + "', " + "uid: '" + str(item["uid"]) + "', "\
                    "type: '" + str(item["type"]) + "', "\
                    "author: '" + str(item["author"]) + "', " \
                    "title: '" + str(item["title"]) + "', "\
                    "date_time: '" + str(item["date_time"]) + "', " \
                    "tags: '" + str(item["tags"]) + "', " \
                    "views: '" + str(item["views"]) + "', "\
                    "answers: '" + str(item["answers"]) + "', " \
                    "resolve: '" + str(item["resolve"]) + "', " \
                    "upvotes: '" + str(item["upvotes"]) + "', "\
                    "text: '" + str(item["text"]) + "' }\n"
            except UnicodeEncodeError:
                print("Unicode Encode Exception!")
        out_string += " ]\n\n"
        return out_string

    #for each thread scraped, insert it into json file
    def insert_items_into_file(self, threads):
        try:
            self.out_file = open("scnscraper/abap.json", "a")  # open in append mode
            #convert into string and insert into file
            self.out_file.write(self.threads_to_str(threads))
            self.out_file.close()
        except:
            # NOTE(review): best-effort write -- failures are reported and
            # the handle closed.
            print('Exception in writing file')
            self.out_file.close()

    # read the web page index
    def read_index_from_file(self):
        # Returns the persisted page index; creates the file with index 2
        # when absent.
        if os.path.exists('scnscraper/index.txt'):
            with open('scnscraper/index.txt') as f:
                index = int(f.readline())
                f.close()  # redundant inside 'with'; kept as-is
        else:
            f = open('scnscraper/index.txt', 'w')
            index = 2
            f.write(str(index))
            f.close()
        return index

    # Write the web page index
    def write_index_into_file(self, i):
        f = open('scnscraper/index.txt', 'w')
        f.write(str(i))
        f.close()

    # Convert the content of json file into a new db
    def from_json_to_db(self):
        thread = ''
        db = Base("scnscraper/abap.pydb", save_to_file=True)
        # create new base with field names
        db.create('url', 'uid', 'type', 'author', 'title', 'date_time',
                  'tags', 'views', 'answers', 'resolve', 'upvotes', 'text',
                  mode='override')
        i = 0  # NOTE(review): never used afterwards
        # NOTE(review): 'scnsraper' is likely a typo for 'scnscraper' --
        # confirm before relying on this path.
        with open('scnsraper/threads.json', 'r') as file:
            for line in file:
                if (line.endswith(" }\n")):
                    # Closing line of a thread: parse the accumulated record.
                    thread += line
                    tokens = re.search(
                        r"url:\s'(.*?)',\suid:\s'(.*?)',\stype:\s'(.*?)',\sauthor:\s'(.*?)',\stitle:\s'(.*?)',\sdate_time:\s'(.*?)',\stags:\s'(.*?)',\sviews:\s'(.*?)',\sanswers:\s'(.*?)',\sresolve:\s'(.*?)',\supvotes:\s'(.*?)', text:\s'((.|\n)*)'\s}",
                        str(thread))
                    if tokens is not None:
                        db.insert(url=tokens.group(1), uid=tokens.group(2),
                                  type=tokens.group(3),
                                  author=tokens.group(4),
                                  title=tokens.group(5),
                                  date_time=tokens.group(6),
                                  tags=tokens.group(7),
                                  views=tokens.group(8),
                                  answers=tokens.group(9),
                                  resolve=tokens.group(10),
                                  upvotes=tokens.group(11),
                                  text=tokens.group(12))
                        db.commit()
                    print('\n--------------------------------------------\n')
                    thread = ''
                if (line.startswith(" ]")):
                    # Page delimiter: reset the accumulator.
                    print("new page")
                    thread = ''
                if (line.endswith('\n') and (not line.startswith(" ]\n\n"))
                        and (not line.endswith(" }\n"))):
                    thread += line

    def state_extraction():
        # Print summary statistics (counts per resolve state); does nothing
        # if the base has not been created yet.
        db = Base("scnscraper/abap.pydb")
        if db.exists():
            db.open()
            record = db(type="Question")
            print("# discussion scraped: " + str(record.__len__()))
            print("Answered: " + str(db(resolve="Answered.").__len__()))
            print("Answered with solution: " + str(db(resolve="solution").__len__()))
            print("Not Answered: " + str(db(resolve="Not Answered.").__len__()))
            print("Assumed Answered: " + str(db(resolve="Assumed Answered.").__len__()))
    # Pre-decorator staticmethod registration style.
    state_extraction = staticmethod(state_extraction)