def get_external_dataset():
    logger.info("Get_external_dataset")
    with open(fjoin(external_data, 'pos.txt'), 'r') as infile:
        pos_reviews = infile.readlines()
    with open(fjoin(external_data, 'neg.txt'), 'r') as infile:
        neg_reviews = infile.readlines()
    with open(fjoin(external_data, 'unsup.txt'), 'r') as infile:
        unsup_reviews = infile.readlines()
    logger.info("pos:{} neg:{} unsup:{}".format(len(pos_reviews),
                                                len(neg_reviews),
                                                len(unsup_reviews)))
    # positive reviews are labelled 1, negative reviews 0
    y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos_reviews, neg_reviews)), y, test_size=0.2)
    x_train = cleanText(x_train)
    x_test = cleanText(x_test)
    unsup_reviews = cleanText(unsup_reviews)
    x_train = labelizeReviews(x_train, 'EXTERNAL_TRAIN')
    x_test = labelizeReviews(x_test, 'EXTERNAL_TEST')
    unsup_reviews = labelizeReviews(unsup_reviews, 'EXTERNAL_UNSUP')
    return x_train, x_test, unsup_reviews, y_train, y_test
def extract_sentence(fdir, sdir):
    for parents, dirnames, filenames in os.walk(fdir):
        for filename in filenames:
            #if not filename == '1d16a571f14fb1032bc19e9314a46deb.cmp.txt':
            #    continue
            logger.info(filename)
            save_file = fjoin(sdir, filename)
            with open(fjoin(parents, filename)) as infile:
                file = [f.decode("utf-8") for f in infile.readlines()]
            file = decompose(''.join(file))
            file.to_csv(save_file)
def vendor(filename):
    if app.debug:
        if "vendor" in request.path:
            filename = fjoin('vendor', secure_filename(filename))
        else:
            filename = secure_filename(filename)
        with open(fjoin('client', 'src', 'js', filename)) as f:
            script = f.read()
        response = make_response(script)
        response.headers['Content-Type'] = 'application/javascript'
        return response
def generate_train_dataset(annotation_dir, sentence_dir):
    train_pd = pd.DataFrame()
    for parents, dirnames, filenames in os.walk(sentence_dir):
        for filename in filenames:
            annotation = fjoin(annotation_dir,
                               filename[:-len('.cmp.txt')] + '.best.xml')
            print annotation
            annotation_pd = pd.read_csv(annotation)
            sentences_pd = pd.read_csv(fjoin(parents, filename))
            for index, row in annotation_pd.iterrows():
                sent, label = find_sentence(row, sentences_pd)
                train_pd = train_pd.append({'sent': sent, 'label': label},
                                           ignore_index=True)
    logger.info(train_pd.columns)
    return train_pd
def train(pos, neg, x_train, x_test, external_x_train, external_x_test,
          external_unsup_reviews, size=400, epoch_num=10):
    logger.info("Train sentence model(dm, dbow)")
    model_dm = gensim.models.Doc2Vec(min_count=1,
                                     window=10,
                                     size=size,
                                     sample=1e-3,
                                     negative=5,
                                     workers=6)
    model_dbow = gensim.models.Doc2Vec(min_count=1,
                                       window=10,
                                       size=size,
                                       sample=1e-3,
                                       negative=5,
                                       dm=0,
                                       workers=6)
    # use all words to build the vocabulary
    vocab_document = x_train + x_test + external_x_train + external_x_test + external_unsup_reviews
    model_dm.build_vocab(vocab_document)
    model_dbow.build_vocab(vocab_document)
    # repeat training; shuffle the sequence every epoch to improve accuracy
    tmp_x_train = x_train + x_test + external_x_train + external_x_test
    print tmp_x_train[1:2]
    for epoch in range(epoch_num):
        logger.info("train epoch {}".format(epoch))
        random.shuffle(tmp_x_train)
        model_dm.train(tmp_x_train)
        model_dbow.train(tmp_x_train)
    # train test dataset
    '''
    tmp_x_test = x_test
    for epoch in range(epoch_num):
        logger.info("test epoch {}".format(epoch))
        random.shuffle(tmp_x_test)
        model_dm.train(tmp_x_test)
        model_dbow.train(tmp_x_test)
    '''
    model_dm.save(fjoin(model_dir, 'doc2vec_dm'))
    model_dbow.save(fjoin(model_dir, 'doc2vec_dbow'))
    return model_dm, model_dbow
def pps_calculator(self):
    pps_file_path = self.get_pps_file_path()
    with open(fjoin(pps_file_path, 'rx_packets')) as f:
        rx_origin = int(f.read())
    with open(fjoin(pps_file_path, 'tx_packets')) as f:
        tx_origin = int(f.read())
    sleep(5)
    with open(fjoin(pps_file_path, 'rx_packets')) as f:
        rx_now = int(f.read())
    with open(fjoin(pps_file_path, 'tx_packets')) as f:
        tx_now = int(f.read())
    # packets per second over the 5-second sampling window
    rx_pps = (rx_now - rx_origin) / 5
    tx_pps = (tx_now - tx_origin) / 5
    self.pps = {'rx_pps': rx_pps, 'tx_pps': tx_pps}
    self.pps_timer_starter()
def main(args):
    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)
    if isfile(checkpoint_path + "frontier_map.pt"):
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
    else:
        raise Exception("checkpoint not found")
    count = 0
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))
        url = res['docno']
        inlinks = list(frontier_map[url].inlinks)
        count += 1
        write_to_graph(url, inlinks, count)
    fo.close()
def get_video_filelist(basepath):
    videos = []
    for root, dirs, files in os.walk(basepath):
        for name in files:
            if name.split('.')[-1] in allowed_extensions:
                videos.append(fjoin(root, name))
    return videos
def make_list(basepath):
    allowed_extensions = ['m4v', 'mp4', 'mov', 'wmv']
    videos = []
    for root, dirs, files in os.walk(basepath):
        for name in files:
            if name.split('.')[-1] in allowed_extensions:
                videos.append(fjoin(root, name))
    return videos
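# Usage sketch for make_list (the directory below is a hypothetical example,
# not a path from the original project):
#   videos = make_list('/srv/media')
#   # -> every file under /srv/media whose extension is m4v/mp4/mov/wmv,
#   #    returned as full paths built with fjoin (os.path.join)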
def handle_imdb_dataset(sentiment):
    print "in"
    data_type = ['train', 'test']
    save_file = fjoin("/home/apple/best/external_data/aclImdb",
                      sentiment + ".txt")
    save_file_handler = open(save_file, 'w+')
    reviews = []
    for dt in data_type:
        imdb_dir = fjoin("/home/apple/best/external_data/aclImdb",
                         dt + "/" + sentiment)
        print imdb_dir
        for parent, dirnames, filenames in os.walk(imdb_dir):
            print len(filenames)
            for filename in filenames:
                print filename
                with open(fjoin(parent, filename), 'r') as infile:
                    reviews.append("\n".join(infile.readlines()) + "\n")
    save_file_handler.writelines(reviews)
    save_file_handler.close()
def add_reference(self, extracted_num):
    plt.imshow(extracted_num)
    plt.show()
    isnum = input("is number: ")
    if isnum == "yes" or isnum == "y":
        n = input("which number: ")
        filename = "reference_num_" + str(n) + ".png"
        extracted_num.save(fjoin(num_ref_folder, filename))
def remove_office_duplicates(file_path):
    current_folder = getcwd()
    if not ("/" in file_path or "\\" in file_path):
        file_path = fjoin(current_folder, file_path)
    if not fexists(file_path) or not isfile(file_path):
        print "File does not exist: %s" % file_path
        return False
    print "\nFixing: %s" % file_path
    with open(file_path, 'r+') as the_f:
        data = the_f.read()
        soup = BeautifulSoup(data, "lxml")
        divs_list = soup.find_all('div')
        # import ipdb
        # ipdb.set_trace()
        duplicate_total_num = 0
        prev_div = None
        for div in divs_list:
            # print len(div.contents)
            try:
                if prev_div['style'] == div['style']:
                    prev_div.replace_with(div)
                    duplicate_total_num += 1
                    # print div['style']
            except (KeyError, TypeError):
                pass
            prev_div = div
        if duplicate_total_num:
            print "%s duplicate borders are removed" % duplicate_total_num
            the_f.seek(0)
            the_f.write(str(soup))
            the_f.truncate()
        else:
            print "No duplicate borders were found."
def main(args):
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    # create the settings and mapping of the index
    create_index(args.index, es)
    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)
    if isfile(checkpoint_path + "frontier.pt"):
        frontier = pickle.load(open(checkpoint_path + "frontier.pt", "rb"))
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
        id_to_url = pickle.load(open(checkpoint_path + "id_to_url.pt", "rb"))
        links_crawled = pickle.load(
            open(checkpoint_path + "links_crawled.pt", "rb"))
        current_wave = pickle.load(
            open(checkpoint_path + "current_wave.pt", "rb"))
    else:
        raise Exception("checkpoint not found")
    # load all the pickles of the crawled data
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))
        url = res['docno']
        title = res['head']
        content = res['text']
        inlinkData = list(frontier_map[url].inlinks)
        outlinkData = list(frontier_map[url].outlinks)
        print("inlink data : ", inlinkData)
        inlinks = json.dumps(inlinkData)
        outlinks = json.dumps(outlinkData)
        print("inlinks after json dumping : ", inlinks)
        store_in_ES(args.index, url, title, content, inlinks, outlinks, es)
def config(file):
    root = os.path.splitext(os.path.basename(file))[0]
    # config logger, create 2 handlers (file, console)
    logger = logging.getLogger('BEST.{}'.format(root))
    logger_fh = logging.FileHandler(fjoin('logs', 'BEST-{}.log'.format(root)))
    logger_ch = logging.StreamHandler()
    logger_formatter = logging.Formatter(
        '[%(levelname)s] %(asctime)s %(filename)s [line:%(lineno)d]: %(message)s'
    )
    logger_fh.setFormatter(logger_formatter)
    logger_ch.setFormatter(logger_formatter)
    logger.addHandler(logger_fh)
    logger.addHandler(logger_ch)
    logger.setLevel(logging.DEBUG)
    return logger
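# Usage sketch, assuming config() is imported from this module (the import
# name is up to the caller); the log filename is derived from the calling script:
#   logger = config(__file__)
#   logger.info("started")   # goes to the console and to logs/BEST-<script>.log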
def generate_image(self, local_path, local_title, remote_url):
    local_file = fjoin(local_path, local_title)
    try:
        os.stat(local_file)
    except OSError:
        try:
            log.debug("Source: %s" % remote_url)
            log.debug("Dest: %s" % local_file)
            fs.download_file(remote_url, local_file)
        except OSError:
            log.critical("Can't open %s for writing." % local_file)
            print "Can't open %s for writing." % local_file
            sys.exit(1)
        return local_file
    else:
        return local_file
def init_log(level, appdirs):
    levels = {'debug': logging.DEBUG, 'info': logging.INFO}
    log_filename = fjoin(appdirs.user_log_dir, "%s.log" % appdirs.appname)
    msg_fmt = '[%(asctime)s] %(name)-12s %(levelname)-8s %(message)s'
    date_fmt = '%m/%d %H:%M'
    level = levels.get(level, logging.NOTSET)
    try:
        logging.basicConfig(level=level,
                            format=msg_fmt,
                            datefmt=date_fmt,
                            filename=log_filename,
                            filemode='w')
    except IOError:
        # fall back to stderr if the log file can't be opened
        logging.basicConfig(level=level,
                            format=msg_fmt,
                            datefmt=date_fmt,
                            stream=sys.stderr)
def read_data(dir, savepath=''):
    data = {}
    # return the cached pickle if it already exists
    if os.path.isfile(savepath):
        with open(savepath, "rb") as handle:
            data = pickle.load(handle)
        return data
    files = os.listdir(dir)
    for file in tqdm.tqdm(files):
        path = fjoin(dir, file)
        text = open(path, 'r', encoding='ISO-8859-1').read()
        subject, body_text = get_text_from_email(text)
        data[file] = [subject, body_text]
    with open(savepath, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return data
def add_new_reference(self, item):
    plt.imshow(item)
    plt.show()
    self.incremental_ref_num += 1
    filename = fjoin(references_folder,
                     f"reference_{self.incremental_ref_num}.png")
    name = input("Give item name:\n> ")
    newir = ItemReference(filename, name, item)
    newir.image.save(newir.filename)
    self.references.append(newir)
    self.save_reference_file()
    return newir
def rewind(self):
    try:
        media = list(Media.select())
    except SQLObjectNotFound:
        msg = 'No media found with which to rewind'
        log.info(msg)
        sys.exit(msg)
    else:
        for medium in media:
            if medium.file_URI:
                if medium.original_file_URI:
                    log.debug('Moving: %s to %s' % (medium.file_URI,
                                                    medium.original_file_URI))
                    shutil.move(medium.file_URI, medium.original_file_URI)
                else:
                    log.debug('Original file location does not exist')
                    source_path = data.get_setting('source_path')
                    media_directory = medium.media_type
                    try:
                        log.debug("Franchise: %s" % medium.franchise.name)
                        new_title = medium.franchise.name
                    except SQLObjectNotFound:
                        log.debug('No franchise: %s' % medium.title)
                        new_title = medium.title
                    if medium.media_type == data.media_types[data.TV]:
                        filename = '%s S%sE%s.%s' % (new_title,
                                                     medium.season_number,
                                                     medium.episode_number,
                                                     medium.codec)
                    else:
                        filename = '%s.%s' % (new_title, medium.codec)
                    dest = fjoin(source_path, media_directory, filename)
                    log.debug('Moving: %s to %s' % (medium.file_URI, dest))
                    shutil.move(medium.file_URI, dest)
                    medium.file_URI = dest
            else:
                msg = "%s can't be rewound." % medium.title
                log.error(msg)
                pub.sendMessage('STD_OUT', msg=msg)
def index():
    if app.debug:
        with open(fjoin('client', 'src', 'index.html')) as f:
            index = f.read()
        return index
def get_pps_file_path(self):
    basepath = '/sys/class/net'
    # pick the first interface whose name starts with 'e' (e.g. eth0, enp0s3)
    for flist in listdir(basepath):
        if flist[0] == 'e':
            return fjoin(basepath, flist, 'statistics')
def process_files(self):
    filelist = fs.make_list(fs.get_basepath(data.get_setting('source_path')))
    self.org_type = data.get_setting('organization_method')
    for videofile in filelist:
        original_file_location = videofile
        if not self.exists_in_db(videofile):
            (path, video_filename, self.video_ext) = fs.fn_to_parts(videofile)
            # what are we looking up? tv? movie?
            if data.Media.media_types[data.Media.MOVIES].lower() in path.lower():
                self.lookup_movie(video_filename)
            elif data.Media.media_types[data.Media.TV].lower() in path.lower():
                self.lookup_tv(video_filename)
            else:
                log.critical("Sorry, I can't figure out how your video files are organized")
                print "Sorry, I can't figure out how your video files are organized"
                sys.exit(1)
            # were there multiple results for this?
            if len(self.results) > 1 and not self.options.first:
                selected = self.resolve_multiple_results(video_filename,
                                                         self.results)
                result = self.results[selected]
                process_vid = True
            elif len(self.results) == 1 or self.options.first:
                result = self.results[0]
                process_vid = True
            else:
                log.debug("No matches, skipping file")
                process_vid = False
            if process_vid:
                log.debug("Result: %s" % result.title)
                self.video = data.Media()
                self.video.fromAPIMedia(result)
        else:
            process_vid = True
        if process_vid:
            # should we organize?
            if data.get_setting('master_org'):
                self.video.file_URI = self.organize_file(videofile)
                self.video.original_file_URI = original_file_location
            else:
                self.video.file_URI = videofile
                self.video.original_file_URI = videofile
            # process the image for the video
            poster_filename = "%s.jpg" % self.get_filename_base(self.video.file_URI)
            if self.video.poster_remote_URI:
                self.generate_image(self.path, poster_filename,
                                    self.video.poster_remote_URI)
            elif self.video.media_type == data.media_types[data.TV] and self.folder_poster:
                shutil.copy2(self.video.franchise.poster_local_URI,
                             fjoin(self.path, poster_filename))
            # process the xml for the video if we're making individual
            # videofiles. if not, we'll process it all at the end
            if self.org_type == 'xml':
                self.generate_videoxml(self.path, self.video)
        try:
            del self.results
            del result
            del self.video
        except AttributeError:
            pass
    # we are going to generate a master video xml file containing all
    # entries
    if self.org_type == 'dir':
        self.generate_video_directory()
def organize_file(self, videofile):
    self.path = data.get_setting('dest_path')
    movies_by_genre = data.get_setting('movies_by_genre')
    tv_by_genre = data.get_setting('tv_series_by_genre')
    tv_by_series = data.get_setting('tv_series_by_series')
    log.debug("Path: %s" % self.path)
    mt = self.video.media_type
    tv = data.media_types[data.TV]
    movies = data.media_types[data.MOVIES]
    if self.video.media_type not in self.path:
        self.path = fjoin(self.path, self.video.media_type)
        log.debug("Missing media type in path. New path: %s" % self.path)
    if mt == movies:
        log.debug("MOVIES")
        if movies_by_genre:
            self.path = fjoin(self.path,
                              self.clean_name_for_fs(self.video.genres[0].name))
            log.debug("Organizing movies by genre. New path: %s" % self.path)
    elif mt == tv:
        log.debug("TV SHOWS")
        if tv_by_genre:
            self.path = fjoin(self.path,
                              self.clean_name_for_fs(self.video.genres[0].name))
            log.debug('Organizing TV by genre. New path: %s' % self.path)
        if tv_by_series:
            # series level directory
            self.path = fjoin(self.path,
                              self.clean_name_for_fs(self.video.franchise.name))
            self._make_path(self.path)
            if self.org_type == 'xml':
                # for videoxml, the images need to be same name as the
                # objects they represent
                (image_path, image_filename) = self.path.rsplit('/', 1)
                image_filename += '.jpg'
                self.folder_poster = self.generate_image(
                    image_path, image_filename,
                    self.video.franchise.poster_remote_URI)
                log.debug("Local poster URI: %s" % self.folder_poster)
                self.video.franchise.poster_local_URI = self.folder_poster
            else:
                self.folder_poster = self.generate_image(
                    self.path, 'poster.jpg',
                    self.video.franchise.poster_remote_URI)
                log.debug("Local poster URI: %s" % self.folder_poster)
                self.video.franchise.poster_local_URI = self.folder_poster
            log.debug("Adding franchise. New path: %s" % self.path)
            log.debug("Adding poster image %s" % self.folder_poster)
            # season level directory
            season = "Season %s" % self.video.season_number
            self.path = fjoin(self.path, season)
            self._make_path(self.path)
            if self.org_type == 'xml':
                image_dest = self.path + ".jpg"
                log.debug("Franchise: %s" % self.video.franchise)
                shutil.copy2(self.video.franchise.poster_local_URI, image_dest)
            else:
                shutil.copy2(self.video.franchise.poster_local_URI, self.path)
            log.debug('Organizing TV by series. New path: %s' % self.path)
    # path determination done, lets make sure it exists
    self._make_path(self.path)
    log.debug("Filename: %s" % self.video.title)
    if self.video.media_type == data.media_types[data.TV]:
        title_filename = "Episode %s: %s" % (self.video.episode_number,
                                             self.video.title)
        log.debug('Adding episode number to title: %s' % title_filename)
    else:
        title_filename = self.video.title
    video_destination = fs.generate_filename(self.path, title_filename,
                                             self.video_ext)
    log.debug("Destination: %s" % video_destination)
    shutil.move(videofile, video_destination)
    return video_destination
SECRET_KEY = YOUR_INFO_HERE
PASSWORD_SALT = YOUR_INFO_HERE

# ADMIN
ADMIN_USERNAME = YOUR_INFO_HERE
ADMIN_PASSWORD = YOUR_INFO_HERE

# RECAPTCHA FOR COMMENTS
RECAPTCHA_USE_SSL = True
RECAPTCHA_PUBLIC_KEY = YOUR_INFO_HERE
RECAPTCHA_PRIVATE_KEY = YOUR_INFO_HERE
RECAPTCHA_OPTIONS = YOUR_INFO_HERE  # (if needed)

# DB
DB_DRIVER = 'sqlite'
DB_NAME = fjoin(data_dir, 'beerlog.db')
DB_PROTOCOL = protocol

# AWS
AWS_ACCESS_KEY = YOUR_INFO_HERE
AWS_SECRET_KEY = YOUR_INFO_HERE
AWS_BUCKET_NAME = YOUR_INFO_HERE

# IMAGES
IMAGE_FULL_SIZE = 800.0
TEMP_UPLOAD_FOLDER = '/tmp/beerlog/'
ALLOWED_EXTENSIONS = set(['jpg', 'jpeg', 'png', 'gif'])
IMAGE_BASEPATH = YOUR_INFO_HERE

# MISC
DATE_FORMAT = "%Y-%m-%d"
def main(args):
    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)
    if isfile(checkpoint_path + "frontier_map.pt"):
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
    else:
        raise Exception("checkpoint not found")
    count = 0
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))
        url = res['docno']
        inlinks = list(frontier_map[url].inlinks)
        count += 1
        write_to_graph(url, inlinks, count)
    fo.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Arguments')
    parser.add_argument("--dir", type=str, default="./output/", help="")
    parser.add_argument("--ckp_no", type=int, default=40000, help="")
    args = parser.parse_args()
    # additional parse options
    args.cdp = fjoin(args.dir, "crawled")  # cdp = crawled data path
    args.ckp = fjoin(args.dir, "checkpoint")  # ckp = checkpoint
    main(args)
    label = 0
    return sent, annotation.polarity


def generate_train_dataset(annotation_dir, sentence_dir):
    train_pd = pd.DataFrame()
    for parents, dirnames, filenames in os.walk(sentence_dir):
        for filename in filenames:
            annotation = fjoin(annotation_dir,
                               filename[:-len('.cmp.txt')] + '.best.xml')
            print annotation
            annotation_pd = pd.read_csv(annotation)
            sentences_pd = pd.read_csv(fjoin(parents, filename))
            for index, row in annotation_pd.iterrows():
                sent, label = find_sentence(row, sentences_pd)
                train_pd = train_pd.append({'sent': sent, 'label': label},
                                           ignore_index=True)
    logger.info(train_pd.columns)
    return train_pd


if __name__ == '__main__':
    root = "/home/apple/best/data"
    fdir = fjoin(root, "source")
    sentence_dir = fjoin(root, "source_sentence")
    if not os.path.isdir(sentence_dir):
        os.makedirs(sentence_dir)
    #extract_sentence(fdir, sentence_dir)
    #annotation_dir = fjoin(root, 'parse_annotation')
    #train_pd = generate_train_dataset(annotation_dir, sentence_dir)
    #train_pd.to_csv(fjoin(root, 'train_all'))
    train_pd = pd.read_csv(fjoin(root, 'train_all'))
    logger.info(train_pd.shape)
    logger.info(train_pd.head())
import matplotlib.pyplot as plt

#==============================================================================
# Constants and folders
#==============================================================================

# screenshot folder
screenshot_folder = "./screenshots/"

# folder paths
cropped_folder = "./cropped/"
references_folder = "./references/"
num_ref_folder = "./num_ref/"

# file that stores the reference image filenames to item names
ref_namemap_file = fjoin(references_folder, "ref_namemap.txt")

# item frame coordinates in pixels
upper_left_corner = (944, 540)
upper_right_corner = (1016, 540)
lower_left_corner = (943, 607)
lower_right_corner = (1016, 612)

# hotbar coordinates in pixels
upper_left_corner_hotbar = (944, 772)

# item frame measures
tile_width = upper_right_corner[0] - upper_left_corner[0]
tile_height = lower_right_corner[1] - upper_right_corner[1]

# inventory size
def make_filename(movie_path, movie_name, extn):
    return fjoin(movie_path, "%s.%s" % (movie_name, extn))
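# Usage sketch (the arguments below are hypothetical examples), assuming fjoin
# is os.path.join:
#   make_filename('/videos/Movies', 'Big Buck Bunny', 'mp4')
#   # -> '/videos/Movies/Big Buck Bunny.mp4'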
def ROC_curve(lr, y_test):
    logger.info("Plot roc curve")
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt
    pred_probas = lr.predict_proba(test_vecs)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area=%.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.show()


if __name__ == '__main__':
    size, epoch_num = 400, 10
    model_dir = '/home/apple/best/model'
    x_train, x_test, y_train, y_test, pos, neg = get_dataset()
    external_x_train, external_x_test, external_unsup_reviews, \
        external_y_train, external_y_test = get_external_dataset()
    model_dm, model_dbow = train(pos, neg, x_train, x_test, external_x_train,
                                 external_x_test, external_unsup_reviews,
                                 size, epoch_num)
    model_dm = gensim.models.Doc2Vec.load(fjoin(model_dir, 'doc2vec_dm'))
    model_dbow = gensim.models.Doc2Vec.load(fjoin(model_dir, 'doc2vec_dbow'))
    train_vecs, test_vecs = get_vectors(model_dm, model_dbow)
    lr = Classifier(train_vecs, y_train, test_vecs, y_test)
    ROC_curve(lr, y_test)
def get_images_filename_in_folder(folder):
    files = [
        fjoin(folder, f) for f in listdir(folder)
        if isfile(fjoin(folder, f)) and splitext(f)[1] == ".png"
    ]
    return files
def main(args):
    #es2 = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    es2 = Elasticsearch(
        "https://96aa4157ead74b5ca4926523b1d1994e.us-east-1.aws.found.io:9243",
        http_auth=('elastic', 'MrkfJ5hxIcCOzTMfOa1Nftzy'))
    #elasticsearch.helpers.reindex(es1, "church_data", args.out_index, query=None, target_client=None,
    #                              chunk_size=500, scroll='5m', scan_kwargs={}, bulk_kwargs={})
    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)
    if isfile(checkpoint_path + "frontier_map.pt"):
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
    else:
        raise Exception("checkpoint not found")
    filesadded = 0
    filesupdated = 0
    # load all the pickles of the crawled data
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))
        url = res['docno']
        inlinkData = list(frontier_map[url].inlinks)
        j_inlinks = json.dumps(inlinkData)
        logging.info("Checking for url {}".format(url))
        # finding if the url is in the merged index
        result = es2.get(index=args.out_index, id=url, ignore=404)
        if result['found'] is True:
            logging.info("inlinks from local {}".format(
                len(set(frontier_map[url].inlinks))))
            logging.info("inlinks retrieved {}".format(
                len(set(result['_source']['inlinks']))))
            existing_inlinks = json.loads(j_inlinks)
            retrieved_inlinks = json.loads(result['_source']['inlinks'])
            # merging the inlinks from both local and merged set and updating the inlinks
            final_inlinkset = merge_inlinks(
                [retrieved_inlinks, existing_inlinks])
            logging.info("length of final list {}".format(
                len(final_inlinkset)))
            es2.update(index=args.out_index,
                       id=url,
                       doc_type=args.doc_type,
                       body={"doc": {
                           "inlinks": json.dumps(final_inlinkset)
                       }})
            filesupdated += 1
            logging.info("doc updated for url {}".format(url))
        else:
            # indexing the data for the url which doesn't match any url in merged data index
            logging.info("value of res in else {}: ".format(len(result)))
            title = res['head']
            content = res['text']
            inlinks = j_inlinks
            outlinkData = list(frontier_map[url].outlinks)
            outlinks = json.dumps(outlinkData)
            doc = {
                'head': title,
                'text': content,
                'inlinks': inlinks,
                'outlinks': outlinks
            }
            es2.index(index=args.out_index,
                      id=url,
                      body=doc,
                      doc_type=args.doc_type)
            filesadded += 1
            logging.info("doc added for url {}: ".format(url))
    logging.info("doc added {} and updated {}: ".format(
        filesadded, filesupdated))
def open_db(appdirs):
    db_driver = 'sqlite'
    db_fn = fjoin(appdirs.user_data_dir, appdirs.appname + '.sqlite')
    connection_string = "%s://%s" % (db_driver, db_fn)
    connection = connectionForURI(connection_string)
    sqlhub.processConnection = connection
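# Usage sketch, assuming the appdirs argument is an appdirs.AppDirs instance
# (the application name below is a hypothetical example):
#   from appdirs import AppDirs
#   open_db(AppDirs('beerlog'))
#   # subsequent SQLObject queries then run through sqlhub.processConnection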
def get_files():
    source_path = get_setting('source_path')
    file_list = []
    for ext in VIDEO_EXTENSIONS:
        file_list += glob.glob(fjoin(source_path, '*.%s' % ext))
    return file_list