Example #1
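(A note that applies to every example on this page.) None of these snippets shows where fjoin comes from; it is presumably just an alias for os.path.join, along the lines of:

# assumed alias used throughout the examples below
from os.path import join as fjoin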
def get_external_dataset():
    logger.info("Get_external_dataset")
    with open(fjoin(external_data, 'pos.txt'), 'r') as infile:
        pos_reviews = infile.readlines()

    with open(fjoin(external_data, 'neg.txt'), 'r') as infile:
        neg_reviews = infile.readlines()

    with open(fjoin(external_data, 'unsup.txt'), 'r') as infile:
        unsup_reviews = infile.readlines()

    logger.info("pos:{} neg:{} unsup:{}".format(len(pos_reviews),
                                                len(neg_reviews),
                                                len(unsup_reviews)))
    y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))

    x_train, x_test, y_train, y_test = train_test_split(np.concatenate(
        (pos_reviews, neg_reviews)),
                                                        y,
                                                        test_size=0.2)

    x_train = cleanText(x_train)
    x_test = cleanText(x_test)
    unsup_reviews = cleanText(unsup_reviews)

    x_train = labelizeReviews(x_train, 'EXTERNAL_TRAIN')
    x_test = labelizeReviews(x_test, 'EXTERNAL_TEST')
    unsup_reviews = labelizeReviews(unsup_reviews, 'EXTERNAL_UNSUP')
    return x_train, x_test, unsup_reviews, y_train, y_test
Example #2
def extract_sentence(fdir, sdir):
	for parents, dirnames, filenames in os.walk(fdir):
		for filename in filenames:
			#if not filename == '1d16a571f14fb1032bc19e9314a46deb.cmp.txt':
			#	continue
			logger.info(filename)
			save_file = fjoin(sdir, filename)
			with open(fjoin(parents, filename)) as infile:
				file = [f.decode("utf-8") for f in infile.readlines()]
			file = decompose(''.join(file))
			file.to_csv(save_file)
Example #3
def vendor(filename):
    if app.debug:
        if "vendor" in request.path:
            filename = fjoin('vendor', secure_filename(filename))
        else:
            filename = secure_filename(filename)

        with open(fjoin('client', 'src', 'js', filename)) as f:
            script = f.read()
        response = make_response(script)
        response.headers['Content-Type'] = 'application/javascript'
        return response
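A hedged note on context: the view above relies on a Flask app and werkzeug's secure_filename, neither of which is shown. The assumed surrounding setup would look roughly like the following (only the imports are standard library/Flask facts; the route path is a guess):

# assumed surrounding setup; the route decorator is hypothetical
from os.path import join as fjoin
from flask import Flask, request, make_response
from werkzeug.utils import secure_filename

app = Flask(__name__)
# the vendor() view above is presumably registered via something like:
# @app.route('/js/<path:filename>')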
Example #4
def generate_train_dataset(annotation_dir, sentence_dir):
	train_pd = pd.DataFrame()
	for parents, dirnames, filenames in os.walk(sentence_dir):
		for filename in filenames:
			annotation = fjoin(annotation_dir, filename[:-len('.cmp.txt')]+'.best.xml')
			print(annotation)
			annotation_pd = pd.read_csv(annotation)
			sentences_pd = pd.read_csv(fjoin(parents, filename))
			for index, row in annotation_pd.iterrows():
				sent, label = find_sentence(row, sentences_pd)
				train_pd = train_pd.append({'sent':sent, 'label':label}, ignore_index=True)
	logger.info(train_pd.columns)
	return train_pd
Example #5
def train(pos,
          neg,
          x_train,
          x_test,
          external_x_train,
          external_x_test,
          external_unsup_reviews,
          size=400,
          epoch_num=10):
    logger.info("Train sentence model(dm, dbow)")
    model_dm = gensim.models.Doc2Vec(min_count=1,
                                     window=10,
                                     size=size,
                                     sample=1e-3,
                                     negative=5,
                                     workers=6)
    model_dbow = gensim.models.Doc2Vec(min_count=1,
                                       window=10,
                                       size=size,
                                       sample=1e-3,
                                       negative=5,
                                       dm=0,
                                       workers=6)
    # build the vocabulary from all available documents
    vocab_document = x_train + x_test + external_x_train + external_x_test + external_unsup_reviews
    model_dm.build_vocab(vocab_document)
    model_dbow.build_vocab(vocab_document)

    # train repeatedly, shuffling the sequence each epoch to improve accuracy
    tmp_x_train = x_train + x_test + external_x_train + external_x_test
    print(tmp_x_train[1:2])
    for epoch in range(epoch_num):
        logger.info("train epoch {}".format(epoch))
        random.shuffle(tmp_x_train)
        model_dm.train(tmp_x_train)
        model_dbow.train(tmp_x_train)

    # train test dataset
    '''
	tmp_x_test = x_test
	for epoch in range(epoch_num):
		logger.info("test epoch {}".format(epoch))
		random.shuffle(tmp_x_test)
		model_dm.train(tmp_x_test)
		model_dbow.train(tmp_x_test)
	'''
    model_dm.save(fjoin(model_dir, 'doc2vec_dm'))
    model_dbow.save(fjoin(model_dir, 'doc2vec_dbow'))
    return model_dm, model_dbow
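Note on the gensim API used above: it targets an older gensim (pre-4.0), where Doc2Vec took size= and train() could be called with just the corpus. As a hedged sketch only, the equivalent loop under gensim 4.x (assuming tagged_docs is a list of gensim TaggedDocument objects) would look roughly like:

# hedged sketch for gensim 4.x; 'tagged_docs' is an assumed list of TaggedDocument
import random
import gensim

def train_dbow_v4(tagged_docs, size=400, epoch_num=10):
    model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, vector_size=size,
                                       sample=1e-3, negative=5, dm=0, workers=6)
    model_dbow.build_vocab(tagged_docs)
    for epoch in range(epoch_num):
        random.shuffle(tagged_docs)  # break the sequence each epoch, as above
        model_dbow.train(tagged_docs,
                         total_examples=model_dbow.corpus_count,
                         epochs=1)
    return model_dbow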
Example #6
    def pps_calculator(self):
        pps_file_path = self.get_pps_file_path()
        with open(fjoin(pps_file_path, 'rx_packets')) as f:
            rx_origin = int(f.read())
        with open(fjoin(pps_file_path, 'tx_packets')) as f:
            tx_origin = int(f.read())
        sleep(5)
        with open(fjoin(pps_file_path, 'rx_packets')) as f:
            rx_now = int(f.read())
        with open(fjoin(pps_file_path, 'tx_packets')) as f:
            tx_now = int(f.read())
        rx_pps = (rx_now - rx_origin) / 5
        tx_pps = (tx_now - tx_origin) / 5
        self.pps = {'rx_pps': rx_pps, 'tx_pps': tx_pps}
        self.pps_timer_starter()
Example #7
def main(args):
    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)

    if isfile(checkpoint_path + "frontier_map.pt"):
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
    else:
        raise Exception("checkpoint not found")
    count = 0
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))
        url = res['docno']
        inlinks = list(frontier_map[url].inlinks)
        count += 1
        write_to_graph(url, inlinks, count)
    fo.close()  # 'fo' is a module-level file handle not shown in this snippet
Example #8
def get_video_filelist(basepath):
    videos = []
    for root, dirs, files in os.walk(basepath):
        for name in files:
            if name.split('.')[-1] in allowed_extensions:
                videos.append(fjoin(root,name))
    
    return videos
Example #9
def make_list(basepath):
    allowed_extensions = ['m4v', 'mp4', 'mov', 'wmv']
    videos = []
    for root, dirs, files in os.walk(basepath):
        for name in files:
            if name.split('.')[-1] in allowed_extensions:
                videos.append(fjoin(root,name))
                
    return videos
Example #10
def handle_imdb_dataset(sentiment):
    print("in")
    data_type = ['train', 'test']
    save_file = fjoin("/home/apple/best/external_data/aclImdb",
                      sentiment + ".txt")
    save_file_handler = open(save_file, 'w+')
    reviews = []
    for dt in data_type:
        imdb_dir = fjoin("/home/apple/best/external_data/aclImdb",
                         dt + "/" + sentiment)
        print(imdb_dir)
        for parent, dirnames, filenames in os.walk(imdb_dir):
            print(len(filenames))
            for filename in filenames:
                print(filename)
                with open(fjoin(parent, filename), 'r') as infile:
                    reviews.append("\n".join(infile.readlines()) + "\n")
    save_file_handler.writelines(reviews)
    save_file_handler.close()
Example #11
    def add_reference(self, extracted_num):
        plt.imshow(extracted_num)
        plt.show()

        isnum = input("is number: ")

        if isnum == "yes" or isnum == "y":
            n = input("which number: ")

            filename = "reference_num_" + str(n) + ".png"

            extracted_num.save(fjoin(num_ref_folder, filename))
Example #12
def remove_office_duplicates(file_path):

    current_folder = getcwd()

    if not ("/" in file_path or "\\" in file_path):
        file_path = fjoin(current_folder, file_path)

    if not fexists(file_path) or not isfile(file_path):
        print("File does not exist: %s" % file_path)
        return False

    print("\nFixing: %s" % file_path)


    with open(file_path, 'r+') as the_f:
        data = the_f.read()
        soup = BeautifulSoup(data, "lxml")

        divs_list = soup.find_all('div')

        # import ipdb
        # ipdb.set_trace()

        duplicate_total_num = 0

        prev_div = None

        for div in divs_list:
            # print len(div.contents)
            try:
                if prev_div['style'] == div['style']:
                    prev_div.replace_with(div)
                    duplicate_total_num += 1
                    
                # print div['style']
            except (KeyError, TypeError):
                pass

            prev_div = div

        
        if duplicate_total_num:
            print("%s duplicate borders were removed" % duplicate_total_num)

            the_f.seek(0)
            the_f.write(str(soup))
            the_f.truncate()

        else:
            print("No duplicate borders were found.")
Example #13
def main(args):
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    #create the settings and mapping of the index
    create_index(args.index, es)

    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)

    if isfile(checkpoint_path + "frontier.pt"):

        frontier = pickle.load(open(checkpoint_path + "frontier.pt", "rb"))
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
        id_to_url = pickle.load(open(checkpoint_path + "id_to_url.pt", "rb"))
        links_crawled = pickle.load(
            open(checkpoint_path + "links_crawled.pt", "rb"))
        current_wave = pickle.load(
            open(checkpoint_path + "current_wave.pt", "rb"))
    else:
        raise Exception("checkpoint not found")

    #Load all the pickles of the crawled data
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))

        url = res['docno']
        title = res['head']
        content = res['text']
        inlinkData = list(frontier_map[url].inlinks)
        outlinkData = list(frontier_map[url].outlinks)
        print("inlink data : ", inlinkData)
        inlinks = json.dumps(inlinkData)
        outlinks = json.dumps(outlinkData)
        print("inlinks after json dumping : ", inlinks)
        store_in_ES(args.index, url, title, content, inlinks, outlinks, es)
Example #14
def config(file):
    root = os.path.splitext(os.path.basename(file))[0]
    # config logger, create 2 handler(file, console)
    logger = logging.getLogger('BEST.{}'.format(root))
    logger_fh = logging.FileHandler(fjoin('logs', 'BEST-{}.log'.format(root)))
    logger_ch = logging.StreamHandler()
    logger_formatter = logging.Formatter(
        '[%(levelname)s] %(asctime)s %(filename)s [line:%(lineno)d]: %(message)s'
    )
    logger_fh.setFormatter(logger_formatter)
    logger_ch.setFormatter(logger_formatter)
    logger.addHandler(logger_fh)
    logger.addHandler(logger_ch)
    logger.setLevel(logging.DEBUG)

    return logger
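A hedged usage sketch: config() appears meant to be called once per module with __file__, yielding a logger named BEST.<module> that writes under logs/ (a directory assumed to already exist):

# hypothetical usage; assumes a logs/ directory next to the module
logger = config(__file__)
logger.info("logger configured")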
Example #15
    def generate_image(self, local_path, local_title, remote_url):
        local_file = fjoin(local_path, local_title)
        try:
            os.stat(local_file)
        except OSError:
            try:
                log.debug("Source: %s" % remote_url)
                log.debug("Dest: %s" % local_file)
                fs.download_file(remote_url, local_file)
            except OSError:
                log.critical("Can't open %s for writing." % local_file)
                print("Can't open %s for writing." % local_file)
                sys.exit(1)

            return local_file
        else:
            return local_file
Example #16
def init_log(level, appdirs):
    levels = {'debug': logging.DEBUG, 'info': logging.INFO}
    log_filename = fjoin(appdirs.user_log_dir, "%s.log" % appdirs.appname)               
    msg_fmt = '[%(asctime)s] %(name)-12s %(levelname)-8s %(message)s'
    date_fmt = '%m/%d %H:%M'
    level = levels.get(level, logging.NOTSET)
    
    try:
        logging.basicConfig(level=level,
                            format=msg_fmt,
                            datefmt=date_fmt,
                            filename=log_filename,
                            filemode='w')
    except IOError:
        logging.basicConfig(level=level,
                            format=msg_fmt,
                            datefmt=date_fmt,
                            stream=sys.stderr)
Example #17
def read_data(dir, savepath=''):
    data = {}

    if os.path.isfile(savepath):
        with open(savepath, "rb") as handle:
            data = pickle.load(handle)
        return data

    files = os.listdir(dir)
    for file in tqdm.tqdm(files):
        path = fjoin(dir, file)
        with open(path, 'r', encoding='ISO-8859-1') as f:
            text = f.read()
        subject, body_text = get_text_from_email(text)
        data[file] = [subject, body_text]

    with open(savepath, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return data
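A hedged usage sketch (the paths are made up): the first call parses every file in the directory and writes the pickle, while a second call with the same savepath short-circuits and loads the cached pickle instead.

# hypothetical paths, purely illustrative
emails = read_data('maildir/allen-p/inbox', savepath='emails.pkl')  # parses files, writes pickle
emails = read_data('maildir/allen-p/inbox', savepath='emails.pkl')  # loads the existing pickle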
Example #18
    def add_new_reference(self, item):
        plt.imshow(item)
        plt.show()

        self.incremental_ref_num += 1

        filename = fjoin(references_folder,
                         f"reference_{self.incremental_ref_num}.png")

        name = input("Give item name:\n> ")

        newir = ItemReference(filename, name, item)

        newir.image.save(newir.filename)

        self.references.append(newir)

        self.save_reference_file()

        return newir
Example #19
 def rewind(self):
     try:
         media = list(Media.select())
     except SQLObjectNotFound:
         msg = 'No media found with which to rewind'
         log.info(msg)
         sys.exit(msg)
     else:                
         for medium in media:
             if medium.file_URI:                    
                 if medium.original_file_URI:
                     log.debug('Moving: %s to %s' % (medium.file_URI, 
                                                 medium.original_file_URI))
                     shutil.move(medium.file_URI, medium.original_file_URI)
                 else:
                     log.debug('Original file location does not exist')
                     source_path = data.get_setting('source_path')
                     media_directory = medium.media_type
                     try:
                         log.debug("Franchise: %s" % medium.franchise.name)
                         new_title = medium.franchise.name
                     except SQLObjectNotFound:
                         log.debug('No franchise: %s' % medium.title)                            
                         new_title = medium.title
                     if medium.media_type == data.media_types[data.TV]:
                         filename = '%s S%sE%s.%s' % (new_title, 
                                                     medium.season_number,
                                                     medium.episode_number,
                                                     medium.codec)
                     else:
                         filename = '%s.%s' % (new_title, medium.codec)
                     dest = fjoin(source_path, media_directory, filename)
                     log.debug('Moving: %s to %s' % (medium.file_URI, dest))
                     shutil.move(medium.file_URI, dest)
                     medium.file_URI = dest
             else:
                 msg = "%s can't be rewound." % medium.title
                 log.error(msg)
                 pub.sendMessage('STD_OUT', msg=msg)
Example #20
def index():
    if app.debug:
        with open(fjoin('client', 'src', 'index.html')) as f:
            index = f.read()
        return index
Example #21
    def get_pps_file_path(self):
        basepath = '/sys/class/net'
        for flist in listdir(basepath):
            if flist[0] == 'e':
                return fjoin(basepath, flist, 'statistics')
Example #22
    def process_files(self):
        filelist = fs.make_list(fs.get_basepath(data.get_setting('source_path')))
        self.org_type = data.get_setting('organization_method')

        for videofile in filelist:
            original_file_location = videofile
            if not self.exists_in_db(videofile):                
                (path, video_filename, self.video_ext) = fs.fn_to_parts(videofile)

                # what are we looking up? tv? movie?
                if data.Media.media_types[data.Media.MOVIES].lower() in path.lower():
                    self.lookup_movie(video_filename)
                elif data.Media.media_types[data.Media.TV].lower() in path.lower():
                    self.lookup_tv(video_filename)
                else:
                    log.critical("Sorry, I can't figure out how your video files are organized")
                    print("Sorry, I can't figure out how your video files are organized")
                    sys.exit(1)

                # were there multiple results for this?    
                if len(self.results) > 1 and not self.options.first:
                    selected = self.resolve_multiple_results(video_filename, self.results)
                    result = self.results[selected]
                    process_vid = True
                elif len(self.results) == 1 or self.options.first:
                    result = self.results[0]
                    process_vid = True
                else:
                    log.debug("No matches, skipping file")
                    process_vid = False

                if process_vid:
                    log.debug("Result: %s" % result.title)

                    self.video = data.Media()
                    self.video.fromAPIMedia(result)
            else:
                process_vid = True

            if process_vid:
                # should we organize?
                if data.get_setting('master_org'):
                    self.video.file_URI = self.organize_file(videofile)
                    self.video.original_file_URI = original_file_location
                else:
                    self.video.file_URI = videofile
                    self.video.original_file_URI = videofile

                # process the image for the video
                poster_filename = "%s.jpg" % self.get_filename_base(self.video.file_URI)
                if self.video.poster_remote_URI:
                    self.generate_image(self.path, poster_filename, self.video.poster_remote_URI)
                elif self.video.media_type == data.media_types[data.TV] and self.folder_poster:
                    shutil.copy2(self.video.franchise.poster_local_URI, fjoin(self.path, poster_filename))

                # process the xml for the video if we're making individual
                # videofiles.  if not, we'll process it all at the end
                if self.org_type == 'xml':
                    self.generate_videoxml(self.path, self.video)

                try:
                    del self.results
                    del result
                    del self.video
                except AttributeError:
                    pass

        # we are going to generate a master video xml file containing all
        # entries
        if self.org_type == 'dir':
            self.generate_video_directory()
Example #23
    def organize_file(self, videofile):
            self.path = data.get_setting('dest_path')
            movies_by_genre = data.get_setting('movies_by_genre')
            tv_by_genre = data.get_setting('tv_series_by_genre')
            tv_by_series = data.get_setting('tv_series_by_series')
            log.debug("Path: %s" % self.path)
            mt = self.video.media_type
            tv = data.media_types[data.TV]
            movies = data.media_types[data.MOVIES]

            if self.video.media_type not in self.path:
                self.path = fjoin(self.path, self.video.media_type)
                log.debug("Missing media type in path. New path: %s" % self.path)

            if mt == movies:
                log.debug("MOVIES")
                if movies_by_genre:
                    self.path = fjoin(self.path, self.clean_name_for_fs(self.video.genres[0].name))
                    log.debug("Organizing movies by genre. New path: %s" % self.path)

            elif mt == tv:
                log.debug("TV SHOWS")
                if tv_by_genre:
                    self.path = fjoin(self.path, self.clean_name_for_fs(self.video.genres[0].name))
                    log.debug('Organizing TV by genre. New path: %s' % self.path)

                if tv_by_series:
                    # series level directory
                    self.path = fjoin(self.path, self.clean_name_for_fs(self.video.franchise.name))
                    self._make_path(self.path)
                    if self.org_type == 'xml':
                        # for videoxml, the images need to be same name as the
                        # objects they represent
                        (image_path, image_filename) = self.path.rsplit('/',1)
                        image_filename += '.jpg'
                        self.folder_poster = self.generate_image(image_path, image_filename, self.video.franchise.poster_remote_URI)
                        log.debug("Local poster URI: %s" % self.folder_poster)
                        self.video.franchise.poster_local_URI = self.folder_poster
                    else:
                        self.folder_poster = self.generate_image(self.path, 'poster.jpg', self.video.franchise.poster_remote_URI)
                        log.debug("Local poster URI: %s" % self.folder_poster)                        
                        self.video.franchise.poster_local_URI = self.folder_poster

                    log.debug("Adding franchise. New path: %s" % self.path)
                    log.debug("Adding poster image %s" % self.folder_poster)

                    # season level directory
                    season = "Season %s" % self.video.season_number
                    self.path = fjoin(self.path, season)
                    self._make_path(self.path)
                    if self.org_type == 'xml':
                        image_dest = self.path+".jpg"
                        log.debug("Franchise: %s" % self.video.franchise)
                        shutil.copy2(self.video.franchise.poster_local_URI, image_dest)
                    else:
                        shutil.copy2(self.video.franchise.poster_local_URI, self.path)

                    log.debug('Organizing TV by series. New path: %s' % self.path)

            # path determination done, lets make sure it exists
            self._make_path(self.path)
            log.debug("Filename: %s" % self.video.title)
            if self.video.media_type == data.media_types[data.TV]:
                title_filename = "Episode %s: %s" % (self.video.episode_number, self.video.title)
                log.debug('Adding episode number to title: %s' % title_filename)
            else:
                title_filename = self.video.title
            video_destination = fs.generate_filename(self.path, title_filename, self.video_ext)
            log.debug("Destination: %s" % video_destination)
            shutil.move(videofile, video_destination)
            return video_destination
Example #24
SECRET_KEY = YOUR_INFO_HERE
PASSWORD_SALT = YOUR_INFO_HERE

# ADMIN
ADMIN_USERNAME = YOUR_INFO_HERE
ADMIN_PASSWORD = YOUR_INFO_HERE

# RECAPCHA FOR COMMENTS
RECAPTCHA_USE_SSL = True
RECAPTCHA_PUBLIC_KEY = YOUR_INFO_HERE
RECAPTCHA_PRIVATE_KEY = YOUR_INFO_HERE
RECAPTCHA_OPTIONS = YOUR_INFO_HERE  # if needed

# DB
DB_DRIVER = 'sqlite'
DB_NAME = fjoin(data_dir, 'beerlog.db')
DB_PROTOCOL = protocol

# AWS
AWS_ACCESS_KEY = YOUR_INFO_HERE
AWS_SECRET_KEY = YOUR_INFO_HERE
AWS_BUCKET_NAME = YOUR_INFO_HERE

# IMAGES
IMAGE_FULL_SIZE = 800.0
TEMP_UPLOAD_FOLDER = '/tmp/beerlog/'
ALLOWED_EXTENSIONS = set(['jpg', 'jpeg', 'png', 'gif'])
IMAGE_BASEPATH = YOUR_INFO_HERE

# MISC
DATE_FORMAT = "%Y-%m-%d"
Example #25
def main(args):
    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)

    if isfile(checkpoint_path + "frontier_map.pt"):
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
    else:
        raise Exception("checkpoint not found")
    count = 0
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))
        url = res['docno']
        inlinks = list(frontier_map[url].inlinks)
        count += 1
        write_to_graph(url, inlinks, count)
    fo.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Arguments')
    parser.add_argument("--dir", type=str, default="./output/", help="")
    parser.add_argument("--ckp_no", type=int, default=40000, help="")

    args = parser.parse_args()

    # additional parse option
    args.cdp = fjoin(args.dir, "crawled")  #cdp = crawled data path
    args.ckp = fjoin(args.dir, "checkpoint")  # ckp = checkpoint
    main(args)
Example #26
		label = 0
	return sent, annotation.polarity

def generate_train_dataset(annotation_dir, sentence_dir):
	train_pd = pd.DataFrame()
	for parents, dirnames, filenames in os.walk(sentence_dir):
		for filename in filenames:
			annotation = fjoin(annotation_dir, filename[:-len('.cmp.txt')]+'.best.xml')
			print(annotation)
			annotation_pd = pd.read_csv(annotation)
			sentences_pd = pd.read_csv(fjoin(parents, filename))
			for index, row in annotation_pd.iterrows():
				sent, label = find_sentence(row, sentences_pd)
				train_pd = train_pd.append({'sent':sent, 'label':label}, ignore_index=True)
	logger.info(train_pd.columns)
	return train_pd

if __name__ == '__main__':
	root = "/home/apple/best/data"
	fdir = fjoin(root, "source")
	sentence_dir = fjoin(root, "source_sentence")
	if not os.path.isdir(sentence_dir):
		os.makedirs(sentence_dir)
	#extract_sentence(fdir, sentence_dir)

	#annotation_dir = fjoin(root, 'parse_annotation')
	#train_pd = generate_train_dataset(annotation_dir, sentence_dir)
	#train_pd.to_csv(fjoin(root, 'train_all'))
	train_pd = pd.read_csv(fjoin(root, 'train_all'))
	logger.info(train_pd.shape)
	logger.info(train_pd.head())
Example #27
import matplotlib.pyplot as plt

#==============================================================================
# Constants and folders
#==============================================================================

# screenshot folder
screenshot_folder = "./screenshots/"

# folder paths
cropped_folder = "./cropped/"
references_folder = "./references/"
num_ref_folder = "./num_ref/"

# file that stores the reference image filenames to item names
ref_namemap_file = fjoin(references_folder, "ref_namemap.txt")

# item frame coordinates in pixels
upper_left_corner = (944, 540)
upper_right_corner = (1016, 540)
lower_left_corner = (943, 607)
lower_right_corner = (1016, 612)

# hotbar coordinates in pixels
upper_left_corner_hotbar = (944, 772)

# item frame measures
tile_width = upper_right_corner[0] - upper_left_corner[0]
tile_height = lower_right_corner[1] - upper_right_corner[1]

# inventory size
Example #28
def make_filename(movie_path, movie_name, extn):
    return fjoin(movie_path, "%s.%s" % (movie_name, extn))
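A hedged usage note (the arguments are made-up examples):

# purely illustrative call
make_filename('/media/movies', 'Holiday Footage', 'mp4')  # -> '/media/movies/Holiday Footage.mp4'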
Example #29
def ROC_curve(lr, y_test):
    logger.info("Plot roc curve")
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt
    pred_probas = lr.predict_proba(test_vecs)[:, 1]  # test_vecs is a module-level variable built in __main__ below
    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area=%.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.show()


if __name__ == '__main__':
    size, epoch_num = 400, 10
    model_dir = '/home/apple/best/model'

    x_train, x_test, y_train, y_test, pos, neg = get_dataset()
    external_x_train, external_x_test, external_unsup_reviews, external_y_train, external_y_test = get_external_dataset(
    )
    model_dm, model_dbow = train(pos, neg, x_train, x_test, external_x_train,
                                 external_x_test, external_unsup_reviews, size,
                                 epoch_num)
    model_dm = gensim.models.Doc2Vec.load(fjoin(model_dir, 'doc2vec_dm'))
    model_dbow = gensim.models.Doc2Vec.load(fjoin(model_dir, 'doc2vec_dbow'))
    train_vecs, test_vecs = get_vectors(model_dm, model_dbow)
    lr = Classifier(train_vecs, y_train, test_vecs, y_test)
    ROC_curve(lr, y_test)
Example #30
def get_images_filename_in_folder(folder):
    files = [
        fjoin(folder, f) for f in listdir(folder)
        if isfile(fjoin(folder, f)) and splitext(f)[1] == ".png"
    ]
    return files
Example #31
def main(args):
    #es2 = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    es2 = Elasticsearch(
        "https://96aa4157ead74b5ca4926523b1d1994e.us-east-1.aws.found.io:9243",
        http_auth=('elastic', 'MrkfJ5hxIcCOzTMfOa1Nftzy'))

    #elasticsearch.helpers.reindex(es1, "church_data", args.out_index, query=None, target_client=None,
    #           chunk_size=500, scroll='5m', scan_kwargs={}, bulk_kwargs={})

    checkpoint_path = fjoin(args.ckp, "checkpoint.%d." % args.ckp_no)
    if isfile(checkpoint_path + "frontier_map.pt"):
        frontier_map = pickle.load(
            open(checkpoint_path + "frontier_map.pt", "rb"))
    else:
        raise Exception("checkpoint not found")

    filesadded = 0
    filesupdated = 0
    # Load all the pickles of the crawled data
    for file in os.listdir(args.cdp):
        path = fjoin(args.cdp, file)
        res = pickle.load(open(path, "rb"))
        url = res['docno']

        inlinkData = list(frontier_map[url].inlinks)
        j_inlinks = json.dumps(inlinkData)
        logging.info("Checking for url {}".format(url))

        #Finding if the url is in the merged index
        result = es2.get(index=args.out_index, id=url, ignore=404)

        if result['found'] is True:
            logging.info("inlinks from local  {}".format(
                len(set((frontier_map[url].inlinks)))))
            logging.info("inlinks retrieved {}".format(
                len(set(result['_source']['inlinks']))))
            existing_inlinks = json.loads(j_inlinks)
            retrieved_inlinks = json.loads(result['_source']['inlinks'])

            #merging the inlinks from both local and merged set and updating the inlinks
            final_inlinkset = merge_inlinks(
                [retrieved_inlinks, existing_inlinks])
            logging.info("length of final list {}".format(
                len(final_inlinkset)))
            es2.update(index=args.out_index,
                       id=url,
                       doc_type=args.doc_type,
                       body={"doc": {
                           "inlinks": json.dumps(final_inlinkset)
                       }})
            filesupdated += 1
            logging.info("doc updated for url {}".format(url))

        else:
            # indexing the data for the url which doesn't match any url in merged data index
            logging.info("value of res in else {}: ".format(len(result)))
            title = res['head']
            content = res['text']
            inlinks = j_inlinks
            outlinkData = list(frontier_map[url].outlinks)
            outlinks = json.dumps(outlinkData)
            doc = {
                'head': title,
                'text': content,
                'inlinks': inlinks,
                'outlinks': outlinks
            }
            es2.index(index=args.out_index,
                      id=url,
                      body=doc,
                      doc_type=args.doc_type)
            filesadded += 1
            logging.info("doc added for url {}: ".format(url))

    logging.info("doc added {} and updated {}: ".format(
        filesadded, filesupdated))
Example #32
def open_db(appdirs):
    db_driver = 'sqlite'
    db_fn = fjoin(appdirs.user_data_dir, appdirs.appname+'.sqlite')
    connection_string = "%s://%s" % (db_driver, db_fn)
    connection = connectionForURI(connection_string)
    sqlhub.processConnection = connection                            
Example #33
def get_files():
    source_path = get_setting('source_path')
    file_list = []
    for ext in VIDEO_EXTENSIONS:
        file_list += glob.glob(fjoin(source_path, '*.%s' % ext))
    return file_list
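A hedged note: VIDEO_EXTENSIONS and get_setting are module-level definitions not shown here; judging by the sibling snippet in Example #9, the extension list presumably resembles:

# assumption, mirroring the extension list in Example #9
VIDEO_EXTENSIONS = ['m4v', 'mp4', 'mov', 'wmv']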