def parse_blog(path):
    # Skip blogs that are already parsed or that have no crawled URL list yet.
    output_path = path + '/linked_papers.json'
    if is_file_exist(output_path):
        return
    if not is_file_exist(path + '/urls.json'):
        return

    with open(path + '/urls.json') as f:
        urls = json.load(f)

    output = []
    for url in urls:
        output.append(crawl_paper_links(url))
        # Rewrite the result file after every URL so partial progress survives a crash.
        write_to_json_file(output_path, output)
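parse_blog relies on small I/O helpers defined elsewhere in the project. A minimal sketch of what they might look like, assuming is_file_exist is a thin wrapper around os.path and write_to_json_file simply overwrites the target file:

import json
import os


def is_file_exist(path):
    # Assumed helper: True if path points at an existing regular file.
    return os.path.isfile(path)


def write_to_json_file(path, data):
    # Assumed helper: overwrite the file with the current list of results.
    with open(path, 'w') as f:
        json.dump(data, f)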
Example #2
def count_urls(file):
    if is_file_exist(file):
        with open(file) as f:
            urls = json.load(f)
            global TOTAL_COUNT, UNIQUE_COUNT, MAP
            TOTAL_COUNT += len(urls)
            for url in urls:
                if url not in MAP:
                    MAP[url] = 1  # MAP is used as a set; the value is a placeholder
                    UNIQUE_COUNT += 1  # only count a URL the first time it is seen
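count_urls accumulates into module-level globals that must exist before the first call. A minimal setup, assuming the counters start at zero, MAP is a plain dict used as a set, and blog_dirs is a hypothetical list of crawled blog directories:

# Module-level accumulators assumed by count_urls and the other count_* helpers.
TOTAL_COUNT = 0
UNIQUE_COUNT = 0
MAP = {}

blog_dirs = ['data/blog-a', 'data/blog-b']  # hypothetical paths, for illustration only
for blog_dir in blog_dirs:
    count_urls(blog_dir + '/urls.json')
print(TOTAL_COUNT, UNIQUE_COUNT)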
Example #3
def main(argv):
    # command line arguments
    try:
        opts, args = getopt.getopt(argv, "hi:s:", ["ifile=", "start="])
    except getopt.GetoptError:
        print('insertion.py -i <input_file> -s <start_idx>')
        sys.exit(2)
    # input file pointer
    inputfile = None
    # default value for start idx
    start = 0
    for opt, arg in opts:
        if opt == '-h':
            print('insertion.py -i <input_file> -s <start_idx>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-s", "--start"):
            try:
                start = int(arg)
            except ValueError:
                print("Parameter start should be int!")
                sys.exit()

    if (inputfile is not None and utils.is_file_exist(inputfile)):
        # init the worker class
        w = thread.init_worker(concurrent=config.concurrent,
                               timeout=config.timeout)

        with open(inputfile, 'r', buffering=config.read_buffer) as f:
            data = []
            idx, c = 0, 0
            for line in f:
                if (idx >= start):
                    try:
                        dpoint = json.loads(line)
                    except json.JSONDecodeError:
                        # still count the malformed line, then skip it
                        idx += 1
                        continue

                    c += 1

                    data.append(dpoint)
                    if (c == config.partition):
                        print(f"insert_{idx - config.partition}-{idx}")
                        thread.run_worker(
                            w, f"insert_{idx - config.partition}-{idx}", data,
                            parse)
                        # clear
                        c = 0
                        data = []
                idx += 1
            if (len(data)):
                print(f"insert_{idx-len(data)}-{idx}")
                thread.run_worker(w, f"insert_{idx-len(data)}-{idx}", data,
                                  parse)
    else:
        print("[WARNING] File not exists or not specified!")
Example #4
def __init__(self,
             data_dir,
             dataset,
             train=True,
             cv_pretrained=True,
             transform=None,
             size=(224, 224),
             top_k=0,
             multi_label=False,
             tokenizer=None,
             text_max=14):
    self.dataset = dataset
    self.mode = 'train' if train else 'val'
    self.cv_pretrained = cv_pretrained
    self.transform = transform
    self.multi_label = multi_label
    self.text_max = text_max
    self.data_file = os.path.join(data_dir, dataset,
                                  f'data_dict_{top_k}_{multi_label}.pkl')
    self.question_file = os.path.join(
        data_dir, dataset,
        f'questions_{self.mode}_{top_k}_{multi_label}.h5')
    if self.cv_pretrained:
        # Pre-extracted image features live in a single h5 file per split.
        self.image_dir = os.path.join(
            data_dir, dataset, f'images_{self.mode}_{str(size[0])}.h5')
        self.idx_dict_file = os.path.join(data_dir, dataset,
                                          'idx_dict.pkl')
    else:
        # Raw images follow each dataset's own directory layout.
        if dataset == 'clevr' or dataset == 'sample':
            self.image_dir = os.path.join(data_dir, dataset, 'images',
                                          f'{self.mode}')
        elif dataset == 'vqa2':
            self.image_dir = os.path.join(data_dir, dataset,
                                          f'{self.mode}2014')
    # Build the preprocessed question (and, if needed, image) files on first use.
    if not is_file_exist(self.question_file):
        make_questions(data_dir, dataset, top_k, multi_label, tokenizer)
    if cv_pretrained:
        if not is_file_exist(self.image_dir):
            make_images(data_dir, dataset, size)
    self.load_data()
Example #5
def count_linked_papers(file):
    if is_file_exist(file):
        with open(file) as f:
            try:
                data = json.load(f)
                global TOTAL_COUNT, UNIQUE_COUNT, MAP
                for each in data:
                    TOTAL_COUNT += len(each['papers'])
                    if each['url'] not in MAP:
                        MAP[each['url']] = len(each['papers'])
                        UNIQUE_COUNT += len(each['papers'])

            except (ValueError, KeyError) as e:
                print(f'failed to parse {file}: {e}')
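For reference, count_linked_papers (and count_linked_arxiv_papers below) read a list of records with a 'url' and a 'papers' list, which is the shape parse_blog writes. A hypothetical record for illustration only:

# Hypothetical record inside linked_papers.json; the URLs are made up.
example_record = {
    'url': 'https://example.com/some-blog-post',      # blog post that was crawled
    'papers': ['https://arxiv.org/abs/1234.56789'],   # paper links found in that post
}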
Example #6
def __init__(self, data_dir, dataset, train=True, transform=None,
             size=(224, 224), object_size=14, cv_pretrained=True, top_k=0,
             multi_label=False, q_tokenizer='none', a_tokenizer='none',
             question_inverse=False, text_max=14, te_bert=False):
    self.dataset = dataset
    self.mode = 'train' if train else 'val'
    self.transform = transform
    self.cv_pretrained = cv_pretrained
    self.top_k = top_k
    self.multi_label = multi_label
    self.label = 'multi-label' if multi_label else 'uni-label'
    self.q_tokenizer = q_tokenizer
    self.a_tokenizer = a_tokenizer
    self.question_inverse = question_inverse
    self.text_max = text_max
    # Question/answer sets: plain tokenized h5 or BERT-encoded h5.
    if not te_bert:
        self.qa_file = os.path.join(data_dir, dataset, f'qa_sets_{dataset}_{self.mode}.h5')
        if not is_file_exist(self.qa_file):
            make_questions(data_dir, dataset)
    else:
        self.qa_file = os.path.join(data_dir, dataset, f'qa_sets_{dataset}_{self.mode}_bert.h5')
        if not is_file_exist(self.qa_file):
            make_bert(data_dir, dataset)
    if cv_pretrained:
        # Pre-extracted image features live in a single h5 file per split.
        self.image_dir = os.path.join(data_dir, dataset, f'images_{self.mode}_{str(size[0])}_{object_size}.h5')
        if not is_file_exist(self.image_dir):
            make_images(data_dir, dataset, size)
        # self.idx_dict_file = os.path.join(data_dir, dataset, 'idx_dict.pkl')
    else:
        # Raw images, laid out differently per dataset.
        if dataset == 'vqa2':
            self.image_dir = os.path.join(data_dir, dataset, f'{self.mode}2014')
        elif dataset == 'clevr' or dataset == 'clevr-humans':
            self.image_dir = os.path.join(data_dir, 'clevr', f'images_{self.mode}_{str(size[0])}_raw.h5')
            if not is_file_exist(self.image_dir):
                make_images(data_dir, dataset, size)
    self.load_data()
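The enclosing class is not shown in this snippet, so the name below is hypothetical; this is only a sketch of how such a dataset __init__ would typically be driven from a training script:

# VQADataset is a hypothetical class name; only the keyword arguments are taken from the code above.
train_set = VQADataset('data', 'clevr', train=True, cv_pretrained=True,
                       size=(224, 224), text_max=14)
val_set = VQADataset('data', 'clevr', train=False, cv_pretrained=True,
                     size=(224, 224), text_max=14)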
Example #7
def count_linked_arxiv_papers(file):
    if is_file_exist(file):
        with open(file) as f:
            try:
                data = json.load(f)
                global TOTAL_COUNT, UNIQUE_COUNT, MAP
                for each in data:
                    arxiv_count = 0
                    for paper in each['papers']:
                        if 'arxiv' in paper:
                            arxiv_count += 1
                    TOTAL_COUNT += arxiv_count
                    if each['url'] not in MAP:
                        MAP[each['url']] = arxiv_count
                        UNIQUE_COUNT += arxiv_count
            except (ValueError, KeyError) as e:
                print(f'failed to parse {file}: {e}')
Example #8
def __init__(self, writer, args, batch_record_idx=0):
    self.writer = writer
    self.args = args
    self.timestamp = args.timestamp
    self.idx_to_question_type = args.idx_to_question_type
    self.idx_to_word = args.idx_to_word
    self.answer_idx_to_word = args.answer_idx_to_word
    self.qt_size = args.qt_size
    self.multi_label = args.multi_label
    self.batch_record_idx = batch_record_idx
    self.csv_file = os.path.join(args.log_directory, args.project,
                                 f"{args.project}_log.csv")
    # Keep a rolling window of recent values for smoothed logging.
    self.rolling_average = 5
    self.logs = defaultdict(lambda: deque(maxlen=self.rolling_average))
    # Per-epoch and per-batch bookkeeping.
    self.epoch_idx = None
    self.mode = None
    self.batch_num = 0
    self.dataset_size = 0
    self.epoch_loss = 0
    self.epoch_correct = 0
    self.per_question = None
    self.per_question_type = None
    self.epoch_start_time = 0
    self.epoch_end_time = 0
    self.epoch_time = 0
    self.batch_loss = 0
    self.batch_correct = 0
    self.batch_start_time = 0
    self.batch_end_time = 0
    self.batch_time = 0
    self.per_question_log = dict()
    # Arguments excluded from the CSV header.
    self.exclude = [
        'data_directory', 'log_directory', 'data_config', 'config', 'log',
        'word_to_idx', 'idx_to_word', 'answer_word_to_idx',
        'answer_idx_to_word', 'question_type_to_idx',
        'idx_to_question_type', 'q_size', 'a_size', 'qt_size'
    ]
    self.header = self.make_header()
    # Create the CSV file and this run's record on first use.
    if not is_file_exist(self.csv_file):
        self.make_csv()
    if not self.is_record_exist():
        self.make_record()