Example #1
    def load_data_to(self, ctxs: Dict[object, BiEncoderPassage]):

        tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

        if self.n_random_papers:
            print("Random newspaper subset...")
            scan_names = []
            for file_path in tqdm(self.file_paths):
                with open(file_path, 'rb') as f:
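                    # stream top-level (key, value) pairs without loading the whole JSON file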
                    items = ijson.kvitems(f, '')
                    for k, v in items:
                        scan_names.append(k)
            papers = sorted({self.get_paper_name(scan) for scan in scan_names})
            print(f"{len(papers)} total papers...")

            random.seed(789)
            random_papers = random.sample(papers, self.n_random_papers)
            print(f"Selected random papers: {random_papers}")

        print("Creating bi-encoder dict...")
        for file_path in tqdm(self.file_paths):

            with open(file_path, 'rb') as f:
                items = ijson.kvitems(f, '')
                ocr_text_generators = []
                for k, v in items:
                    # skip scans outside the requested month or outside the random paper subset
                    if self.month_str and self.month_str not in k:
                        continue
                    if self.n_random_papers and self.get_paper_name(k) not in random_papers:
                        continue
                    ocr_text_generators.append(self.ocr_text_iter(v))

            if len(ocr_text_generators) == 0:
                continue

            for gen in ocr_text_generators:
                for layobj in gen:
                    title, passage, object_id = layobj
                    uid = object_id
                    if self.normalize:
                        title = normalize_passage(title)
                        title = title.lower()
                        passage = take_max_model_paragraphs(passage, tokenizer)
                        passage = normalize_passage(passage)
                    ctxs[uid] = BiEncoderPassage(passage, title)
Example #2
    def create_database(self) -> DB:
        self._pre_run()

        with open(
            self._all_cards_path, 'r', encoding='UTF-8'
        ) as all_cards_file, open(
            self._all_sets_path, 'r', encoding='UTF-8'
        ) as all_sets_file:
            handler = logging.FileHandler(self._logging_path, mode='w')
            parse_logger.addHandler(handler)

            try:
                raw_cards = ijson.kvitems(all_cards_file, 'data')

                cards = self.create_card_table(raw_cards)

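                # kvitems is a one-shot stream, so rewind the file and
                # re-parse before making a second pass over the card data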
                all_cards_file.seek(0)
                raw_cards = ijson.kvitems(all_cards_file, 'data')

                cardboards = self.create_cardboard_table(raw_cards, cards)

                artists = self.create_table_for_model(self._model_parser_map[i.Artist])
                blocks = self.create_table_for_model(self._model_parser_map[i.Block])
                printings = self.create_table_for_model(self._model_parser_map[i.Printing])

                raw_expansions = ijson.kvitems(all_sets_file, 'data')

                expansions = self.create_expansion_table(
                    raw_expansions = raw_expansions,
                    cardboards = cardboards,
                    printings = printings,
                    artists = artists,
                    blocks = blocks,
                )

                return self._create_database_from_tables(
                    {
                        'cards': cards,
                        'cardboards': cardboards,
                        'printings': printings,
                        'artists': artists,
                        'blocks': blocks,
                        'expansions': expansions,
                    }
                )

            finally:
                parse_logger.removeHandler(handler)
Example #3
    def load_data_to(self, ctxs: Dict[object, BiEncoderPassage], date):

        year = "_" + str(datetime.strptime(date, "%b-%d-%Y").year) + "_"

        tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

        print(f"Creating bi-encoder dict for {date}...")
        for file_path in tqdm(self.file_paths):

            if year in file_path:
                with open(file_path, 'rb') as f:
                    items = ijson.kvitems(f, '')
                    ocr_text_generators = []
                    for k, v in items:
                        if date in k:
                            ocr_text_generators.append(self.ocr_text_iter(v))

                if len(ocr_text_generators) == 0:
                    continue

                for gen in ocr_text_generators:
                    for layobj in gen:
                        title, passage, object_id = layobj
                        uid = object_id
                        title = normalize_passage(title)
                        title = title.lower()
                        passage = take_max_model_paragraphs(passage, tokenizer)
                        passage = normalize_passage(passage)
                        ctxs[uid] = BiEncoderPassage(passage, title)
Example #4
def main(filename):
    with open(filename, 'r') as file:
        objects = ijson.kvitems(file, 'wellFormedAnswers')
        valid_old_key_to_new_key = {}
        new_key = 0
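        # keep only entries whose wellFormedAnswers value is a non-empty list,
        # assigning them dense new keys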
        for key, well_formed_answer in objects:
            value = well_formed_answer if isinstance(well_formed_answer, list) else literal_eval(well_formed_answer)
            if len(value) > 0:
                valid_old_key_to_new_key[key] = str(new_key)
                new_key += 1
        filtered_data = {}
        fieldnames = ['query', 'query_type', 'answers', 'wellFormedAnswers', 'passages']
        for fieldname in fieldnames:
            add_data(filename, filtered_data, fieldname, valid_old_key_to_new_key)

    with open(filename, 'w') as fw:
        json.dump(filtered_data, fw)
Example #5
    def load_data_to(self, ctxs: Dict[object, BiEncoderPassage]):
        for file_path in self.file_paths:
            with open(file_path, 'rb') as f:
                items = ijson.kvitems(f, '')
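                # materialize the list of generators while the file is still open;
                # each v is already fully parsed, so the generators stay valid after close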
                ocr_text_generators = [
                    ((ik['image_file_name'], ik['ocr_text'], ik['object_id']) 
                        for ik in v if ik['label']=='article')
                    for k, v in items
                ]

            for gen in ocr_text_generators:
                for layobj in gen:
                    title, passage, object_id = layobj
                    uid = str(object_id) + '_' + title 
                    if self.normalize:
                        passage = normalize_passage(passage)
                    ctxs[uid] = BiEncoderPassage(passage[:self.passage_char_max], title)
Example #6
    def update_prices(self, start_of_week: datetime.date):

        logger.info("Querying DB for most recent prices")
        with connection.cursor() as cursor:
            cursor.execute("""
SELECT
card_printing.id,
face_printing.uuid,
latest_price.date
FROM cards_cardprinting card_printing
JOIN cards_cardfaceprinting face_printing
ON face_printing.card_printing_id = card_printing.id
LEFT JOIN cards_cardprice latest_price
ON latest_price.id = card_printing.latest_price_id
""")
            recent_price_map = {
                uuid: (printing_id, most_recent_date)
                for printing_id, uuid, most_recent_date in cursor.fetchall()
            }

        logger.info("Updating prices")
        # Track which printings have already been updated: a printing with two faces
        # has two price rows, and we don't want to apply its prices twice.
        updated_printings = set()
        with open(_paths.PRICES_JSON_PATH, "r",
                  encoding="utf8") as prices_file:
            cards = ijson.kvitems(prices_file, "data")
            for uuid, price_data in cards:
                if uuid not in recent_price_map:
                    logger.warning("No printing found for %s", uuid)
                    continue

                printing_id, latest_price = recent_price_map[uuid]

                if printing_id in updated_printings:
                    logger.info("Already updated %s. Skipping...", uuid)
                    continue

                logger.info("Updating prices for %s", uuid)
                apply_printing_prices(start_of_week, price_data, printing_id,
                                      latest_price)
                updated_printings.add(printing_id)
Example #7
    def _fetch_history(self, send_data_conn, request_queue, history_file_path):
        """prepare 1 batch ahead, when received request, immediately return the previously
        prepared batch and prepares the next batch.
        """
        return_batch = {}
        while True:
            historyRange = request_queue.get()
            if isinstance(historyRange, Traffic_history_service.QueueDone):
                break

            assert isinstance(historyRange, RequestHistoryRange)
            send_data_conn.send(return_batch)
            return_batch = {}
            with open(history_file_path, "rb") as f:
                for index, (t, vehicles_state) in enumerate(
                        ijson.kvitems(f, "", use_float=True)):
                    if (historyRange.start_index <= index
                            < historyRange.start_index + historyRange.batch_count):
                        return_batch[t] = vehicles_state
        send_data_conn.close()
Example #8
    def fetch_agent_missions(history_file_path: str, scenario_root_path: str,
                             mapLocationOffset):
        assert os.path.isdir(scenario_root_path)
        history_mission_filepath = os.path.join(scenario_root_path,
                                                "history_mission.pkl")

        if not os.path.exists(history_mission_filepath):
            history_mission = {}
        else:
            with open(history_mission_filepath, "rb") as f:
                history_mission = pickle.load(f)

        if history_file_path in history_mission:
            return history_mission[history_file_path]

        vehicle_missions = {}
        with open(history_file_path, "rb") as f:
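            # use_float=True parses non-integer JSON numbers as float instead of Decimal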
            for t, vehicles_state in ijson.kvitems(f, "", use_float=True):
                for vehicle_id in vehicles_state:
                    if vehicle_id in vehicle_missions:
                        continue
                    vehicle_missions[vehicle_id] = scenario.Mission(
                        start=scenario.Start(
                            Traffic_history_service.apply_map_location_offset(
                                vehicles_state[vehicle_id]["position"],
                                mapLocationOffset,
                            ),
                            scenario.Heading(
                                vehicles_state[vehicle_id]["heading"]),
                        ),
                        goal=scenario.EndlessGoal(),
                        start_time=float(t),
                    )
        history_mission[history_file_path] = vehicle_missions

        # update cached history_mission_file
        with open(history_mission_filepath, "wb") as f:
            pickle.dump(history_mission, f)

        return vehicle_missions
Example #9
    def __init__(self, history_file_path):
        self._history_file_path = history_file_path
        self._all_timesteps = set()
        self._current_traffic_history = {}
        self._prev_batch_history = {}
        # return if traffic history is not used
        if history_file_path is None:
            return

        self._log = logging.getLogger(self.__class__.__name__)
        send_data_conn, receive_data_conn = Pipe()
        self._receive_data_conn = receive_data_conn
        self._request_queue = Queue()
        self._fetch_history_proc = Process(
            target=self._fetch_history,
            args=(
                send_data_conn,
                self._request_queue,
                self._history_file_path,
            ),
        )
        self._fetch_history_proc.daemon = True
        self._fetch_history_proc.start()

        self._range_start = 0
        self._batch_size = 300
        # initialize
        with open(self._history_file_path, "rb") as f:
            for index, (t, vehicles_state) in enumerate(
                    ijson.kvitems(f, "", use_float=True)):
                self._all_timesteps.add(t)
                if (self._range_start <= index
                        < self._range_start + self._batch_size):
                    self._current_traffic_history[t] = vehicles_state
        self._range_start += self._batch_size
        # prepares the next batch
        self._prepare_next_batch()
        self._receive_data_conn.recv()
Example #10
    def rows(self):
        with open(self._dataset_spec["input_path"], "rb") as inf:
            for t, states in ijson.kvitems(inf, "", use_float=True):
                for state in states.values():
                    yield (t, state)
Example #11
def print_tagged_data():

    isentences = iter_sentences()

    tagdata = {}

    count = 0
    with open(args.tags[0], "r", encoding="utf-8") as infile:

        seen = set()

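        # stream (key, value) pairs under the "item" prefix; only "sentences" entries are used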
        items = ijson.kvitems(infile, "item")

        for k, v in items:
            if k != "sentences":
                continue

            sid, english, spanish, credits, english_score, spanish_score = next(isentences)
            count += 1
            if not count % 1000 and _INTERACTIVE:
                print(count, end="\r", file=sys.stderr)

            all_tags = []
            first = True
            for s in v:
                for t in s["tokens"]:
                    if first:
                        offset = int(t["begin"])
                        first = False
                    form = get_original_form(t, spanish, offset)
                    pos_tags = []
                    for word in sorted(set([form, t["form"]])):
                        pos_tags += tag_to_pos(t, word)
                    if not pos_tags:
                        continue
                    pos_tags = sorted(set(pos_tags))
                    all_tags += pos_tags
                    for pos_tag in pos_tags:
                        pword, _, plemma = pos_tag[1].partition("|")
                        if not plemma:
                            plemma = pword
                        if "_" in plemma:
                            for word, lemma in zip(pword.split("_"), plemma.split("_")):
                                if word != lemma:
                                    all_tags.append(["split", f"{word}|{lemma}"])
                                else:
                                    all_tags.append(["split", f"{word}"])

            grouped_tags = group_tags(all_tags)

            # ignore sentences with the same adj/adv/noun/verb lemma combination
            unique_tags = set()
            for pos, tags in grouped_tags.items():
                if pos not in ["adj", "adv", "n", "v", "part-adj", "part-verb"]:
                    continue
                for t in tags:
                    word, _, lemma = t.partition("|")
                    if not lemma:
                        lemma = word
                    unique_tags.add(lemma)

            uniqueid = hash(":".join(sorted(unique_tags)))

            if uniqueid in seen:
                continue
            seen.add(uniqueid)

            interj = get_interjections(spanish)
            if interj:
                grouped_tags["interj"] = list(map(str.lower, interj))

            tag_str = " ".join(
                [f":{tag}," + ",".join(items) for tag, items in grouped_tags.items()]
            )

            print(f"{english}\t{spanish}\t{credits}\t{english_score}\t{spanish_score}\t{tag_str}")
Example #12
def add_data(filename, filtered_data, fieldname, valid_old_key_to_new_key):
    with open(filename, 'r') as f:
        objects = ijson.kvitems(f, fieldname)
        filtered_data[fieldname] = {
            valid_old_key_to_new_key[key]: query for key, query in objects if key in valid_old_key_to_new_key
        }
Example #13
import argparse
import json
import os

import ijson
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('-c',
                    '--captions-path',
                    type=str,
                    required=True,
                    help='path to unfiltered captions')
parser.add_argument('-s',
                    '--save-path',
                    type=str,
                    required=True,
                    help='path to save filtered captions')
args = parser.parse_args()

captions_path = args.captions_path
save_path = args.save_path

ids = os.listdir('saved_features')
filtered_captions = {}

with open(captions_path, 'r') as input_file:
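    # stream (video_id, captions) pairs so the whole captions file never sits in memory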
    captions_json = ijson.kvitems(input_file, '')
    for vid_id, captions in tqdm(captions_json):
        if vid_id in ids:
            filtered_captions[vid_id] = captions

with open(save_path, 'w') as output_file:
    json.dump(filtered_captions, output_file)
Example #14
def read_json_by_item(f: io.StringIO) -> Iterator[Tuple[Any, Any]]:
    yield from ijson.kvitems(f, '')
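
A minimal usage sketch for the helper above (the sample document here is hypothetical): with an empty prefix, kvitems iterates the (key, value) pairs of the top-level JSON object one at a time.

import io

doc = io.StringIO('{"a": 1, "b": [2, 3]}')  # hypothetical in-memory document
for key, value in read_json_by_item(doc):
    print(key, value)  # prints "a 1", then "b [2, 3]"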