Example No. 1
import csv
import os

from trec_car import read_data


def partition_CAR(args):
    placeholder_out = ''  # REPLACE WITH ARGS
    cbor_file = 'paragraphCorpus/dedup.articles-paragraphs.cbor'

    # Fill each output .tsv until it reaches size_threshold bytes,
    # then roll over to a new file.
    i = 0  # Index of the current output file; the first file is _1
    num_bytes = 3e9  # Initialized above the threshold so the first file is created
    size_threshold = 2e9  # Byte count that triggers a new output .tsv file

    out_file = None
    for para in read_data.iter_paragraphs(
            open(args.ctx_files_dir + cbor_file, 'rb')):
        if num_bytes >= size_threshold:
            i += 1  # Starts at 1
            print(i)
            print(num_bytes)
            if out_file is not None:
                out_file.close()  # Close the file that passed size_threshold
            out_file = open(args.ctx_files_dir +
                            'CAR_collection_{}.tsv'.format(i),
                            'wt',
                            encoding='utf-8')  # Explicit encoding is needed
            tsv_writer = csv.writer(out_file, delimiter='\t')

        # One row per paragraph: ["CAR_<para_id>", "<passage text>"]
        tsv_writer.writerow(
            ['CAR' + '_' + para.para_id,
             ' '.join(para.get_text().split())])

        num_bytes = os.path.getsize(args.ctx_files_dir +
                                    'CAR_collection_{}.tsv'.format(i))

    if out_file is not None:
        out_file.close()  # Close the last partition
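A hedged usage sketch: the original presumably builds `args` with argparse. Only the `ctx_files_dir` attribute is taken from the example above; the flag name and default below are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--ctx_files_dir', default='./data/',  # hypothetical default
                    help='Directory containing paragraphCorpus/')
args = parser.parse_args()

partition_CAR(args)  # writes CAR_collection_1.tsv, CAR_collection_2.tsv, ...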
Example No. 2
    def iterate_paragraphs(
            self,
            paragraph_cbor_file,
            func: Callable[[Paragraph, List[Union[ParaLink, ParaText]]], Any],
            max_paras: Optional[int] = None) -> List[Tuple[str, Any]]:
        """
        :param paragraph_cbor_file: Location of the paragraphCorpus.cbor file
        """
        processed_paragraphs = 0
        unique_paragraphs_seen = 0
        total = len(self.paragraphs_to_consider)
        result = []
        with open(paragraph_cbor_file, 'rb') as f:
            for p in iter_paragraphs(f):
                processed_paragraphs += 1
                if processed_paragraphs % 100000 == 0:
                    print("(Searching paragraph cbor): {}".format(
                        processed_paragraphs))

                if max_paras and processed_paragraphs >= max_paras:
                    break

                if p.para_id in self.paragraphs_to_consider:
                    for para in self.paragraphs_to_consider[p.para_id]:
                        result.append((p.para_id, func(para, p.bodies)))

                    unique_paragraphs_seen += 1
                    if unique_paragraphs_seen == total:
                        break

        return result
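A hedged usage sketch: `func` is called with one of the caller's own paragraph records plus the raw trec_car bodies. The `collect_text` callback and the `retriever` instance are hypothetical; only the method signature comes from the example.

def collect_text(own_para, bodies):
    # ParaText bodies contribute .text, ParaLink bodies their .anchor_text.
    return ' '.join(b.text if isinstance(b, ParaText) else b.anchor_text
                    for b in bodies)

results = retriever.iterate_paragraphs('paragraphCorpus.cbor', collect_text,
                                       max_paras=1000000)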
Example No. 3
def get_paragraphs(paragraphs_file):
    with open(paragraphs_file, 'rb') as f:
        for p in iter_paragraphs(f):
            texts = [
                elem.text if isinstance(elem, ParaText) else elem.anchor_text
                for elem in p.bodies
            ]
            yield p.para_id + '|__|' + (' '.join(texts))
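Usage sketch (the path is hypothetical): each yielded string packs the paragraph id and its text around the '|__|' separator, so it splits back apart cleanly.

for line in get_paragraphs('paragraphCorpus.cbor'):
    para_id, text = line.split('|__|', 1)
    print(para_id, text[:80])
    break  # just show the first paragraph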
Example No. 4
def get_mapping(file_path: str, aspect_to_entity_dict: Dict[str, str]):
    with open(file_path, 'rb') as cbor:
        # total=10000 is only a progress-bar estimate for tqdm.
        for para in tqdm.tqdm(read_data.iter_paragraphs(cbor), total=10000):
            for body in para.bodies:
                if isinstance(
                        body,
                        read_data.ParaLink) and body.link_section is not None:
                    aspect_to_entity_dict[body.link_section] = body.pageid
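Usage sketch: the function fills the dictionary passed in, mapping each link's section to the linked page id (the path is hypothetical).

aspect_to_entity = {}  # type: Dict[str, str]
get_mapping('paragraphCorpus.cbor', aspect_to_entity)
print('{} section -> pageid mappings'.format(len(aspect_to_entity)))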
Example No. 5
    def parse(self, f):
        for paragraph in iter_paragraphs(f):
            for body in paragraph.bodies:
                if isinstance(body, ParaLink):
                    # print(body)
                    if body.link_section is not None:
                        print("Link Section: {}".format(body.link_section))
                        print("Link Text: {}".format(body.get_text()))
                        print("Link Anchor: {}".format(body.anchor_text))
                        print("Link Page: {}".format(body.page))
Example No. 6
def create_database(corpus: str, save: str):
    with open(corpus, 'rb') as cbor:
        id_to_name_dict: Dict[str, str] = dict(
            (body.pageid, body.page)
            for para in tqdm.tqdm(read_data.iter_paragraphs(cbor),
                                  total=total)  # `total` is assumed to be defined elsewhere
            for body in para.bodies if isinstance(body, read_data.ParaLink))

    write_to_file(id_to_name_dict, save)

    print('File written to: {}'.format(save))
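`write_to_file` is not shown in this example; a minimal stand-in that pickles the dictionary could look like the sketch below (the name matches the call above, the pickle format is an assumption).

import pickle

def write_to_file(mapping, path):
    # Persist the pageid -> page-name dictionary.
    with open(path, 'wb') as out:
        pickle.dump(mapping, out)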
Example No. 7
    def quick_ids(self):
        counter = 0
        myset = set()
        with open(self.cbor_loc, 'rb') as f:
            for paragraph in iter_paragraphs(f):
                counter += 1
                if counter > 10:
                    break
                myset.add(paragraph.para_id)
        return myset
Example No. 8
    def explore(self):

        progress = 0

        with open(self.cbor_loc, 'rb') as f:
            for paragraph in iter_paragraphs(f):
                wee = self.get_entities(paragraph)
                self.extract_from_text(paragraph, wee)
                progress += 1
                if progress % 10 == 0:  # stop after the first 10 paragraphs
                    break
Example No. 9
def print_paragraphs(path=WikiParagrahs.file_path_list[0], limit=1):

    print('*** reading {} paragraph(s) from file: {} ***'.format(limit, path))
    with open(path, 'rb') as f:

        counter = 1

        for p in iter_paragraphs(f):

            print()
            print('*** PRINTING PARAGRAPH {} ***'.format(counter))

            print(
                '----------------------- PARAGRAPH ID  -----------------------'
            )

            print(p.para_id)

            print(
                '----------------------- RAW PARAGRAPH  -----------------------'
            )
            print(p)

            # Print just the text
            texts = [
                elem.text if isinstance(elem, ParaText) else elem.anchor_text
                for elem in p.bodies
            ]
            print(
                '----------------------- TEXT  -----------------------------')
            print(' '.join(texts))

            print(
                '----------------------- ENTITIES -----------------------------'
            )
            entities = [
                elem.page for elem in p.bodies if isinstance(elem, ParaLink)
            ]
            print(entities)

            print(
                '----------------------- MIXED -----------------------------')
            mixed = [(elem.anchor_text,
                      elem.page) if isinstance(elem, ParaLink) else
                     (elem.text, None) for elem in p.bodies]
            print(mixed)

            if counter >= limit:
                break

            counter += 1
Example No. 10
    def process_paragraphs(self):
        if not self.freq_dict:
            para_dict = {}
            raw_data = {}
            para_text = {}
            with open(self.paragraph_file, 'rb') as f:
                for p in iter_paragraphs(f):
                    # entities = [elem.page
                    #             for elem in p.bodies
                    #             if isinstance(elem, ParaLink)]  # retrieving entities this way pushed P@5 a bit higher
                    # para_dict[p.para_id] = self.preprocess_text(p.get_text(), ret="freq")
                    raw_data[p.para_id] = self.preprocess_text(p.get_text(),
                                                               ret="raw")
                    para_text[p.para_id] = p.get_text()

            # NOTE: para_dict stays empty while the "freq" line above is
            # commented out, so self.freq_dict remains falsy after this runs.
            self.freq_dict = para_dict
            self.raw_data = raw_data
            self.para_text = para_text
Example No. 11
    def retrieve_paragraph_mappings(self, cbor_loc):
        """
        :param cbor_loc: Location of the paragraphCorpus.cbor file
        """
        counter = 0
        seen = 0
        total = len(self.paragraphs_to_retrieve)
        with open(cbor_loc, 'rb') as f:
            for p in iter_paragraphs(f):
                counter += 1
                if counter % 100000 == 0:
                    print("(Searching paragraph cbor): {}".format(counter))

                if p.para_id in self.paragraphs_to_retrieve:
                    for p_to_be_updated in self.paragraphs_to_retrieve[
                            p.para_id]:
                        self.update_paragraph(p_to_be_updated, p.bodies)

                    seen += 1
                    if seen == total:
                        break
Example No. 12
    def retrieve_text_matching_ids(self, ids: Set[str]):
        jsons = OrderedDict()  # type: OrderedDict[str, str]

        with open(self.cbor_loc, 'rb') as f:
            for paragraph in iter_paragraphs(f):
                if paragraph.para_id in ids:
                    jsons[paragraph.para_id] = self.create_json(paragraph)

                    # stop once we've retrieved all of the paragraphs
                    ids.remove(paragraph.para_id)
                    if not ids:
                        break

        # Write records out in encounter order; the rename avoids shadowing
        # the `json` module name.
        with open(self.json_dump_name + ".jsonl", 'w') as out:
            for _, json_line in jsons.items():
                out.write(json_line + "\n")
Example No. 13
def build_d(read_path=WikiParagrahs.file_path_list[0],
            write_path=write_d_path,
            paragraph_limit=1):

    print('*** reading {} paragraph(s) from file: {} ***'.format(
        paragraph_limit, read_path))
    with open(read_path, 'rb') as f_read:
        with open(write_path, 'w') as f_write:

            counter = 1

            for p in iter_paragraphs(f_read):

                if counter % 10000 == 0:
                    print('{} / {} of paragraphs processed'.format(
                        counter, paragraph_limit))

                f_write.write(p.para_id + '\t' + p.get_text() + '\n')

                if counter >= paragraph_limit:
                    break

                counter += 1
Example No. 14
    def dump_cbor(self):
        out = open(self.json_dump_name + ".jsonl", 'w')
        # index_dir = self.json_dump_name + "_index"
        pmap_out = open(self.json_dump_name + "_pmap.txt", 'w')
        # os.mkdir(index_dir)

        progress = 0
        offset = 0

        with open(self.cbor_loc, 'rb') as f:
            for paragraph in iter_paragraphs(f):
                to_json = self.create_json(paragraph) + "\n"
                out.write(to_json)

                # NOTE: offsets are computed with len() over the Python
                # string, so they match byte offsets in the file only for
                # ASCII-only JSON output.
                pmap_out.write("{} {} {}\n".format(paragraph.para_id, offset,
                                                   offset + len(to_json) - 1))

                offset += len(to_json)

                progress += 1
                if progress % 10000 == 0:
                    print(progress)
        out.close()
        pmap_out.close()
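A hedged read-back sketch for the offset map: given a (start, end) pair from the _pmap.txt file, one JSON record can be fetched with a seek instead of a scan. The helper is an assumption, and it is only exact where the recorded offsets are true byte offsets (ASCII-only output; see the note above).

def read_record(jsonl_path, start, end):
    # start/end are the inclusive offsets written to the _pmap.txt file.
    with open(jsonl_path, 'rb') as f:
        f.seek(start)
        return f.read(end - start + 1).decode('utf-8')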
Example No. 15
    def create_paragraph_map(self, paragraph_path):
        # `with` ensures the cbor file handle is closed after the scan.
        with open(paragraph_path, 'rb') as f:
            for page in read_data.iter_paragraphs(f):
                self.paragraph_map[page.para_id] = page.get_text()
Example No. 16
def create_para_id_list(paragraph_cbor_file: str) -> List[str]:
    with open(paragraph_cbor_file, 'rb') as f:
        return [p.para_id for p in iter_paragraphs(f)]
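Usage sketch (path hypothetical); note the list holds every paragraph id in memory, which is sizeable for the full paragraphCorpus.

para_ids = create_para_id_list('paragraphCorpus.cbor')
print(len(para_ids), para_ids[:3])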