Пример #1
0
def create_test_source_info(filenames: List[str]) -> cr.SourceInfo:
    """Build a `cr.SourceInfo` describing the given test filenames.

    Names are the path-stripped filenames. Each metadata record stores the
    stripped name under 'filename' and, under 'year', the integer parsed from
    characters 2..5 of that stripped name.
    """
    basenames = [strip_paths(path) for path in filenames]

    # Stripped name -> filename exactly as passed in.
    name_to_filename = {}
    for path in filenames:
        name_to_filename[strip_paths(path)] = path

    metadata = []
    for basename in basenames:
        metadata.append({'filename': basename, 'year': int(basename[2:6])})

    return cr.SourceInfo(
        names=basenames,
        name_to_filename=name_to_filename,
        metadata=metadata,
    )
Пример #2
0
 def get_info(self, opts: TextReaderOpts) -> SourceInfo:
     """Return a `SourceInfo` for the names matching `opts.filename_pattern`.

     Metadata is extracted from the path-stripped names using
     `opts.filename_fields`.
     """
     filenames = self.namelist(pattern=opts.filename_pattern)
     basenames = strip_paths(filenames)

     # Stripped name -> filename as returned by `namelist`.
     name_to_filename = {}
     for name, filename in zip(basenames, filenames):
         name_to_filename[strip_paths(name)] = filename

     metadata = extract_filenames_metadata(
         filenames=basenames,
         filename_fields=opts.filename_fields,
     )
     return SourceInfo(
         name_to_filename=name_to_filename,
         names=basenames,
         metadata=metadata,
     )
Пример #3
0
def read_payload(filename: str) -> DocumentPayload:
    """Read a tagged frame from the feather file that corresponds to `filename`.

    The extension of `filename` is replaced by ".feather" before reading. The
    returned payload's filename is the path-stripped feather name with its
    extension replaced by ".csv".
    """
    feather_filename = replace_extension(filename, ".feather")
    tagged_frame = pd.read_feather(feather_filename)
    payload_filename = replace_extension(strip_paths(feather_filename), ".csv")
    return DocumentPayload(
        content_type=ContentType.TAGGED_FRAME,
        content=tagged_frame,
        filename=payload_filename,
    )
Пример #4
0
    def create_instream(self) -> Iterable[DocumentPayload]:
        """Yield one `DocumentPayload` per document found in `self.corpus_filenames`.

        Each file is loaded with `self.load_tagged_frame`. When `self.id_to_token`
        is truthy, id columns are translated to text/lemma/pos columns and then
        dropped. A frame without a 'document_id' column is yielded as a single
        payload named after the (path-stripped) file; otherwise the frame is
        split by 'document_id' and one payload is yielded per group.
        `self.register_pos_counts` is called on every payload before it is yielded.
        """

        # Bound `get` methods used as lookup functions below:
        #   fg: id -> token (applied to both token_id and lemma_id)
        #   dg: document_id -> value used as payload filename
        #   pg: pos_id -> pos
        fg = self.token2id.id2token.get
        dg = self.docid2name.get
        pg = self.pipeline.payload.pos_schema.id_to_pos.get

        text_column, pos_column, lemma_column = self.pipeline.payload.tagged_columns_names2

        # Column set of the first loaded frame; filled lazily inside the loop.
        loaded_frame_columns: set = None

        for filename in tqdm(self.corpus_filenames,
                             total=len(self.corpus_filenames)):

            loaded_frame: pd.DataFrame = self.load_tagged_frame(filename)

            if self.id_to_token:

                # token_id and lemma_id are optional; translate only those present.
                if 'token_id' in loaded_frame.columns:
                    loaded_frame[text_column] = loaded_frame.token_id.apply(fg)

                if 'lemma_id' in loaded_frame.columns:
                    loaded_frame[lemma_column] = loaded_frame.lemma_id.apply(
                        fg)

                # NOTE(review): unlike token_id/lemma_id, pos_id is accessed without
                # a presence check - presumably it is always present; confirm.
                loaded_frame[pos_column] = loaded_frame.pos_id.apply(pg)

                # Remove the id columns; errors='ignore' tolerates missing ones.
                loaded_frame.drop(columns=['token_id', 'pos_id', 'lemma_id'],
                                  inplace=True,
                                  errors='ignore')

            # The column set is computed once (from the first frame, after the
            # drop above) and reused for every later file, so the per-file vs.
            # per-document decision follows the first frame's columns.
            if 'document_id' not in (loaded_frame_columns
                                     or (loaded_frame_columns := set(
                                         loaded_frame.columns))):

                # No document_id column: the whole file is one document.
                payload: DocumentPayload = DocumentPayload(
                    content_type=self.out_content_type,
                    content=loaded_frame,
                    filename=strip_paths(filename))
                self.register_pos_counts(payload)

                yield payload

            else:

                # One payload per document_id group within the file.
                for document_id, tagged_frame in loaded_frame.groupby(
                        'document_id'):

                    # Discard the original row index of the group.
                    tagged_frame.reset_index(drop=True, inplace=True)
                    payload: DocumentPayload = DocumentPayload(
                        content_type=self.out_content_type,
                        content=tagged_frame,
                        filename=dg(document_id))
                    self.register_pos_counts(payload)

                    yield payload
0
    def store(self, filename: str) -> "Token2Id":
        """Write the dictionary to a ZIP archive at `filename` and return self.

        The archive is deflate-compressed and holds one tab-separated CSV (with
        header) produced from `to_dataframe()`. The member is named after
        `filename` with its path stripped and its extension replaced by ".csv".
        `store_tf` is called with the same filename afterwards.
        """
        archive = zipfile.ZipFile(filename, mode='w', compression=zipfile.ZIP_DEFLATED)
        with archive:
            csv_text = self.to_dataframe().to_csv(sep='\t', header=True)
            member_name = replace_extension(strip_paths(filename), ".csv")
            archive.writestr(member_name, data=csv_text)

        self.store_tf(filename)

        return self
Пример #6
0
def apply_filename_fields(document_index: DocumentIndex,
                          filename_fields: FilenameFieldSpecs):
    """Add columns extracted from filenames to `document_index` and return it.

    Values are extracted from the path-stripped entries of the 'filename'
    column according to `filename_fields`. Columns that already exist in the
    index are left untouched.

    Raises:
        DocumentIndexError: if the index has no 'filename' column.
    """
    if 'filename' not in document_index.columns:
        raise DocumentIndexError("filename not in document index")

    basenames = []
    for filename in document_index.filename.tolist():
        basenames.append(strip_paths(filename))

    metadata = extract_filenames_metadata(filenames=basenames,
                                          filename_fields=filename_fields)
    columns = list_of_dicts_to_dict_of_lists(metadata)

    for field, values in columns.items():
        if field in document_index.columns:
            # Never overwrite an existing column.
            continue
        document_index[field] = values

    return document_index
Пример #7
0
 def find_tags(folder: str) -> List[str]:
     """Return the distinct dump tags found in `folder`.

     A tag is the path-stripped name of a file matching `*<suffix>` for one of
     the known dump suffixes, with that suffix cut off the end.
     """
     known_suffixes = [
         '_vector_data.npz',
         '_vector_data.npy',
         '_vectorizer_data.pickle',
         '_document_index.csv.gz',
     ]
     unique_tags = set()
     for suffix in known_suffixes:
         matches = strip_paths(glob.glob(jj(folder, f'*{suffix}')))
         for basename in matches:
             tag_length = len(basename) - len(suffix)
             unique_tags.add(basename[0:tag_length])
     tags: List[str] = list(unique_tags)
     return tags
Пример #8
0
    def get_info(self, opts: TextReaderOpts) -> SourceInfo:
        """Return a `SourceInfo` combining filename metadata with data frame columns.

        For each name matching `opts.filename_pattern`, the metadata extracted
        from the path-stripped name is merged with the corresponding record of
        `self.filtered_data` (all columns except `self.text_column`). On key
        clashes the data frame value wins.
        """
        filenames = self.namelist(pattern=opts.filename_pattern)
        basenames = strip_paths(filenames)
        filename_metadata = extract_filenames_metadata(
            filenames=basenames, filename_fields=opts.filename_fields)

        non_text_columns = []
        for column in self.filtered_data.columns.tolist():
            if column != self.text_column:
                non_text_columns.append(column)
        dataframe_metadata = self.filtered_data[non_text_columns].to_dict('records')

        # Records are paired positionally; zip stops at the shorter sequence.
        metadata = []
        for from_filename, from_frame in zip(filename_metadata, dataframe_metadata):
            metadata.append({**from_filename, **from_frame})

        name_to_filename = {}
        for name, filename in zip(basenames, filenames):
            name_to_filename[strip_paths(name)] = filename

        return SourceInfo(
            name_to_filename=name_to_filename,
            names=basenames,
            metadata=metadata,
        )
Пример #9
0
 def metadata(self) -> Sequence[Dict[str, Any]]:
     """Return the result of `_get_metadata` for the path-stripped filenames."""
     basenames = strip_paths(self._get_filenames())
     return self._get_metadata(basenames)
Пример #10
0
def test_strip_path():
    """Check `strip_paths` on a path, a list of paths, and names without a directory."""
    cases = [
        ('/tmp/hej.txt', 'hej.txt'),
        (['/tmp/hej.txt'], ['hej.txt']),
        ('/tmp/hej', 'hej'),
        ('hej.x', 'hej.x'),
    ]
    for path, expected in cases:
        assert strip_paths(path) == expected
Пример #11
0
 def exists(self, filename: str) -> bool:
     """Return True if `filename` is among the path-stripped `self.filenames`."""
     known_names = strip_paths(self.filenames)
     return filename in known_names
Пример #12
0
 def namelist(self, *, pattern: str = '*.*') -> List[str]:  # pylint: disable=unused-argument
     """Return the path-stripped names of the items whose name matches `pattern`."""
     matching_names = []
     for name, _ in self.items:
         if fnmatch(name, pattern):
             matching_names.append(name)
     return strip_paths(matching_names)
Пример #13
0
 def __init__(self, items: List[StoreItemPair]):
     """Keep `items` and index them by the path-stripped first element of each item."""
     self.items: List[StoreItemPair] = items

     # Stripped name -> full item; a later item with the same name replaces an earlier one.
     lookup = {}
     for item in self.items:
         lookup[strip_paths(item[0])] = item
     self.map = lookup

     item_names = [name for name, _ in self.items]
     self.filenames = strip_paths(item_names)
Пример #14
0
 def namelist(self, *, pattern: str = '*.*') -> List[str]:
     """Return the path-stripped names of files matching `pattern`.

     A falsy `pattern` falls back to '*.*'. The pattern is passed through
     `self.to_path` before globbing.
     """
     search_path = self.to_path(pattern or '*.*')
     found = glob(search_path)
     return strip_paths(found)