示例#1
0
def sample_text_elements(workspace_id: str,
                         dataset_name: str,
                         sample_size: int,
                         filter_func,
                         remove_duplicates=False,
                         random_state=None) -> Tuple[Sequence, int]:
    """
    Filter the in-memory dataset and return (at most) sample_size of the matching TextElements.

    :param workspace_id: if falsy, no labels info is attached to the elements; otherwise the labels
        returned by get_labels() are added as a 'category_to_label' column and the character codes of
        the workspace id are summed into the sampling seed
    :param dataset_name: name of the dataset to load via get_ds_in_memory()
    :param sample_size: if falsy (None or 0), return all matching elements without sampling
    :param filter_func: callable that receives the dataset DataFrame and returns the filtered DataFrame
    :param remove_duplicates: forwarded as-is to get_ds_in_memory()
    :param random_state: base seed for the sampling; a falsy value is treated as 0
    :return: a tuple of (list of TextElement, number of rows that passed filter_func before sampling)
    """
    # The column assignment and filtering below operate on a copy of what get_ds_in_memory() returned.
    dataset_df = get_ds_in_memory(dataset_name, remove_duplicates).copy()

    seed = random_state or 0
    if workspace_id:
        # Offsetting the seed by the workspace id makes the sample differ between workspaces.
        seed = sum(ord(char) for char in workspace_id) + seed
        uri_to_labels = get_labels(workspace_id, dataset_name).copy()
        dataset_df['category_to_label'] = [
            dict(uri_to_labels[uri]) for uri in dataset_df['uri']
        ]

    filtered_df = filter_func(dataset_df)
    hit_count = len(filtered_df)
    if sample_size and hit_count > sample_size:
        filtered_df = filtered_df.sample(n=sample_size, random_state=seed)

    field_names = TextElement.get_field_names()
    rows = filtered_df[field_names].itertuples(index=False, name=None)
    return [TextElement(*row) for row in rows], hit_count
示例#2
0
def get_text_elements(dataset_name: str,
                      uris: Iterable) -> Sequence[TextElement]:
    """
    Return the TextElements of dataset_name whose 'uri' value appears in uris.

    Rows are selected with a boolean mask over the dataset, so the result follows the dataset's row
    order rather than the order of uris.

    :param dataset_name: name of the dataset to load via get_ds_in_memory()
    :param uris: iterable of uri values to look up; it is materialized into a list once
    :return: a list of TextElement, one per matching dataset row
    """
    dataset_df = get_ds_in_memory(dataset_name)
    wanted_uris = list(uris)
    uri_mask = dataset_df['uri'].isin(wanted_uris)
    matching_df = dataset_df.loc[uri_mask]
    field_names = TextElement.get_field_names()
    rows = matching_df[field_names].itertuples(index=False, name=None)
    return [TextElement(*row) for row in rows]
    def get_all_text_elements(self, dataset_name: str) -> List[TextElement]:
        """
        Build a TextElement for every row of the given dataset.

        :param dataset_name: the name of the dataset whose rows should be converted to TextElement objects.
        :return: a list with one TextElement per row of the DataFrame returned by logic.get_ds_in_memory().
        """
        dataset_df = logic.get_ds_in_memory(dataset_name)
        field_names = TextElement.get_field_names()
        # Plain tuples (no index, no namedtuple) so each row can be splatted into the constructor.
        rows = dataset_df[field_names].itertuples(index=False, name=None)
        return [TextElement(*row) for row in rows]
def generate_simple_doc(dataset_name, doc_id=0, add_duplicate=False):
    """
    Build a small fixed Document made of four hard-coded sentences.

    Each sentence becomes one TextElement whose uri is dataset_name, doc_id and the sentence index
    joined by URI_SEP. Spans are consecutive, with a gap of one character between sentences.

    :param dataset_name: used as the prefix of the document uri and of every element uri
    :param doc_id: document identifier, embedded in the uris
    :param add_duplicate: if truthy, the first sentence is appended again as a fifth element
    :return: the generated Document
    """
    sentences = [
        'Document Title is Super Interesting',
        'First sentence is not that attractive.',
        'The second one is a bit better.',
        'Last sentence offers a promising view for the future!',
    ]
    if add_duplicate:
        sentences.append('Document Title is Super Interesting')

    text_elements = []
    offset = 0
    for position, sentence in enumerate(sentences):
        span = (offset, offset + len(sentence))
        element_uri = URI_SEP.join([dataset_name, str(doc_id), str(position)])
        element = TextElement(uri=element_uri,
                              text=sentence,
                              span=[span],
                              metadata={},
                              category_to_label={})
        text_elements.append(element)
        # Leave one character between consecutive sentences.
        offset = span[1] + 1

    return Document(uri=dataset_name + URI_SEP + str(doc_id),
                    text_elements=text_elements,
                    metadata={})
示例#5
0
    def _process(self):
        """
        Load the raw CSV of this dataset part and build its documents and labels.

        Every CSV row becomes one TextElement. Elements are grouped into Document objects by doc uri,
        and for each element a Label is created for every category returned by
        ``self._get_all_categories()``: ``LABEL_POSITIVE`` for the row's own category and
        ``LABEL_NEGATIVE`` for all the others.

        The results are stored on ``self.documents`` and ``self.uri_category_labels``.

        :raises Exception: if the raw data file does not exist.
        """
        if not os.path.isfile(self.get_raw_data_path()):
            raise Exception(
                f'{self.dataset_part.name.lower()} set file for dataset "{self.dataset_name}" not found'
            )
        all_categories = self._get_all_categories()
        raw_df = pd.read_csv(self.get_raw_data_path(), encoding=self.encoding)

        rows = list(zip(raw_df[self.text_col], raw_df[self.label_col]))
        # NOTE(review): presumably extends each row tuple with the context column (default None) and
        # then the doc-id column (default 0); the loop below relies on 4-tuples -- confirm in the helper.
        rows = add_column_or_default_to_zip(rows, raw_df, self.context_col, None)
        rows = add_column_or_default_to_zip(rows, raw_df, self.doc_id_col, 0)

        uri_and_labels = []
        elements_by_doc_uri = defaultdict(list)
        previous_doc_id = None
        next_element_id = 0
        span_start = 0
        for text, category, context, doc_id in rows:
            if previous_doc_id is not None and previous_doc_id != doc_id:
                # A change of doc id restarts the element numbering and the span offsets.
                next_element_id = 0
                span_start = 0

            doc_uri = self.dataset_name + '_' + self.dataset_part.name.lower() + URI_SEP + str(doc_id)
            element_uri = doc_uri + URI_SEP + str(next_element_id)
            next_element_id += 1

            if context:
                metadata = {METADATA_CONTEXT_KEY: context}
            else:
                metadata = {}
            span_end = span_start + len(text)
            element = TextElement(uri=element_uri,
                                  text=text,
                                  span=[(span_start, span_end)],
                                  metadata=metadata,
                                  category_to_label={})
            elements_by_doc_uri[doc_uri].append(element)

            # One label per known category: positive only for the row's own category.
            labels_for_element = {}
            for cat in all_categories:
                if cat == category:
                    labels_for_element[cat] = Label(labels=self.LABEL_POSITIVE, metadata={})
                else:
                    labels_for_element[cat] = Label(labels=self.LABEL_NEGATIVE, metadata={})
            uri_and_labels.append((element_uri, labels_for_element))

            previous_doc_id = doc_id
            # Leave one character between consecutive texts of the same document.
            span_start = span_end + 1

        documents = []
        for doc_uri, elements in elements_by_doc_uri.items():
            documents.append(Document(uri=doc_uri, text_elements=elements, metadata={}))
        self.documents = documents
        self.uri_category_labels = uri_and_labels
示例#6
0
def generate_simple_doc(dataset_name, doc_id=0):
    """
    Build a fixed three-sentence Document.

    The sentences are 'with label true', 'with label false' and 'no label'; all elements are created
    with an empty category_to_label. Element uris are dataset_name, doc_id and the sentence index
    joined by URI_SEP, and spans are consecutive with a one-character gap between sentences.

    :param dataset_name: used as the prefix of the document uri and of every element uri
    :param doc_id: document identifier, embedded in the uris
    :return: the generated Document
    """
    texts = ('with label true', 'with label false', 'no label')

    elements = []
    cursor = 0
    for position, text in enumerate(texts):
        span_end = cursor + len(text)
        uri_parts = [dataset_name, str(doc_id), str(position)]
        elements.append(
            TextElement(uri=URI_SEP.join(uri_parts),
                        text=text,
                        span=[(cursor, span_end)],
                        metadata={},
                        category_to_label={}))
        # Skip one character before the next sentence starts.
        cursor = span_end + 1

    document_uri = dataset_name + URI_SEP + str(doc_id)
    return Document(uri=document_uri, text_elements=elements, metadata={})
示例#7
0
 def _process(self):
     """
     Parse the raw data file into a single Document plus per-element labels.

     Each line is split on its first space into a label token and the remaining text. The part of the
     label token before ':' is the coarse-grained category and the whole token is the fine-grained
     one; ``self.use_fine_grained_labels`` selects which of them is used. The text is cut at the
     first occurrence of ``self.sep_for_idx``.

     For every element a Label is created for each category returned by
     ``self._get_all_categories()``: ``LABEL_POSITIVE`` for the selected category, ``LABEL_NEGATIVE``
     otherwise. Results are stored on ``self.documents`` and ``self.uri_category_labels``.
     """
     data_path = self.get_raw_data_path()
     known_categories = self._get_all_categories()

     with open(data_path, 'r', encoding='latin-1') as raw_file:
         label_and_text_pairs = [
             row.rstrip().split(' ', 1) for row in raw_file
         ]

     # Parse every line up front: (text, coarse-grained category, fine-grained category).
     parsed_rows = [
         (pair[1].split(self.sep_for_idx)[0], pair[0].split(':')[0], pair[0])
         for pair in label_and_text_pairs
     ]

     elements = []
     uri_and_labels = []
     for position, (text, coarse, fine) in enumerate(parsed_rows):
         element_uri = self.doc_uri + URI_SEP + str(position)
         # Every element's span starts at 0 and covers its own text.
         elements.append(
             TextElement(uri=element_uri,
                         text=text,
                         span=[(0, len(text))],
                         metadata={},
                         category_to_label={}))

         if self.use_fine_grained_labels:
             chosen_category = fine
         else:
             chosen_category = coarse

         labels_for_element = {}
         for cat in known_categories:
             if cat == chosen_category:
                 labels_for_element[cat] = Label(labels=self.LABEL_POSITIVE, metadata={})
             else:
                 labels_for_element[cat] = Label(labels=self.LABEL_NEGATIVE, metadata={})
         uri_and_labels.append((element_uri, labels_for_element))

     # All elements of the file belong to one Document.
     single_document = Document(uri=self.doc_uri,
                                text_elements=elements,
                                metadata={})
     self.documents = [single_document]
     self.uri_category_labels = uri_and_labels