Example #1
    def ingest(self,
               dataset_name,
               dataset_source,
               dataset_description,
               dataset_author,
               dataset_notes,
               dataset_creation_time,
               dataset_tags,
               online=True):
        """
        The following will clean, parse, and upload datasets to our database.

        :param dataset_name: Name of the dataset.
        :param dataset_source: Source of the dataset (i.e. filename or URL).
        :param dataset_description: Description of the dataset.
        :param dataset_author: Author of the dataset.
        :param dataset_notes: Any notes on the dataset by us.
        :param dataset_creation_time: Time the dataset was created.
        :param online: boolean of whether the data is a local file (offline) or a URL (online).
        """
        if CSVParser.is_csv(dataset_source):
            if online:
                raw_documents = CSVParser.convert_csv_url_to_json_list(
                    dataset_source)
            else:
                raw_documents = CSVParser.convert_csv_file_to_json_list(
                    dataset_source)
            dataset_attributes = raw_documents[0].keys()
            es_documents = [
                Document(dataset_name, raw_document).get_es_document()
                for raw_document in raw_documents
            ]
            self.es.bulk_upload(es_documents)
        else:
            print("Unsupported file format.")
            return  # nothing was parsed, so skip building the metadocument below

        metadocument = {
            "dataset_name": dataset_name,
            "dataset_description": dataset_description,
            "dataset_notes": dataset_notes,
            "dataset_keywords":
            None,  # TODO: Add explicit keywords for datasets through ML
            "dataset_tags": dataset_tags,
            "dataset_author": dataset_author,
            "time_ingested": calendar.timegm(time.gmtime()),
            "time_created": dataset_creation_time,
            "dataset_source": dataset_source,
            "dataset_attributes": dataset_attributes,
            "dataset_num_docs": len(es_documents),
        }
        self.es.bulk_upload(
            [Metadocument(metadocument, dataset_name).get_es_document()])
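A minimal sketch of how this method might be called; the Ingester owner class, its Elasticsearch wiring, and the sample values are assumptions, not part of the example above.

import calendar
import time

ingester = Ingester()  # hypothetical class that owns self.es and the ingest() method above
ingester.ingest(
    dataset_name="city_budgets",
    dataset_source="https://example.com/city_budgets.csv",  # a URL, so online=True
    dataset_description="Annual budget line items per city",
    dataset_author="Finance Department",
    dataset_notes="Amounts are in USD",
    dataset_creation_time=calendar.timegm(time.gmtime()),
    dataset_tags=["finance", "budgets"],
    online=True,
)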
Example #2
    def load_docs(self, fname, nrows=None):
        """Load tab-separated (id, text) lines from fname into self.docs_id,
        stopping after nrows lines when nrows is given."""
        cntr = 0
        with open(fname, 'r') as f:
            for line in f:
                # strip the trailing newline and split on the first tab only,
                # so tabs inside the text don't break the unpacking
                idx, text = line.rstrip("\n").split("\t", 1)
                idx = int(idx)
                doc = Document('')
                doc.set_tokens(text)
                doc.set_id(idx)
                self.docs_id[idx] = doc

                cntr += 1
                if nrows is not None and cntr == nrows:
                    break
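A minimal sketch of the input load_docs() expects; the Corpus owner class (which holds the docs_id dict) and the file name are assumptions.

# docs.tsv: one document per line, formatted as <integer id><TAB><text>
#   0\tthe quick brown fox ...
#   1\tjumps over the lazy dog ...
corpus = Corpus()  # hypothetical class that owns docs_id = {}
corpus.load_docs("docs.tsv", nrows=1000)  # stop after the first 1000 lines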
Example #3
    def create_from_docs(self, docs_json):
        # time and log
        start = time.time()
        self.logger.info("Creating documents...")

        # init variables
        self.docs = [None] * len(docs_json)

        # load documents and tokenize; keys are assumed to be the string
        # indices "0".."len(docs_json)-1", so each doc lands in its own slot
        for i, key in enumerate(docs_json.keys()):
            progbar(i, len(self.docs), 20)
            doc = Document(int(key), docs_json[key])
            self.docs[int(key)] = doc

        end = time.time()
        self.logger.info("Creating document complete. elapsed time: " +
                         str(end - start) + " secs")
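A minimal sketch of the expected docs_json shape; the corpus owner object is again an assumption. Keys must be the string indices "0".."len-1", since each document is written to self.docs[int(key)].

docs_json = {
    "0": "text of the first document ...",
    "1": "text of the second document ...",
}
corpus.create_from_docs(docs_json)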
Example #4
def assign_text_to_speaker(body, doc_graph):
    """ Fills a dictionary with each speaker's role, job, and respective text.
    The full call transcript is spread across the values of the dictionary.
    """

    sections = subdivide(body, "^=+")
    # regex pattern for matching headers of each section
    header_pattern = re.compile(r"^.*[^\n]", re.MULTILINE)

    # regex pattern for matching the sections that contain
    # the list of attendees (those that start with asterisks)
    ppl_pattern = re.compile(r"^(\s+\*)(.+)(\s.*)", re.MULTILINE)

    # regex pattern for matching sections with subsections in them
    dash_pattern = re.compile(r"^-+", re.MULTILINE)

    ppl_d = dict()
    talks_d = dict()

    header = []
    # Step 2: handle each section like a switch case
    for section in sections:
        # Handle headers
        if len(section.split('\n')) == 1:  # a one-line section is likely just a header
            header = header_pattern.match(section).string  # .string is the full section text

        # Handle attendees/presenters
        elif ppl_pattern.match(section):
            ppls = ppl_pattern.findall(section)
            d = {key.strip(): value.strip() for (_, key, value) in ppls}

            # assuming that if the previous section was detected as a header, then this section will relate
            # to that header
            if header:
                for key, value in d.items():
                    d[key] = [value, header]
            ppl_d.update(d)

        # Handle Presentations/Q&A subsections
        elif dash_pattern.findall(section):
            heading, d = process_dashed_sections(section)
            talks_d.update({heading: d})
            for speaker, text in d.items():
                if 'operator' in speaker.lower():
                    continue  # skip the operator's boilerplate
                doc = Document(text=text)
                doc_graph.add_node(doc, text)

        # Otherwise it's just some free-form text.
        else:
            # assuming that if the previous section was detected as a header,
            # then this section relates to that header
            if header:
                talks_d.update({header: section})

    # Assign the talk material (as a list) to the appropriate attendee/presenter; still works if no match is found.
    for key, value in talks_d.items():
        talks_d[key] = assign_attendee(value, ppl_d, doc_graph)

    return talks_d, doc_graph
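A minimal sketch of a call; the transcript file name and the DocGraph class are assumptions (doc_graph only needs the add_node(doc, text) method used above).

doc_graph = DocGraph()  # hypothetical graph; only add_node(doc, text) is required
with open("earnings_call.txt") as f:  # hypothetical transcript with "====" section separators
    body = f.read()
talks_d, doc_graph = assign_text_to_speaker(body, doc_graph)
print(sorted(talks_d))  # section headings / speakers found in the call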
Example #5
File: doc_mock.py Project: go3tt/rest
def doc_dummy():
    # return a mock Document ('erstes valid' is German for "first valid";
    # BESCHREIBUNG is a module-level description constant)
    return Document('erstes valid', BESCHREIBUNG, 0)