def ingest(self, dataset_name, dataset_source, dataset_description,
           dataset_author, dataset_notes, dataset_creation_time,
           dataset_tags, online=True):
    """
    Clean, parse, and upload a dataset to our database.

    :param dataset_name: Name of the dataset.
    :param dataset_source: Source of the dataset (i.e. filename or URL).
    :param dataset_description: Description of the dataset.
    :param dataset_author: Author of the dataset.
    :param dataset_notes: Any notes on the dataset by us.
    :param dataset_creation_time: Time the dataset was created.
    :param dataset_tags: Tags associated with the dataset.
    :param online: Whether the source is a URL (online) or a local file
        (offline).
    """
    if not CSVParser.is_csv(dataset_source):
        # Bail out early: the metadocument below depends on the parsed
        # documents, which only exist for CSV sources.
        print("Unsupported file format.")
        return

    if online:
        raw_documents = CSVParser.convert_csv_url_to_json_list(dataset_source)
    else:
        raw_documents = CSVParser.convert_csv_file_to_json_list(dataset_source)

    dataset_attributes = raw_documents[0].keys()
    es_documents = [
        Document(dataset_name, raw_document).get_es_document()
        for raw_document in raw_documents
    ]
    self.es.bulk_upload(es_documents)

    metadocument = {
        "dataset_name": dataset_name,
        "dataset_description": dataset_description,
        "dataset_notes": dataset_notes,
        # TODO: Add explicit keywords for datasets through ML
        "dataset_keywords": None,
        "dataset_tags": dataset_tags,
        "dataset_author": dataset_author,
        "time_ingested": calendar.timegm(time.gmtime()),
        "time_created": dataset_creation_time,
        "dataset_source": dataset_source,
        "dataset_attributes": dataset_attributes,
        "dataset_num_docs": len(es_documents),
    }
    self.es.bulk_upload(
        [Metadocument(metadocument, dataset_name).get_es_document()])
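# Usage sketch (hedged): the enclosing class name `DatasetIngester` and every
# example value below are illustrative assumptions, not part of the code
# above; only the ingest() signature is taken from it.
#
# ingester = DatasetIngester()
# ingester.ingest(
#     dataset_name="air_quality",
#     dataset_source="https://example.com/air_quality.csv",
#     dataset_description="Hourly air-quality readings.",
#     dataset_author="Jane Doe",
#     dataset_notes="Ingested for testing.",
#     dataset_creation_time=calendar.timegm(time.gmtime()),  # epoch seconds
#     dataset_tags=["environment", "air"],
#     online=True,
# )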
def load_docs(self, fname, nrows=None):
    cntr = 0
    with open(fname, 'r') as f:
        for line in f:
            # Each line is a tab-separated "id<TAB>text" record; strip the
            # trailing newline so it does not end up in the document text.
            idx, text = line.rstrip("\n").split("\t")
            idx = int(idx)
            doc = Document('')
            doc.set_tokens(text)
            doc.set_id(idx)
            self.docs_id[idx] = doc
            cntr += 1
            if nrows is not None and cntr == nrows:
                break
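# Usage sketch (hedged): the file name and `corpus` object are illustrative;
# `corpus` stands for an instance of the enclosing class, which must already
# provide the docs_id mapping used above.
#
# with open("docs.tsv", "w") as f:
#     f.write("0\tfirst document text\n")
#     f.write("1\tsecond document text\n")
# corpus.load_docs("docs.tsv", nrows=2)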
def create_from_docs(self, docs_json):
    # time and log
    start = time.time()
    self.logger.info("Creating documents...")

    # init variables
    self.docs = [None] * len(docs_json)

    # load documents and tokenize
    for i, key in enumerate(docs_json.keys()):
        progbar(i, len(self.docs), 20)
        doc = Document(int(key), docs_json[key])
        self.docs[int(key)] = doc

    end = time.time()
    self.logger.info("Creating documents complete. Elapsed time: " +
                     str(end - start) + " secs")
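# Usage sketch (hedged): because each int(key) is used directly as an index
# into self.docs, docs_json is assumed to map the string indices "0".."n-1"
# to document contents. The input below is illustrative only.
#
# docs_json = {"0": "first document text", "1": "second document text"}
# corpus.create_from_docs(docs_json)  # corpus: instance of the enclosing class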
def assign_text_to_speaker(body, doc_graph):
    """
    Fills a dictionary with each speaker's role, job, and respective text.
    The full call transcript is populated throughout the values of the
    dictionary.
    """
    sections = subdivide(body, "^=+")

    # regex pattern for matching the header of each section
    header_pattern = re.compile(r"^.*[^\n]", re.MULTILINE)

    # regex pattern for matching the sections that contain the list of
    # attendees (those that start with asterisks)
    ppl_pattern = re.compile(r"^(\s+\*)(.+)(\s.*)", re.MULTILINE)

    # regex pattern for matching sections with subsections in them
    dash_pattern = re.compile(r"^-+", re.MULTILINE)

    ppl_d = dict()
    talks_d = dict()
    header = []

    # Handle each section like a switch case
    for section in sections:
        # Handle headers
        if len(section.split('\n')) == 1:  # likely to match only a header
            header = header_pattern.match(section).group()
        # Handle attendees/presenters
        elif ppl_pattern.match(section):
            ppls = ppl_pattern.findall(section)
            d = {key.strip(): value.strip() for (_, key, value) in ppls}
            # assuming that if the previous section was detected as a header,
            # then this section will relate to that header
            if header:
                for key, value in d.items():
                    d[key] = [value, header]
            ppl_d.update(d)
        # Handle Presentations/Q&A subsections
        elif dash_pattern.findall(section):
            heading, d = process_dashed_sections(section)
            talks_d.update({heading: d})
            for speaker, text in d.items():
                if 'operator' in speaker.lower():
                    continue
                doc = Document(text=text)
                doc_graph.add_node(doc, text)
        # Else it's just some random text.
        else:
            # assuming that if the previous section was detected as a header,
            # then this section will relate to that header
            if header:
                talks_d.update({header: section})

    # Assign the talk material (as a list) to the appropriate
    # attendee/presenter. Still works if no match is found.
    for key, value in talks_d.items():
        talks_d[key] = assign_attendee(value, ppl_d, doc_graph)

    return talks_d, doc_graph
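# Usage sketch (hedged): `body` is the raw transcript text, with sections
# separated by lines of "=" characters (the "^=+" pattern passed to
# subdivide), attendee lists introduced by leading asterisks, and dashed
# subsections for Presentations/Q&A. `doc_graph` is assumed to be a graph
# object exposing the add_node(doc, text) method used above; `transcript_text`
# is a placeholder for the caller's transcript string.
#
# talks_d, doc_graph = assign_text_to_speaker(transcript_text, doc_graph)
# for speaker, talk in talks_d.items():
#     print(speaker, talk)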
def doc_dummy():
    # BESCHREIBUNG (German for "description") is an identifier from the
    # surrounding test code and is kept as-is.
    return Document('first valid', BESCHREIBUNG, 0)