# Imports needed by the functions below. The import paths for document_pb2
# and the repo helpers (build_synthetic_qrels, write_to_parquet_content)
# depend on the package layout.
import os
import pickle
import time

import document_pb2  # generated protobuf module; import path depends on repo layout


def get_desc(doc_bytearray):
    """ Return '<doc_name>: <first sentence of first content>.' for a pickled Document. """
    doc = document_pb2.Document().FromString(pickle.loads(doc_bytearray))
    try:
        return '{}: {}.'.format(doc.doc_name,
                                doc.document_contents[0].text.split(".")[0])
    except IndexError:
        # Document has no contents, so return the name with an empty description.
        return '{}: .'.format(doc.doc_name)
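
# Usage sketch for get_desc(); `print_sample_descs` is a hypothetical helper,
# and it assumes the same 'doc_bytearray' parquet column that build_qrels()
# below reads.
def print_sample_descs(spark, parquet_path, n=5):
    """ Print short descriptions for the first n documents in a parquet DF. """
    for row in spark.read.parquet(parquet_path).select('doc_bytearray').take(n):
        print(get_desc(row['doc_bytearray']))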

def build_qrels(spark, parquet_path, qrels_path, doc_count=1000, qrels_type='tree'):
    """ Build qrels (tree or hierarchical) from 'doc_count' preprocessed documents. """
    # Read processed DF - each row is a TREC CAR Page as a pickled, serialised protobuf message.
    df = spark.read.parquet(parquet_path)
    # Sample preprocessed data (small overshoot so at least 'doc_count' rows survive the sample).
    fraction = (doc_count / df.count()) + 0.001
    df_sample = df.sample(withReplacement=False, fraction=fraction)
    # Unpack the sampled rows into Document messages.
    doc_bytearray_list = df_sample.limit(doc_count).select('doc_bytearray').collect()
    document_list = [
        document_pb2.Document().FromString(pickle.loads(doc_bytearray[0]))
        for doc_bytearray in doc_bytearray_list
    ]
    # Build qrels (tree or hierarchical).
    build_synthetic_qrels(document_list=document_list,
                          path=qrels_path,
                          qrels_type=qrels_type)
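
# Usage sketch for build_qrels(); the SparkSession configuration and both
# file paths are illustrative assumptions, not values from the original
# pipeline.
def build_example_qrels():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('build_qrels_example').getOrCreate()
    build_qrels(spark,
                parquet_path='./data/preprocessed_docs.parquet',
                qrels_path='./data/synthetic_tree.qrels',
                doc_count=1000,
                qrels_type='tree')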

def write_content_data_to_dir(spark, read_path, dir_path, num_contents=1, chunks=10000, write_output=True):
    """ Create document-content parquet DFs by unpacking preprocessed document data. """
    # Create a new dir to store the data chunks.
    if not os.path.isdir(dir_path) and write_output:
        print('making dir: {}'.format(dir_path))
        os.mkdir(dir_path)

    # Read preprocessed document data.
    df = spark.read.parquet(read_path)
    n = int(df.select("index").rdd.max()[0])

    content_data = []
    chunk = 0
    t_start = time.time()
    # Write chunks of data to files.
    for i in range(0, n + 1, chunks):
        # Stop once 'num_contents' indices have been processed.
        if i >= num_contents:
            break

        for df_doc in df.where(df.index.between(i, i + chunks)).collect():
            doc_id = df_doc[0]
            dataset = df_doc[1]
            doc = document_pb2.Document().FromString(pickle.loads(df_doc[3]))
            for doc_content in doc.document_contents:
                # Add a bytearray of the pickled, serialised DocumentContent message.
                content_data.append([
                    str(doc_content.content_id),
                    str(doc_content.content_type),
                    doc_id,
                    dataset,
                    bytearray(pickle.dumps(doc_content.SerializeToString()))
                ])

        if write_output:
            print('----- STEP {} -----'.format(i))
            time_delta = time.time() - t_start
            print('time elapsed: {} --> time / page: {}'.format(
                time_delta, time_delta / (i + 1)))
            write_to_parquet_content(data=content_data, dir_path=dir_path, chunk=chunk)
            # Begin a new list for the next chunk.
            content_data = []
            chunk += 1

    if write_output and (len(content_data) > 0):
        print('WRITING FINAL FILE: {}'.format(i))
        write_to_parquet_content(data=content_data, dir_path=dir_path, chunk=chunk)
        time_delta = time.time() - t_start
        print('PROCESSED DATA: {} --> processing time / page: {}'.format(
            time_delta, time_delta / (i + 1)))
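
# Usage sketch for write_content_data_to_dir(); paths and parameter values
# are illustrative assumptions. Note that num_contents bounds the row index
# at which chunking stops rather than an exact content count.
def write_example_content_chunks(spark):
    write_content_data_to_dir(spark,
                              read_path='./data/preprocessed_docs.parquet',
                              dir_path='./data/content_chunks',
                              num_contents=50000,
                              chunks=10000)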

def get_top_ents(doc_bytearray):
    """ Return the ids of a pickled Document's most frequently linked entities. """
    synthetic_entity_link_totals = document_pb2.Document().FromString(
        pickle.loads(doc_bytearray)).synthetic_entity_link_totals
    link_counts = []
    for synthetic_entity_link_total in synthetic_entity_link_totals:
        entity_id = str(synthetic_entity_link_total.entity_id)
        # Total link count = sum of anchor-text frequencies for this entity.
        count = sum(i.frequency
                    for i in synthetic_entity_link_total.anchor_text_frequencies)
        link_counts.append((entity_id, count))
    # Sort by count (descending) and keep the top nine entity ids.
    return [i[0] for i in sorted(link_counts, key=lambda x: x[1], reverse=True)][:9]
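
# Usage sketch for get_top_ents(): print the most-linked entity ids for each
# sampled document. `rows` is assumed to be a collected 'doc_bytearray'
# column, as in build_qrels() above.
def print_top_ents(rows):
    for row in rows:
        print(get_top_ents(row['doc_bytearray']))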

def parse_article_to_protobuf(self, article):
    """ Parse an article dict ('id', 'title', 'text') into a Document message. """
    # Initialise empty message.
    self.document = document_pb2.Document()
    self.document.doc_id = article['id']
    self.document.doc_name = article['title']

    # Store the full article text as a single DocumentContent.
    document_content = document_pb2.DocumentContent()
    document_content.content_id = article['id']
    document_content.content_type = 1  # content_type enum value 1; see the mapping in the .proto definition
    document_content.text = article['text']
    self.document.document_contents.append(document_content)

    # Add entity-link metadata derived from the article.
    self.__add_rel_entity_links()
    self.__add_entity_link_totals()

    return self.document
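
# Usage sketch for parse_article_to_protobuf(); `WikiParser` is a stand-in
# for whichever class defines the method above, and the article dict mirrors
# the 'id'/'title'/'text' keys the method reads:
#
#   parser = WikiParser()
#   article = {'id': 'enwiki:Anarchism',
#              'title': 'Anarchism',
#              'text': 'Anarchism is a political philosophy. ...'}
#   doc = parser.parse_article_to_protobuf(article)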

def get_first_para(doc_bytearray):
    """ Return the text of the first content of a pickled Document, or "". """
    doc = document_pb2.Document().FromString(pickle.loads(doc_bytearray))
    try:
        return str(doc.document_contents[0].text)
    except IndexError:
        # Document has no contents.
        return ""
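
# Usage sketch: get_first_para() can be wrapped as a Spark UDF to add a
# first-paragraph column to the preprocessed DF. The UDF wrapper below is an
# assumption; the 'doc_bytearray' column name follows build_qrels() above.
def add_first_para_column(df):
    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType
    first_para_udf = udf(get_first_para, StringType())
    return df.withColumn('first_para', first_para_udf(df.doc_bytearray))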