def add_dataset(self, dataset):
    self._check_dataset(dataset)
    self.dataset = dataset
    logfile = "index.log.dataset.%s.txt" % self.dataset
    self.log = open(os.path.join(self.idx_dir, logfile), 'w')
    self.classify_dir = os.path.join(corpus, 'data', 't2_classify', dataset)
    fname = os.path.join(self.classify_dir, 'classify.MaxEnt.out.s2.y.nr')
    fh = open_input_file(fname)
    years = {}
    terms = {}
    self.log.write("$ python %s\n\n" % ' '.join(sys.argv))
    self._write_message("Collecting terms...")
    count = 0
    t1 = time.time()
    step = 100000
    for line in fh:
        count += 1
        #if count > 100000: break
        if count % step == 0:
            t2 = time.time()
            self._write_message(
                "  loaded %s classifier lines in %.2f seconds (%sK done)"
                % (step, t2 - t1, count / 1000))
            t1 = t2
        (id, score) = line.rstrip().split("\t")
        (year, doc, term) = id.split("|", 2)
        score = float(score)
        self._update_years_idx(year, doc, years)
        self._update_terms_idx(term, year, score, terms)
    self._write_message("Updating databases...")
    self._update_years_db(years)
    self._update_terms_db(terms)
def add_phr_feats_file(phr_feats_file, s_mallet):
    """Loop through phr_feats_file and add at most the first 30 chunks to
    s_mallet. A chunk is only added if it occurs in the title or abstract."""
    # TODO: was originally imported from run_iclassify, belongs in mallet?
    # TODO: should maybe be in separate utilities file (classifier_utils.py)
    global output_count
    num_lines_output = 0
    # handle compressed or uncompressed files
    s_phr_feats = open_input_file(phr_feats_file)
    # keep first 30 chunks, if they are from title/abstract
    num_chunks = 0
    for line in s_phr_feats:
        if num_chunks >= 30:
            break
        line = line.strip("\n")
        if line.find("TITLE") > 0 or line.find("ABSTRACT") > 0:
            l_data = line.split("\t")
            chunkid = l_data[0]
            year = l_data[1]
            phrase = l_data[2]
            l_feats = l_data[3:]
            key = make_instance_key(chunkid, year, phrase)
            # add dummy "n" as class label
            instance_line = key + " n " + " ".join(l_feats) + "\n"
            output_count += 1
            s_mallet.write(instance_line)
            num_chunks += 1
            num_lines_output += 1
    s_phr_feats.close()
    return num_lines_output
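# Hedged usage sketch (not part of the original module): one way
# add_phr_feats_file() might be called. The file names below are invented and
# the module-level output_count counter is assumed, as in the function above.
def _example_add_phr_feats_file():
    import codecs
    global output_count
    output_count = 0
    s_mallet = codecs.open("out.mallet", "w", encoding="utf-8")
    n = add_phr_feats_file("sample.phr_feats.gz", s_mallet)
    s_mallet.close()
    print "added %d title/abstract chunks (total written: %d)" % (n, output_count)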
def add_file_to_utraining_test_file(fname, s_test, d_phr2label, d_features,
                                    stats, use_all_chunks_p=True,
                                    default_label='n'):
    """Add document features from fname as vectors to s_test. This was
    factored out from make_utraining_test_file() so that it could be called
    by itself."""
    def incr(x): stats[x] += 1
    fh = open_input_file(fname)
    year, doc_id = get_year_and_docid(fname)
    docfeats = generate_doc_feats(fh, doc_id, year)
    for term in sorted(docfeats.keys()):
        feats = docfeats[term][2:]
        # use only the features used by the model
        if d_features:
            feats = [f for f in feats if d_features.has_key(f.split("=")[0])]
        uid = "%s|%s|%s" % (year, doc_id, term.replace(' ', '_'))
        feats = sorted(unique_list(feats))
        if d_phr2label.has_key(term):
            incr('labeled_count')
        else:
            incr('unlabeled_count')
        # include the instance if all chunks are used or if it doesn't have a label
        if use_all_chunks_p == True or not d_phr2label.has_key(term):
            mallet_list = [uid, default_label] + feats
            # mallet line format: "uid label f1 f2 f3 ..."
            mallet_line = u" ".join(mallet_list) + u"\n"
            s_test.write(mallet_line)
            incr('total_count')
    fh.close()
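# Hedged usage sketch (not in the original source): calling
# add_file_to_utraining_test_file(). The path, annotation dictionary and
# feature dictionary are invented; the stats dictionary must be pre-seeded
# with the counters that the function increments.
def _example_add_file_to_utraining_test_file():
    import codecs
    stats = {'labeled_count': 0, 'unlabeled_count': 0, 'total_count': 0}
    d_phr2label = {'floating gate': 'y'}                  # hypothetical labels
    d_features = {'prev_V': True, 'section_loc': True}    # hypothetical model features
    s_test = codecs.open("utest.mallet", "w", encoding="utf-8")
    add_file_to_utraining_test_file("1998/US5787464A.xml.gz", s_test,
                                    d_phr2label, d_features, stats)
    s_test.close()
    print stats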
def read_roles(year):
    roles = {}
    fname = os.path.join(KEYTERMS_CLASS, year,
                         'iclassify.MaxEnt.label.merged.tab')
    print fname
    c = 0
    for line in open_input_file(fname):
        c += 1
        if c % 100000 == 0:
            print c
        #if c > 1000: break
        (id, basename, role, term) = line.rstrip("\n\r\f").split("\t")
        roles[term][role] = roles.setdefault(term, {}).get(role, 0) + 1
    return roles
def collect_terms_in_corpus(corpus):
    t1 = time.time()
    terms = {}
    done = 0
    for line in open(os.path.join(corpus, 'config', FILELIST)):
        done += 1
        if done % 100 == 0:
            print done
        #if done >= 100: break
        fname = line.split()[2]
        fname = os.path.join(corpus, 'data', 'd3_phr_feats', '01', 'files', fname)
        for line in open_input_file(fname):
            term = line.split("\t")[2]
            terms[term] = terms.get(term, 0) + 1
    return terms
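# Hedged sketch (not in the original source): print the 20 most frequent terms
# returned by collect_terms_in_corpus(); the corpus path is invented.
def _example_collect_terms(corpus='/data/patents/en-sample'):
    terms = collect_terms_in_corpus(corpus)
    for term, count in sorted(terms.items(), key=lambda kv: -kv[1])[:20]:
        print "%6d\t%s" % (count, term)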
def run_matcher_on_file(self, fname, fh):
    infile = open_input_file(fname)
    for line in infile:
        (id, year, term, feats) = parse_feats_line(line)
        self.feature_statistics.add(feats)
        prev_V = feats.get('prev_V', None)
        #initial_V = feats.get('initial_V', None)
        #chunk_lead_VBG = feats.get('chunk_lead_VBG', None)
        #if prev_V is not None:
        #    fh.write("%s\t%s\t%s\t%s\n" % (year, id, term, prev_V))
        for pattern in self.patterns:
            matched_features = pattern.matches(feats)
            if matched_features is not None:
                fh.write("%s\t%s\t%s\t%s\t%s\n"
                         % (year, id, pattern.name, term, matched_features))
def count_tokens_in_corpus(corpus):
    t1 = time.time()
    file_count = 0
    sentence_count = 0
    token_count = 0
    done = 0
    for line in open(os.path.join(corpus, 'config', 'files.txt')):
        #if done >= 100: break
        fname = line.split()[2]
        fname = os.path.join(corpus, 'data', 'd2_tag', '01', 'files', fname)
        file_count += 1
        for line in open_input_file(fname):
            sentence_count += 1
            token_count += len(line.split())
        done += 1
    print corpus, file_count, sentence_count, token_count, \
        "(%d seconds)" % (time.time() - t1)
def _process_file(self, fname, fh):
    self.locations = {}
    infile = open_input_file(fname)
    for l in infile:
        parsed_line = parse_feats_line(l)
        year = parsed_line[1]
        term = parsed_line[2]
        feats = parsed_line[3]
        path = year + os.sep + os.path.splitext(parsed_line[0])[0]
        line = feats.get('doc_loc', '-1')
        key = path + "\t" + term
        if not self.locations.has_key(key):
            self.locations[key] = []
        self.locations[key].append(line)
    for key, lines in self.locations.items():
        path, term = key.split("\t", 1)
        fh.write("%s\t%s\t%s\t%s\n" % (path, term, len(lines), ' '.join(lines)))
def collect_counts(dataset, filelist):
    """Return a dictionary mapping each term to the number of documents it
    appeared in. This assumes that the dataset is a d3_phr_feats dataset."""
    counts = {}
    fnames = filename_generator(dataset.path, filelist)
    for fname in fnames:
        if verbose:
            print '[collect_counts]', fname
        # TODO: this is dangerous because it makes assumptions about the
        # directory structure, something similar was the case in step2 for at
        # least the docfeats generation
        year = os.path.basename(os.path.dirname(fname))
        doc_id = os.path.basename(fname)
        with open_input_file(fname) as fh:
            docfeats = generate_doc_feats(fh, doc_id, year)
            for term in docfeats.keys():
                counts[term] = counts.get(term, 0) + 1
    return counts
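# Hedged sketch (not in the original source): write the document frequencies
# from collect_counts() to a tab-separated file. "dataset" stands in for a
# d3_phr_feats dataset object and "files.txt" for a file list, both assumed
# from the surrounding code base; the output file name is invented.
def _example_collect_counts(dataset, filelist='files.txt'):
    import codecs
    counts = collect_counts(dataset, filelist)
    with codecs.open('term_counts.tab', 'w', encoding='utf-8') as out:
        for term in sorted(counts, key=counts.get, reverse=True):
            out.write("%s\t%d\n" % (term, counts[term]))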
def annotate_something(dirname, rconfig, filelist, chunks):
    """This is a stub method that explains a bit more on how to create
    annotation files. Includes scaffolding that shows how to pull information
    out of phrase feature and tag files. This is for cases when you use a list
    of files."""
    # Here is how you get the datasets
    dataset_tags = find_input_dataset(rconfig, 'd2_tag')
    dataset_feats = find_input_dataset(rconfig, 'd3_phr_feats')
    # Check whether files from the file list are available
    check_file_availability(dataset_tags, filelist)
    check_file_availability(dataset_feats, filelist)
    # Next would typically be some way of writing down the information, the
    # following writes general information (command used, corpus directory as
    # well as git commit) and the list of files used. This also creates the
    # output directory.
    write_info(rconfig, dirname, filelist)
    # Now we can get the file names, loop over them, and extract the needed
    # information. The code below is some scaffolding if all you need is in
    # one dataset.
    fnames = filename_generator(dataset_feats.path, filelist)
    for fname in fnames:
        with open_input_file(fname) as fh:
            # extract data from the line, you may want to put it in some
            # temporary data structure
            for line in fh:
                pass
    # And this is what you do if you need information that is distributed over
    # the feature and tag files.
    tag_files = list(filename_generator(dataset_tags.path, filelist))
    feat_files = list(filename_generator(dataset_feats.path, filelist))
    for i in range(len(tag_files)):
        # the FileData object
        fd = FileData(tag_files[i], feat_files[i])
        # all term-related stuff lives in the Term object and its
        # term_instances variable, you can print to the annotation file(s)
        # from here or first build some intermediate data structure and then
        # print the output later
        for term in fd.get_terms():
            term_obj = fd.get_term(term)
def _itrainer_create_dat_file(phr_feats_file, corpus, filelist):
    """Create the keyfeats.ta.dat file, which is a concatenation of all the
    files in filelist, but using only the first 100 terms in each file
    (because annotation does not go beyond those 100)."""
    print "[_itrainer_create_dat_file] creating", phr_feats_file
    print "[_itrainer_create_dat_file] from", corpus
    phr_feats_fh = codecs.open(phr_feats_file, 'w', encoding='utf-8')
    for line in open(filelist):
        (year, full_path, short_path) = line.split()
        # TODO: this is a hack, change this to use the filename generator and
        # the default_config and such
        fname = os.path.join(corpus, 'data/d3_phr_feats/01/files', short_path)
        # + '.gz')
        fh = open_input_file(fname)
        for line in fh:
            term_no = int(line.split()[0].split('_')[1])
            # no need to get too far into the file
            if term_no > 100:
                break
            phr_feats_fh.write(line)
    phr_feats_fh.close()
def make_utraining_file3(self, fnames, d_phr2label, verbose=False):
    """Create a file with training instances for Mallet. The list of phrase
    feature files to use is given in fnames and the annotated terms in
    d_phr2label. Also sets a couple of instance variables with statistics on
    labeled and unlabeled instances and types: stats_unlabeled_count has a
    count of all instances (that is, term-document pairs) in the files in
    fnames without labels, stats_labeled_count has the number of all labeled
    instances, and stats_terms has a dictionary of terms to number of labeled
    instances per term. This method is based on a similarly named function in
    train.py."""
    mallet_file = self.mallet_config.train_mallet_file
    if verbose:
        print "[mallet.make_utraining_file3] writing to", mallet_file
        print "[mallet.make_utraining_file3] features used:", \
            sorted(self.d_features.keys())
    self.stats_labeled_count = 0
    self.stats_labeled_count_y = 0
    self.stats_labeled_count_n = 0
    self.stats_unlabeled_count = 0
    self.stats_terms = {}
    self.stats_terms_y = {}
    self.stats_terms_n = {}
    file_count = 0
    s_train = codecs.open(mallet_file, 'w', encoding='utf-8')
    for phr_feats_file in fnames:
        file_count += 1
        if verbose:
            print "%05d %s" % (file_count, phr_feats_file)
        year, doc_id = get_year_and_docid(phr_feats_file)
        with open_input_file(phr_feats_file) as fh:
            # this hard-wires the use of union train
            docfeats = generate_doc_feats(fh, doc_id, year)
            for term in sorted(docfeats.keys()):
                feats = docfeats[term][2:]
                feats = self.remove_filtered_feats(feats)
                uid = "%s|%s|%s" % (year, doc_id, term.replace(' ', '_'))
                if d_phr2label.has_key(term):
                    label = d_phr2label.get(term)
                    if label == "":
                        print "[mallet.make_utraining_file3] " + \
                              "WARNING: term with null label: %s" % term
                    elif label in ('y', 'n'):
                        self.stats_terms[term] = self.stats_terms.get(term, 0) + 1
                        d = self.stats_terms_y if label == 'y' else self.stats_terms_n
                        d[term] = d.get(term, 0) + 1
                        # mallet line format: "uid label f1 f2 f3 ..."
                        mallet_line = " ".join([uid, label] + feats)
                        s_train.write(mallet_line + "\n")
                        self.stats_labeled_count += 1
                else:
                    self.stats_unlabeled_count += 1
    if verbose:
        print "[make_utraining_file3] labeled instances: %i, unlabeled: %i, labeled types: %i" \
            % (self.stats_labeled_count, self.stats_unlabeled_count,
               len(self.stats_terms))
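# Illustration (not in the original source) of the "uid label f1 f2 ..." line
# format that make_utraining_file3() writes; the year, document id, term and
# feature values below are invented.
def _example_mallet_training_line():
    uid = "1998|US5787464A|floating_gate"     # year|doc_id|term (spaces -> underscores)
    label = "y"                               # label taken from d_phr2label
    feats = ["prev_V=comprises", "section_loc=ABSTRACT_sent1"]
    mallet_line = " ".join([uid, label] + feats)
    print mallet_line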
def create_json_chunks_file(index_name, type_name, corpus, start, end,
                            docs_per_bulk_load=500, section_filter_p=True,
                            write_to_file_p=False):
    # reading from fuse pipeline data
    # writing to local tv corpus dir
    # for years from start to end

    # we'll need the name of the pipeline step to create the directory path to
    # the phr_feats files.
    pipeline_step = "d3_phr_feats"

    # range parameters
    start_year = int(start)
    end_year = int(end)
    start_range = start_year
    end_range = end_year + 1

    # track the time in <year>.log
    log_file = pnames.tv_dir_year_file(corpus_root, corpus, "all", "log")
    s_log = open(log_file, "w")

    log_message = "Starting create_json_chunks_file for years: " + str(start) + " " + str(end)
    time = log.log_current_time(s_log, log_message, True)
    # remember the start_time for computing total time
    start_time = time

    # we'll bulk load all the data for a single year.
    # the argument to elasticsearch bulk is a list of dictionaries
    # alternating metadata and content. We'll build this up in l_colloc_bulk_elements.
    # The output is a list of lists, where each list contains the meta/content
    # elements for n files
    l_colloc_bulk_lists = []
    l_colloc_bulk_elements = []

    d_chunk2prev_Npr = defaultdict(set)
    d_chunk2prev_V = defaultdict(set)
    d_chunk2docs = defaultdict(set)

    for year in range(start_range, end_range):
        # loop through files in file_list_file for the year
        filelist_file = pnames.fuse_filelist(fuse_corpus_root, corpus, year)
        s_file_list = open(filelist_file)

        # track the number of lines output to json file
        num_lines_output = 0
        json_file = pnames.tv_dir(corpus_root, corpus) + str(year) + ".chunks.json"
        s_json = codecs.open(json_file, "w", encoding='utf-8')

        file_count = 0
        for line in s_file_list:

            # if we have reached the file limit for a single bulk api call, add
            # the sublist to l_colloc_bulk_lists and start a new sublist
            if (file_count % docs_per_bulk_load) == 0:
                # mod will be 0 for initial time through loop, so ignore this sublist
                if l_colloc_bulk_elements != []:
                    l_colloc_bulk_lists.append(l_colloc_bulk_elements)
                    l_colloc_bulk_elements = []

            file_count += 1
            line = line.strip("\n")
            # get the date/filename portion of path
            l_line_fields = line.split("\t")
            # get the rest of the file path (publication_year/id.xml)
            pub_year_and_file = l_line_fields[2]
            # extract patent_id from the filename (e.g. US5787464A from 1998/020/US5787464A.xml)
            patent_id = os.path.splitext(os.path.basename(pub_year_and_file))[0]

            phr_feats_file = pnames.fuse_phr_feats_file(fuse_corpus_root, corpus,
                                                        pipeline_step, year,
                                                        pub_year_and_file)

            #print "[invention]opening phr_feats: %s, id: %s" % (phr_feats_file, patent_id)
            #sys.exit()

            #s_phr_feats = codecs.open(phr_feats_file, encoding='utf-8')
            # handle compressed or uncompressed files
            s_phr_feats = open_input_file(phr_feats_file)

            # we need to combine all the chunks from a single sentence into one output entry
            l_chunks = []
            # assume the first sent_no in a document will always be 0
            last_sent_no = "0"
            for line in s_phr_feats:
                # todo make into regex ///
                if not(section_filter_p) or line.find("TITLE") > 0 \
                        or line.find("ABSTRACT") > 0 or line.find("SUMMARY") > 0:
                    # then process the line
                    l_data = line.split("\t")
                    # save chunk as phrase with "_" instead of blank connecting tokens
                    chunk = l_data[2].replace(" ", "_")
                    # extract the value field from the doc_loc feature to get the sent_no
                    sent_no = p_doc_loc.search(line).group(1)
                    # populate chunk dictionaries (store the matched group, not
                    # the match object)
                    d_chunk2docs[chunk].add(patent_id)
                    prev_V = p_prev_V.search(line)
                    if prev_V != None:
                        d_chunk2prev_V[chunk].add(prev_V.group(1))
                    prev_Npr = p_prev_Npr.search(line)
                    if prev_Npr != None:
                        d_chunk2prev_Npr[chunk].add(prev_Npr.group(1))

                    if sent_no == last_sent_no:
                        l_chunks.append(chunk)
                    else:
                        # we are done with the sentence, so write out the chunk list
                        json_string = format_colloc_chunks2json(patent_id, year,
                                                                last_sent_no, l_chunks)
                        uid = "_".join([patent_id, last_sent_no])
                        #print "last_sent_no: %s, chunks: %s, json: %s" % (last_sent_no, l_chunks, json_string)
                        # note the above print gives an error for non-ascii chars.
                        if write_to_file_p:
                            # make a json file with all the data to be loaded into elasticsearch
                            s_json.write("%s\n" % json_string)
                        l_colloc_bulk_elements.append(format_d_action(index_name, type_name, uid))
                        l_colloc_bulk_elements.append(format_colloc_d_content(patent_id, year,
                                                                              last_sent_no, l_chunks))
                        # keep the current chunk
                        l_chunks = [chunk]
                        last_sent_no = sent_no
                        num_lines_output += 1

            # output the last line
            json_string = format_colloc_chunks2json(patent_id, year, last_sent_no, l_chunks)
            # recompute the uid for the final sentence of the document
            uid = "_".join([patent_id, last_sent_no])
            #print "last_sent_no: %s, chunks: %s, json: %s" % (last_sent_no, l_chunks, json_string)
            s_json.write("%s\n" % json_string)
            l_colloc_bulk_elements.append(format_d_action(index_name, type_name, uid))
            l_colloc_bulk_elements.append(format_colloc_d_content(patent_id, year,
                                                                  last_sent_no, l_chunks))
            num_lines_output += 1

            #"""
            # stop after n files for debugging
            if file_count > 3000:
                break
            #"""

            s_phr_feats.close()

        # add the remaining elements to l_colloc_bulk_lists
        l_colloc_bulk_lists.append(l_colloc_bulk_elements)

        print "[docs.py]%i lines from %i files written to %s" % (num_lines_output,
                                                                 file_count, json_file)
        s_json.close()
        s_log.close()
        s_file_list.close()

    """
    # unfinished section to create chunk index
    # prepare data for chunk index
    for chunk in d_chunk2docs.keys():
        l_docs = d_chunk2docs[chunk]
        l_prev_V = d_chunk2prev_V[chunk]
        l_prev_Npr = d_chunk2prev_Npr[chunk]
    """

    # todo: eventually, return two lists
    return(l_colloc_bulk_lists)
def process_doc(self, filter_p=True, chunker_rules='en'):
    """Process the doc, creating all potential technology chunks and
    calculating their features."""
    debug_p = False
    if debug_p:
        print "[process_doc] filter_p: %s, writing to %s" % \
            (filter_p, self.output)
    s_input = open_input_file(self.input)
    s_output = open_output_file(self.output, compress=self.compress)
    section = "FH_NONE"   # default section if document has no section header lines
    self.d_field[section] = []
    sent_no_in_section = 0
    for line in s_input:
        line = line.strip("\n")
        if debug_p:
            print "[process_doc] line: %s" % line
        if line[0:3] == "FH_":
            # we are at a section header; note we have to strip off both
            # final ':' and whitespace, since in some cases eg. Chinese
            # segmentation, the colon will be separated from the header term
            # by a blank.
            section = line.split("_")[1].rstrip(": ")
            self.d_field[section] = []
            sent_no_in_section = 0
        else:
            # process the sentence, the line is a list of token_tag pairs
            if section == "TITLE" or section == "ABSTRACT":
                self.l_lc_title_noun.extend(lc_nouns(line))
            # call the appropriate Sentence subclass based on the language
            sent_args = [self.next_sent_id, section, sent_no_in_section,
                         line, self.chunk_schema]
            sent = sentence.get_sentence_for_lang(self.lang, sent_args)
            # get context info
            i = 0
            for chunk in sent.chunk_iter():
                if chunk.label == "tech":
                    # index of chunk start in sentence => ci
                    ci = chunk.chunk_start
                    hsent = sent.highlight_chunk(i)
                    mallet_feature_list = get_features(sent, ci)
                    mallet_feature_list.sort()
                    uid = os.path.basename(self.input) + "_" + str(self.next_chunk_id)
                    metadata_list = [uid, self.year, chunk.phrase.lower()]
                    if debug_p:
                        print "index: %i, start: %i, end: %i, sentence: %s" % \
                            (i, chunk.chunk_start, chunk.chunk_end, sent.sentence)
                    if add_chunk_data(self, chunk, section, filter_p):
                        add_line_to_phr_feats(metadata_list, mallet_feature_list, s_output)
                    chunk.sid = self.next_sent_id
                    self.d_chunk[self.next_chunk_id] = chunk
                    sent.chunks.append(chunk)
                    self.next_chunk_id += 1
                i = chunk.chunk_end
            # keep track of the location of this sentence within the section
            sent_no_in_section += 1
            self.d_field[section].append(sent)
            self.d_sent[self.next_sent_id] = sent
            self.next_sent_id += 1
    s_input.close()
    s_output.close()
    # (tail of a parsing helper, presumably parse_mallet_line(), whose opening
    # lines precede this excerpt)
    #print line,
    #print len(features), features
    (year, fname, term) = id.split('|', 2)
    #print label, term
    return label, term, features


mallet_file = sys.argv[1]
info_file = mallet_file + '.stats.txt'

pos_terms = {}
neg_terms = {}
features = {}
featvals = {}

with open_input_file(mallet_file) as fh:
    count = 0
    for line in fh:
        count += 1
        #if count > 10000: break
        if count % 100000 == 0:
            print count
        label, term, feats = parse_mallet_line(line)
        if label == 'y':
            pos_terms[term] = pos_terms.get(term, 0) + 1
        elif label == 'n':
            neg_terms[term] = neg_terms.get(term, 0) + 1
        for featval in feats:
            feat, val = featval.split('=', 1)
            #if feat == '234_shore': print line
            features[feat] = features.get(feat, 0) + 1
            if not featvals.has_key(feat):
def gen_bulk_lists(index_name, type_name, domain, corpus, start, end,
                   lines_per_bulk_load=100, section_filter_p=True,
                   write_to_file_p=False, max_lines=0):
    # reading from fuse pipeline data
    # writing to local tv corpus dir
    # for years from start to end

    # we'll need the name of the pipeline step to create the directory path to
    # the phr_feats files.
    pipeline_step = "d3_phr_feats"

    ###print "corpus_root: %s, corpus: %s" % (corpus_root, str(corpus))

    # range parameters
    start_year = int(start)
    end_year = int(end)
    start_range = start_year
    end_range = end_year + 1

    # track the time in <year>.log
    log_file = pnames.tv_dir_year_file(corpus_root, corpus, "all", "log")
    s_log = open(log_file, "w")

    log_message = "[es_np.py gen_bulk_lists]Starting make_bulk_lists for years: " + str(start) + " " + str(end)
    time = log.log_current_time(s_log, log_message, True)
    # remember the start_time for computing total time
    start_time = time

    # we'll bulk load all the data for a single year.
    # the argument to elasticsearch bulk is a list of dictionaries
    # alternating metadata and content. We'll build this up in l_bulk_elements.
    # The output is a list of flattened paired elements, where each list
    # contains the meta/content elements for n lines
    #l_bulk_lists = []
    l_bulk_elements = []

    for year in range(start_range, end_range):
        # loop through files in file_list_file for the year
        filelist_file = pnames.fuse_filelist(fuse_corpus_root, corpus, year)
        s_file_list = open(filelist_file)

        # track the number of lines output to json file
        num_lines_output = 0
        json_file = pnames.tv_dir(corpus_root, corpus) + str(year) + ".chunks.json"
        s_json = codecs.open(json_file, "w", encoding='utf-8')

        file_count = 0
        ###pdb.set_trace()
        for line in s_file_list:
            ###pdb.set_trace()
            file_count += 1
            line = line.strip("\n")
            # get the date/filename portion of path
            l_line_fields = line.split("\t")
            # get the rest of the file path (publication_year/id.xml)
            pub_year_and_file = l_line_fields[2]
            # extract patent_id from the filename (e.g. US5787464A from 1998/020/US5787464A.xml)
            patent_id = os.path.splitext(os.path.basename(pub_year_and_file))[0]

            # create a "doc" type entry to be bulk loaded. This will be the
            # parent of both "sent" and "np" records in the index
            l_bulk_elements.append(format_d_action(index_name, "doc", patent_id))
            l_bulk_elements.append(format_doc_d_content(domain, year, patent_id))

            # lists to capture each sent's sheads and sterms
            sheads = []
            sterms = []
            # loc is the sentence number in the document, starting at 0
            current_sent = 0
            # Assume the initial section will be TITLE
            current_section = "TITLE"

            num_lines_output += 1
            # end creating doc index entry

            phr_feats_file = pnames.fuse_phr_feats_file(fuse_corpus_root, corpus,
                                                        pipeline_step, year,
                                                        pub_year_and_file)

            #print "[invention]opening phr_feats: %s, id: %s" % (phr_feats_file, patent_id)
            #sys.exit()

            #s_phr_feats = codecs.open(phr_feats_file, encoding='utf-8')
            # handle compressed or uncompressed files
            s_phr_feats = open_input_file(phr_feats_file)

            for line in s_phr_feats:
                # if we have reached the line limit for a single bulk api call,
                # yield the current sublist and start a new one
                if (num_lines_output % lines_per_bulk_load) == 0:
                    ###print "num_lines_output: %i" % num_lines_output
                    # mod will be 0 for initial time through loop, so ignore this sublist
                    if l_bulk_elements != []:
                        yield l_bulk_elements
                        l_bulk_elements = []

                # todo make into regex ///
                # Note that DESC was added 3/38/15, so indices created earlier
                # do not contain that section.
                if not(section_filter_p) or line.find("TITLE") > 0 \
                        or line.find("ABSTRACT") > 0 or line.find("SUMMARY") > 0 \
                        or line.find("DESC") > 0:
                    # then process the line
                    l_data = line.split("\t")
                    # chunk is phrase with blanks connecting tokens
                    uid = l_data[0]    # uid is doc_id + phrase number
                    phr = l_data[2]    # phrase with whitespace separating words

                    # extract the value field from the doc_loc feature to get the loc (sentence number)
                    loc = p_doc_loc.search(line).group(1)
                    # We will store it as an integer in es
                    loc = int(loc)

                    section = p_section.search(line).group(1)
                    pos = p_pos.search(line).group(1)
                    pos = pos.replace("_", " ")

                    # populate chunk dictionaries
                    prev_V = p_prev_V.search(line)
                    if prev_V != None:
                        # extract the matched string (group 0 is the entire match, while
                        # group 1 is the first parenthesized subexpression in the pattern)
                        prev_V = prev_V.group(1)

                    prev_Npr = p_prev_Npr.search(line)
                    if prev_Npr != None:
                        prev_Npr = prev_Npr.group(1)

                    prev_J = p_prev_J.search(line)
                    if prev_J != None:
                        # extract the matched string (group 0 is the entire match, while
                        # group 1 is the first parenthesized subexpression in the pattern)
                        prev_J = prev_J.group(1)

                    ###pdb.set_trace()
                    l_bulk_elements.append(format_d_action(index_name, "np", uid,
                                                           parent_id=patent_id))
                    d_field_content = format_np_d_content(phr, prev_Npr, prev_V, prev_J,
                                                          domain, year, patent_id,
                                                          loc, section, pos)
                    l_bulk_elements.append(d_field_content)

                    # We will use data in d_field_content to avoid recomputing fields for sent.
                    shead = d_field_content["chead"]
                    sterm = d_field_content["cterm"]
                    # section can change whenever loc changes
                    section = d_field_content["section"]

                    # if loc != current_sent, we need to store a sent record for the current_sent
                    if loc != current_sent:
                        # store the record and start populating a new one
                        sent_id = patent_id + "_" + str(current_sent)
                        l_bulk_elements.append(format_d_action(index_name, "sent", sent_id,
                                                               parent_id=patent_id))
                        l_sent_dict = format_sent_d_content(domain, year, patent_id,
                                                            current_section, current_sent,
                                                            sheads, sterms)
                        l_bulk_elements.append(l_sent_dict)
                        ###print "Adding sent: %s, sent_dict: %s" % (sent_id, l_sent_dict)

                        # re-initialize the sheads and sterms lists
                        sheads = [shead]
                        sterms = [sterm]
                        # increment count for "sent" output
                        num_lines_output += 1
                        # update the current_sent and section
                        current_sent = loc
                        current_section = section
                    else:
                        # we are still in the same sentence.
                        # add the latest term/head to the sent fields for current_sent
                        sheads.append(shead)
                        sterms.append(sterm)

                    # increment count for "np" output
                    num_lines_output += 1

                    # stop after max_lines lines for debugging
                    ###print "num_lines_output: %i, max_lines: %i" % (num_lines_output, max_lines)
                    if (max_lines != 0) and num_lines_output > max_lines:
                        break

            # break out of file loop as well
            if (max_lines != 0) and num_lines_output > max_lines:
                break

            # We need to store a sent record for the last sentence in the file (= current_sent)
            sent_id = patent_id + "_" + str(current_sent)
            ###print "[gen_bulk_list]last sent_id: %s, sheads: %s, sterms: %s\n" % (sent_id, sheads, sterms)
            l_bulk_elements.append(format_d_action(index_name, "sent", sent_id,
                                                   parent_id=patent_id))
            l_bulk_elements.append(format_sent_d_content(domain, year, patent_id,
                                                         current_section, current_sent,
                                                         sheads, sterms))
            num_lines_output += 1

            s_phr_feats.close()

        s_json.close()

        log_message = "[es_np_index.py]Completed make_bulk_lists for years: " \
            + str(start) + " " + str(end) + ". Number of lines: " + str(num_lines_output)
        time = log.log_current_time(s_log, log_message, True)

        s_log.close()
        s_file_list.close()

    # yield the last remaining l_bulk_elements
    print "[gen_bulk_lists]%i lines from %i files written to index %s" \
        % (num_lines_output, file_count, index_name)
    yield(l_bulk_elements)
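# Hedged sketch (not in the original source): one way the generator above
# might be consumed with the elasticsearch-py client, assuming a local
# Elasticsearch instance and that each yielded list already alternates action
# and content dictionaries as expected by the bulk API. The index, domain and
# corpus names are invented.
def _example_load_bulk_lists():
    from elasticsearch import Elasticsearch
    es = Elasticsearch()    # defaults to localhost:9200
    for l_bulk_elements in gen_bulk_lists("test_np_index", "np", "cs", "ln-cs-500k",
                                          1998, 1998, max_lines=1000):
        if l_bulk_elements:
            es.bulk(body=l_bulk_elements, index="test_np_index")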