def __init__(self, synonyms_collection_filepath, frames_collection_filepath,
             init_ner=True, init_stemmer=True, init_frames=True,
             use_ner_cache_only=False, ner_name=supported.ONTONOTES_BERT_MULT_NAME):
    """Initialize the processing pipeline components.

    Args:
        synonyms_collection_filepath (str): path to the synonyms collection file.
        frames_collection_filepath (str): path to the frames collection file.
        init_ner (bool): whether to instantiate the NER model.
        init_stemmer (bool): whether to create the default stemmer
            (also enables the POS tagger).
        init_frames (bool): whether to load the frames collection
            (frame variants additionally require the stemmer).
        use_ner_cache_only (bool): when True, the NER model itself is never
            instantiated even if `init_ner` is True.
        ner_name (str): identifier of the NER backend to resolve via `Default`.
    """
    assert (isinstance(init_ner, bool))
    assert (isinstance(init_frames, bool))
    assert (isinstance(init_stemmer, bool))
    # Consistency fix: validate this flag like every other boolean parameter.
    assert (isinstance(use_ner_cache_only, bool))
    assert (isinstance(ner_name, str))

    # Pre-initialize every optional component to None so that attribute access
    # is always safe regardless of which init_* flags were passed.
    self.__auth_objects = None
    self.__use_ner_cache_only = use_ner_cache_only
    self.__synonyms = None
    self.__stemmer = None
    self.__frame_variants = None
    self.__frames = None
    self.__pos_tagger = None
    self.__syntax = None
    self.__use_auth_list = False
    self.__frames_cache = None
    # Fix: previously __frames_helper was only assigned inside the
    # init_frames branch, so with frames disabled the attribute did not
    # exist at all (AttributeError on access instead of None).
    self.__frames_helper = None

    # NER
    self.__ner_cache = None
    self.__ner_class_type = Default.get_class_by_ner_name(ner_name)
    self.__ner = None

    if init_stemmer:
        self.__stemmer = Default.create_default_stemmer()

    # POS tagging is only available on top of an initialized stemmer.
    if self.__stemmer is not None:
        self.__pos_tagger = POSMystemWrapper(self.__stemmer.MystemInstance)

    if init_frames:
        self.__frames = Default.create_default_frames_collection(
            frames_collection_filepath)

    # Frame variants require both the frames collection and the stemmer.
    if self.__stemmer is not None and self.__frames is not None:
        self.__frame_variants = Default.create_default_frame_variants_collection(
            frames=self.__frames, stemmer=self.__stemmer)

    if self.__frame_variants is not None:
        self.__frames_helper = FramesHelper(self.__frame_variants)

    # Skip model creation entirely in cache-only mode.
    if init_ner and not use_ner_cache_only:
        self.__ner = self.__ner_class_type()

    self.__synonyms = Default.create_default_synonyms_collection(
        filepath=synonyms_collection_filepath,
        stemmer=None if self.DISABLE_LEMMA_FOR_SYNONYMS else self.__stemmer)

    self.__auth_objects = AuthorizedObjectsCollection(OrderedDict())
def expand_with_neutral(self, from_zip_filepath, cache_dir, log_filepath,
                        used_locations_filepath, neut_opin_stat_filepath):
    """Annotate a zipped collection with neutral opinions and re-archive it.

    Extracts `from_zip_filepath` into `cache_dir`, runs neutral-opinion
    annotation over the collection, writes log/statistics files, and packs
    the updated cache directory into a sibling zip archive.

    Args:
        from_zip_filepath (str): source zip archive with the collection.
        cache_dir (str): working directory (wiped before extraction).
        log_filepath (str): destination for the annotation log.
        used_locations_filepath (str): destination for location usage counts.
        neut_opin_stat_filepath (str): destination for opinion statistics.
    """
    # Local import keeps the (unseen) file header untouched.
    import shutil

    assert (isinstance(from_zip_filepath, str))
    assert (isinstance(cache_dir, str))
    assert (isinstance(log_filepath, str))
    assert (isinstance(used_locations_filepath, str))
    assert (isinstance(neut_opin_stat_filepath, str))

    # Clearing folder and filling with fresh data.
    # Fix: os.system('rm -rf ...') built a shell command from an unescaped
    # path (breaks on spaces, shell-injection vector, non-portable).
    # ignore_errors=True preserves rm -rf's tolerance of a missing dir.
    shutil.rmtree(cache_dir, ignore_errors=True)

    # Extract everything into cache_dir.
    with zipfile.ZipFile(from_zip_filepath, 'r') as zip_input:
        zip_input.extractall(cache_dir)

    # Reading synonyms collection.
    print("Reading synonyms collection ...")
    stemmer = Default.create_default_stemmer()
    synonyms_filepath = join(cache_dir, "synonyms.txt")
    synonyms = SynonymsCollection.from_file(synonyms_filepath, stemmer=stemmer)

    # Initialize all the related from synonyms collection information.
    self.__init_from_synonyms(synonyms)

    # Start neutral opinion annotation process.
    print("Run processing ...")
    source_filepath = join(cache_dir, "collection.txt")
    target_filepath = join(cache_dir, "collection-neut.txt")
    with open(source_filepath, 'r') as f_src, open(target_filepath, 'w') as f_to:
        neut_logger = self.__process(f_src=f_src, f_to=f_to)

    # Replacing old file in cache dir with a new one.
    # Fix: os.system('mv ...') replaced by os.replace (same-directory rename
    # with mv's overwrite semantics, no shell involved).
    os.replace(target_filepath, source_filepath)

    # Saving everything in a new archive file.
    target_zip_filepath = join(dirname(from_zip_filepath),
                               get_target_filename(from_zip_filepath))

    if neut_logger is not None:
        with open(log_filepath, 'w') as f:
            for line in neut_logger.iter_data():
                f.write(line)

    with open(used_locations_filepath, 'w') as f:
        for key, value in sorted(self.__used_locations.items(),
                                 key=lambda pair: pair[1]):
            f.write("'{entry}': {count}\n".format(entry=key, count=value))

    self.__opin_stat_printer.print_statistic(neut_opin_stat_filepath)

    archive_all_files_in_zip(to_zip_filepath=target_zip_filepath,
                             source_dir=cache_dir)
# Register every command-line argument with the parser (order preserved).
for cli_arg in (NewsSourceDirArg,
                SourceNewsReaderArg,
                NewsStartFromIndexArg,
                ParseFramesInSentencesArgs,
                RuSentiFramesCacheArgs):
    cli_arg.add_argument(parser)

# Parsing arguments.
args = parser.parse_args()

# Reading arguments.
src_dir = NewsSourceDirArg.read_argument(args)
reader = SourceNewsReaderArg.read_argument(args)
frames_filepath = RuSentiFramesCacheArgs.read_argument(args)
parse_frames_in_sents = ParseFramesInSentencesArgs.read_argument(args)
start_from_index = NewsStartFromIndexArg.read_argument(args)

# Build the linguistic components required by the cache runner.
stemmer = Default.create_default_stemmer()
frames = Default.create_default_frames_collection(frames_filepath)
f_var = Default.create_default_frame_variants_collection(frames=frames,
                                                         stemmer=stemmer)

# Launch the frames-caching pipeline over the source news collection.
run_frames_cache(reader=reader,
                 src_dir=src_dir,
                 version=basename(frames_filepath),
                 frames=frames,
                 frames_helper=FramesHelper(f_var),
                 stemmer=stemmer,
                 parse_frames_in_sentences=parse_frames_in_sents,
                 start_from_index=start_from_index,
                 miniter_count=2000000)