def __init__(self, synonyms_collection_filepath, frames_collection_filepath,
             init_ner=True, init_stemmer=True, init_frames=True,
             use_ner_cache_only=False,
             ner_name=supported.ONTONOTES_BERT_MULT_NAME):
    assert(isinstance(init_ner, bool))
    assert(isinstance(init_frames, bool))
    assert(isinstance(init_stemmer, bool))
    assert(isinstance(ner_name, str))

    # Default (empty) state.
    self.__auth_objects = None
    self.__use_ner_cache_only = use_ner_cache_only
    self.__synonyms = None
    self.__stemmer = None
    self.__frame_variants = None
    self.__frames = None
    self.__pos_tagger = None
    self.__syntax = None
    self.__use_auth_list = False
    self.__frames_cache = None

    # NER.
    self.__ner_cache = None
    self.__ner_class_type = Default.get_class_by_ner_name(ner_name)
    self.__ner = None

    if init_stemmer:
        self.__stemmer = Default.create_default_stemmer()

    # The POS tagger reuses the Mystem instance of the stemmer.
    if self.__stemmer is not None:
        self.__pos_tagger = POSMystemWrapper(self.__stemmer.MystemInstance)

    if init_frames:
        self.__frames = Default.create_default_frames_collection(
            frames_collection_filepath)

    if self.__stemmer is not None and self.__frames is not None:
        self.__frame_variants = Default.create_default_frame_variants_collection(
            frames=self.__frames, stemmer=self.__stemmer)

    if self.__frame_variants is not None:
        self.__frames_helper = FramesHelper(self.__frame_variants)

    # Instantiate a NER model only when the cache alone is not enough.
    if init_ner and not use_ner_cache_only:
        self.__ner = self.__ner_class_type()

    self.__synonyms = Default.create_default_synonyms_collection(
        filepath=synonyms_collection_filepath,
        stemmer=None if self.DISABLE_LEMMA_FOR_SYNONYMS else self.__stemmer)

    self.__auth_objects = AuthorizedObjectsCollection(OrderedDict())
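# Usage sketch (assumption-heavy): judging by the assertions in the next
# constructor, the class above is most likely the `Settings` consumed there.
# The import path and file paths below are hypothetical placeholders.
from settings import Settings  # hypothetical module path

settings = Settings(
    synonyms_collection_filepath="data/synonyms.txt",      # hypothetical path
    frames_collection_filepath="data/rusentiframes.json",  # hypothetical path
    init_ner=False,           # do not load a NER model ...
    use_ner_cache_only=True)  # ... rely on a prebuilt NER cache instead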
def __init__(self, settings, contexts_printer, opinion_statistic_printer,
             object_statistic_printer, parse_frames_in_news_sentences):
    assert(isinstance(settings, Settings))
    assert(isinstance(contexts_printer, ContextsPrinter))
    assert(isinstance(parse_frames_in_news_sentences, bool))
    assert(isinstance(opinion_statistic_printer, OpinionStatisticBasePrinter))
    assert(isinstance(object_statistic_printer, StatisticObjectsPrinter) or
           object_statistic_printer is None)

    self.__settings = settings
    self.__context_printer = contexts_printer
    self.__opinion_statistic_printer = opinion_statistic_printer
    self.__object_statistic_printer = object_statistic_printer
    self.__parse_frames_in_news_sentences = parse_frames_in_news_sentences
    self.__check_obj_preposition_in_title = True
    self.__text_object_authorizer = TextObjectAuthorizer(ner_type=settings.NERClassType)
    self.__ner_extractor = Default.create_ner_extractor(
        ner=settings.NER,
        ner_cache=settings.NerCache,
        default_auth_check=lambda text_obj: self.__text_object_authorizer.is_auth(text_obj))

    self.__debug_opinions_created = 0
    self.__debug_opinions_with_missed_synonyms = 0
    self.__debug_opinions_looped = 0
    self.__debug_opinions_total_extracted_from_titles = 0
    self.__debug_opinions_rejected_by_preps = 0
    self.__debug_opinions_title_synonymous_existed = 0
def __init__(self, ner_cache, stemmer, default_auth_check):
    assert(isinstance(stemmer, Stemmer))
    self.__stemmer = stemmer
    self.__ner_extractor = Default.create_ner_extractor(
        ner=None,
        ner_cache=ner_cache,
        default_auth_check=default_auth_check)
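# A minimal wiring sketch for the extractor above; its signature matches the
# `TextObjectValuesExtractor` call in the export snippet further below, so the
# same read-only SQLite NER cache is assumed here. The cache path and the
# permissive auth check are illustrative assumptions.
ner_cache = SQLiteNERCacheData.init_as_read_only("data/ner-cache.db")  # hypothetical path
extractor = TextObjectValuesExtractor(
    ner_cache=ner_cache,
    stemmer=Default.create_default_stemmer(),
    default_auth_check=lambda text_obj: True)  # authorize every object (assumption)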
def expand_with_neutral(self, from_zip_filepath, cache_dir, log_filepath,
                        used_locations_filepath, neut_opin_stat_filepath):
    assert(isinstance(from_zip_filepath, str))
    assert(isinstance(cache_dir, str))
    assert(isinstance(log_filepath, str))
    assert(isinstance(used_locations_filepath, str))
    assert(isinstance(neut_opin_stat_filepath, str))

    # Clearing the folder before filling it with fresh data.
    os.system('rm -rf {cache_dir}'.format(cache_dir=cache_dir))

    # Extracting everything into cache_dir.
    with zipfile.ZipFile(from_zip_filepath, 'r') as zip_input:
        zip_input.extractall(cache_dir)

    # Reading synonyms collection.
    print("Reading synonyms collection ...")
    stemmer = Default.create_default_stemmer()
    synonyms_filepath = join(cache_dir, "synonyms.txt")
    synonyms = SynonymsCollection.from_file(synonyms_filepath, stemmer=stemmer)

    # Initializing everything that depends on the synonyms collection.
    self.__init_from_synonyms(synonyms)

    # Starting the neutral opinion annotation process.
    print("Run processing ...")
    source_filepath = join(cache_dir, "collection.txt")
    target_filepath = join(cache_dir, "collection-neut.txt")
    with open(source_filepath, 'r') as f_src:
        with open(target_filepath, 'w') as f_to:
            neut_logger = self.__process(f_src=f_src, f_to=f_to)

    # Replacing the old file in the cache dir with the new one.
    os.system('mv {} {}'.format(target_filepath, source_filepath))

    # Saving everything into a new archive file.
    target_zip_filepath = join(dirname(from_zip_filepath),
                               get_target_filename(from_zip_filepath))

    if neut_logger is not None:
        with open(log_filepath, 'w') as f:
            for line in neut_logger.iter_data():
                f.write(line)

    with open(used_locations_filepath, 'w') as f:
        for key, value in sorted(self.__used_locations.items(),
                                 key=lambda pair: pair[1]):
            f.write("'{entry}': {count}\n".format(entry=key, count=value))

    self.__opin_stat_printer.print_statistic(neut_opin_stat_filepath)

    archive_all_files_in_zip(to_zip_filepath=target_zip_filepath,
                             source_dir=cache_dir)
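# A call sketch for `expand_with_neutral`, assuming an already initialized
# instance named `expander` (hypothetical); all paths below are placeholders.
# The source archive is expected to contain `synonyms.txt` and
# `collection.txt`, since the method reads both from the unpacked cache dir.
# Note that `cache_dir` is wiped with `rm -rf` before extraction.
expander.expand_with_neutral(
    from_zip_filepath="data/collection.zip",
    cache_dir="_cache",
    log_filepath="out/neut.log",
    used_locations_filepath="out/used-locations.txt",
    neut_opin_stat_filepath="out/neut-opin-stat.txt")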
NewsSourceDirArg.add_argument(parser)
SourceNewsReaderArg.add_argument(parser)
NewsStartFromIndexArg.add_argument(parser)
ParseFramesInSentencesArgs.add_argument(parser)
RuSentiFramesCacheArgs.add_argument(parser)

# Parsing arguments.
args = parser.parse_args()

# Reading arguments.
src_dir = NewsSourceDirArg.read_argument(args)
reader = SourceNewsReaderArg.read_argument(args)
frames_filepath = RuSentiFramesCacheArgs.read_argument(args)
parse_frames_in_sents = ParseFramesInSentencesArgs.read_argument(args)
start_from_index = NewsStartFromIndexArg.read_argument(args)

stemmer = Default.create_default_stemmer()
frames = Default.create_default_frames_collection(frames_filepath)
f_var = Default.create_default_frame_variants_collection(frames=frames,
                                                         stemmer=stemmer)

run_frames_cache(reader=reader,
                 src_dir=src_dir,
                 version=basename(frames_filepath),
                 frames=frames,
                 frames_helper=FramesHelper(f_var),
                 stemmer=stemmer,
                 parse_frames_in_sentences=parse_frames_in_sents,
                 start_from_index=start_from_index,
                 miniter_count=2000000)
ner_type = NerTypeArg.read_argument(args)
ner_cache_filepath = NerCacheFilepathArg.read_argument(args)
output_dir = OutputDirArg.read_argument(args)
source_dir = NewsSourceDirArg.read_argument(args)
reader = SourceNewsReaderArg.read_argument(args)

# Initializing the NER cache.
ner_cache = SQLiteNERCacheData.init_as_read_only(ner_cache_filepath)

# Exporting results.
news_processed = 0
added_words = set()
f_name = "{}.txt".format(ner_type)

# Initializing the object values extractor.
ner_class_type = Default.get_class_by_ner_name(ner_type)
text_object_authorizer = TextObjectAuthorizer(ner_type=ner_class_type)
obj_values_extractor = TextObjectValuesExtractor(
    ner_cache=ner_cache,
    stemmer=Default.create_default_stemmer(),
    default_auth_check=lambda text_obj: text_object_authorizer.is_auth(text_obj))

create_dir(output_dir)
print("Output dir: {}".format(output_dir))

with ner_cache:
    with open(join(output_dir, f_name), "w") as f:
        for _, news_info in reader.get_news_iter(source_dir):
            assert(isinstance(news_info, NewsInfo))
                    nargs='?',
                    help='Source directory')

# Added parameters.
SynonymsCollectionFilepathArg.add_argument(parser)

# Parsing arguments.
args = parser.parse_args()

# Reading parameters.
opinion_filepath = args.opinion_filepath
source_filepath = args.source_filepath
synonyms_filepath = SynonymsCollectionFilepathArg.read_argument(args)
opinion_filename = basename(opinion_filepath)

stemmer = Default.create_default_stemmer()
synonyms = Default.create_default_synonyms_collection(
    filepath=synonyms_filepath,
    stemmer=stemmer)

with open(opinion_filepath, 'r') as f:
    opinions = read_opinions(
        filepath=opinion_filepath,
        synonyms=synonyms,
        custom_opin_ends_iter=lambda use_sentiment:
            OpinionStatisticBasePrinter.iter_opinion_end_values(
                f=f, read_sentiment=use_sentiment),
        read_sentiment=False)

file_ids_it = iter_relevant_file_ids(source_filepath=source_filepath,
                                     opinions=opinions)
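# Note on the lambda above: `custom_opin_ends_iter` closes over the open file
# handle `f`, so `read_opinions` must consume the resulting iterator before
# the enclosing `with` block exits; afterwards the handle is closed.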
parser.add_argument('--obj-values-dir',
                    dest='obj_values_dir',
                    type=str,
                    nargs=1,
                    help='Source dir')

# Parse arguments.
OptionalOutputDirArg.add_argument(parser)

# Reading arguments.
args = parser.parse_args()
source_dir = args.obj_values_dir[0]
output_dir = OptionalOutputDirArg.read_argument(args)

# Initializing the necessary instances for words grouping.
stemmer = Default.create_default_stemmer()
ruthes_nouns = RussianThesaurusSynsets.from_xml_file(filepath=args.ruthes_filepath[0])

log_found_in_ruthes = 0
log_lemmas_kept = 0
syn_groups = {}

# Processing all the files in the subdir.
f_names_it = get_all_subfiles(data_folder=source_dir,
                              f_name_check_rule=lambda _: True)

for filename in f_names_it:
    print(filename)
    for obj_value, obj_type in iter_words_with_types_from_filepath(filename):
        if obj_value in ruthes_nouns:
            log_found_in_ruthes += 1