def language_detection(self):
     """Detect the language of self.phrase."""
     self.nlp.add_pipe(LanguageDetector(),
                       name='language_detector',
                       last=True)
     doc = self.nlp(self.phrase)
     return doc._.language
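A note on API versions: every snippet in this collection registers the detector with nlp.add_pipe(LanguageDetector(), name='language_detector', last=True), which is the spaCy 2.x calling convention. On spaCy 3.x, add_pipe only accepts the string name of a registered factory, so a small wrapper is needed. A minimal sketch, assuming spaCy 3.x and spacy_langdetect are installed and that LanguageDetector still registers the ._.language extension itself:

import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector


# spaCy 3.x requires pipeline components to be registered as named factories
@Language.factory("language_detector")
def create_language_detector(nlp, name):
    return LanguageDetector()


nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("language_detector", last=True)

doc = nlp("Ceci est un texte en français.")
print(doc._.language)  # e.g. {'language': 'fr', 'score': 0.99}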
Example #2
def create_spacy_nlp_object(parameters: Dict[str, Any]) -> Language:
    nlp = spacy.load(parameters['spacy_lang'])
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # Modify tokenizer
    suffixes = list(nlp.Defaults.suffixes)
    # split a trailing dot off as a suffix
    suffixes.append(r"\.")
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    nlp.tokenizer.suffix_search = suffix_regex.search

    # modify tokenizer infix patterns
    infixes = (
        LIST_ELLIPSES + LIST_ICONS + [
            # EDIT: Removed hyphen \- : r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[0-9])[+\*^](?=[0-9-])",
            r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
            r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
        ])
    infix_re = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infix_re.finditer

    return nlp
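For context, the snippet above relies on `from typing import Any, Dict` and the spaCy character classes (`from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS, LIST_ELLIPSES, LIST_ICONS`). A hedged usage sketch follows; the 'spacy_lang' value and the sample sentence are illustrative assumptions, chosen to show the intent of the removed hyphen infix rule (digit ranges such as 3-4 are more likely to stay together):

# Usage sketch; parameter values are assumptions, not part of the snippet above.
nlp = create_spacy_nlp_object({'spacy_lang': 'en_core_web_sm'})

doc = nlp("Scores improved by 3-4 points overall.")
print([token.text for token in doc])   # '3-4' should survive as a single token
print(doc._.language)                  # document-level language guess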
Example #3
def lang_distribution():
    print("loading dataset..")
    with open('bgg_download/data/boardgames-data/bgg-data-cleaned.json',
              'r',
              encoding="utf-8") as f:
        data = json.load(f)
    print("dataset loaded")

    print("loading spacy en model...")
    nlp = spacy.load("en_core_web_lg")
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    print("model loaded")

    game_lang_dict = {}
    for item in data["items"]:
        for com in item["comments"]:
            comment = nlp(mt.pre_processing(com["value"].lower()))
            comment_lang, comment_lang_score = comment._.language.values()
            game_lang_dict[comment_lang] = game_lang_dict.get(comment_lang, 0) + 1

    with open("bgg_result/games_lang_distr.json", 'w') as out:
        json.dump(game_lang_dict, out)
def check_language(check_text):
    # load language model
    lang = spacy.load("en")
    lang.add_pipe(LanguageDetector(), name="language_detector", last=True)
    doc_check = lang(check_text)
    if doc_check._.language["language"] == "en":
        return True
    return False
def spacy_language_detection(row):
    """
     Uses the spaCy NLP library together with the "langdetect" and "spacy-langdetect" libraries
     to determine the language of the Tweet.
    :param row: example in the dataset we are operating on.
    :return: the detected language code, also stored in the new "spaCy_language_detect" column.
    """
    global non_english_count_global

    nlp = spacy.load("en")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    document = nlp(row["tweet_full_text"])
    # document level language detection. Think of it like average language of document!
    text_language = document._.language
    row["spaCy_language_detect"] = str(text_language["language"])
    print("spaCy language designation:")
    print(str(text_language["language"]))

    if not str(text_language["language"]).startswith('en'):
        non_english_count_global += 1
        log.warning(f"\t\t\tnon-English tweet (will be dropped): "
                    f"\n\t\t\t\tid: {row['tweet_id']}"
                    f"\n\t\t\t\ttweet: {row['text_derived']}"
                    f"\n\t\t\t\tLanguage tags: {row['spaCy_language_detect']}")
    return row["spaCy_language_detect"]
Example #6
 def __init__(self, input_csv, col_names=None, min_review=100,
              remove_non_english=False):
     review_df = pd.read_csv(input_csv)
     # Drop Duplicate Rows
     # Unfortunately, the Indeed web scraper scrapes the same 'top' review
     # for every page, leading to duplicates, must drop identical rows
     # until this is fixed
     review_df = review_df.drop_duplicates()
     # We have standard column names, if your csv does not match these
     # standard names, you must supply a dictionary that translates
     if col_names is not None:
         if len(review_df.columns) < len(col_names):
             raise Exception('The number of column names supplied cannot'
                             ' exceed the number of columns in dataframe')
         else:
             review_df.rename(columns=col_names, inplace=True)
     # Remove companies with less than 'min_reviews' threshold
     company_counts = review_df['Company'].value_counts()
     company_thres = company_counts[company_counts >= min_review].index
     temp_boolean = review_df['Company'].isin(company_thres)
     self.reviews = review_df.loc[temp_boolean, :]
     # If the user specifies, detect non-English reviews and remove them
     # Way too slow right now
     if remove_non_english:
         print('Removing non-English reviews')
         # Using spacy and 'langdetect' package from spacy
         import spacy
         from spacy_langdetect import LanguageDetector
         # Disable extraneous components of spacy pipeline to speed up
         nlp = spacy.load('en_core_web_md', disable=['ner', 'tagger', 'textcat'])
         # Add language detector
         nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
         temp_boolean = self.reviews['Review'].apply(self.language_detect, nlp_pipe=nlp)
         # Remove non-English reviews
         self.reviews = self.reviews.loc[temp_boolean, :]
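The "way too slow" note above stems from pushing one review at a time through the full pipeline via apply. A hedged sketch of a batched alternative using nlp.pipe follows; the helper name and batch size are assumptions, not the class's actual language_detect method:

def detect_english_batch(texts, nlp_pipe, batch_size=256):
    """Return a list of booleans, True where the detected language is English."""
    flags = []
    # nlp.pipe streams documents through the pipeline in batches
    for doc in nlp_pipe.pipe(texts, batch_size=batch_size):
        flags.append(doc._.language.get('language') == 'en')
    return flags

# Hypothetical replacement for the .apply(...) call above:
# mask = pd.Series(detect_english_batch(self.reviews['Review'].tolist(), nlp),
#                  index=self.reviews.index)
# self.reviews = self.reviews.loc[mask, :]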
Example #7
def detect_language(uuid):
    languages_detected = []
    # * ---------- PATH --------- *
    # Path of this file
    path_file = os.path.dirname(os.path.realpath(__file__))
    # Path to WALK txt
    path_txt_dir_walk = f'{path_file}/{uuid}/txt/'

    # Load spaCy once and add language detection to the pipeline
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(),
                 name="language_detector",
                 last=True)

    # Detect the language of every txt file and collect the results in languages_detected
    for r, d, f in os.walk(path_txt_dir_walk):
        for txt in f:
            if txt.endswith('.txt'):
                # Read the txt file to get the text
                with codecs.open(path_txt_dir_walk + txt, 'r',
                                 'utf-8') as file:
                    txt_content = file.read()
                # Apply NLP on the text
                txt_nlp = nlp(txt_content)
                # Target the language part of the processed doc
                txt_lang = txt_nlp._.language
                # Add the detected language to the languages_detected list
                languages_detected.append(txt_lang['language'])

    is_language_french = 'fr' in languages_detected

    return is_language_french
Example #8
def prune_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create simple features, such as the number of words and character length, in order to
    prune the dataset further
    """
    # create number of words column
    df['num_words'] = df['DESCRIPTION'].apply(lambda x: len(x.split()))

    # from exploratory data analysis, sentences form with 4 or more words,
    # so drop entries with three or fewer words
    df = df[df['num_words'] > 3]

    # load in spacy & language detector
    nlp = spacy.load('en')
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # simple function to return detected language
    def get_lang(desc):
        doc = nlp(desc)
        return doc._.language['language']

    df['lang'] = df['DESCRIPTION'].apply(get_lang)

    # Hindi and Indonesian languages are the most common; remove these for better NLP processing
    df = df[~df['lang'].isin(['hi', 'id'])]

    return df.reset_index(drop=True)
def language_detector():
    """
    Sort .txt files by document-level language, moving them into subfolders, implemented using spaCy.
    A subfolder named after the language is created for every language not seen before.
    Supports multiple languages.
    """
    curr_dir = os.getcwd()
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    for filename in os.listdir(curr_dir):
        if filename.endswith(".txt"):
            with open(filename, "r", encoding="utf8") as f:
                text = f.read()
            doc = nlp(text)
            lang = doc._.language['language']
            if not os.path.exists(os.path.join(curr_dir, lang)):
                os.makedirs(os.path.join(curr_dir, lang))
            print(f"Moving {filename} to {lang}")
            os.replace(os.path.join(curr_dir, filename),
                       os.path.join(curr_dir, lang, filename))
        else:
            continue
Example #10
def preprocessing():

    # Load the German and the French language models
    nlp_de = spacy.load("de_core_news_md")
    nlp_fr = spacy.load("fr_core_news_sm")

    # Add the language detector
    nlp_de.add_pipe(LanguageDetector(), name="language_detector", last=True)

    # Open the text to be analysed
    with open("../data_in/rieger.txt", encoding="utf-8") as file:
        text = file.read()
    # the German language model is applied to the text
    # --> will be changed shortly: first language detection, then the de or fr language model

    doc = nlp_de(text)

    # Determine the language at document level
    print(doc._.language)

    dict_fr = {}
    dict_de = {}
    dict_all = {}
    dict_sonstiges = {}

    for sent in doc.sents:
        dict_all[sent] = sent._.language

    print(dict_all)

    print(type(doc.sents))

    print(dict_all)

    for i in dict_all:
        if "fr" in dict_all[i]['language']:
            dict_fr[i] = dict_all[i]
        elif "de" in dict_all[i]['language']:
            dict_de[i] = dict_all[i]
        else:
            dict_sonstiges[i] = dict_all[i]
    # print(dict_fr)
    # print(dict_de)
    # print(dict_sonstiges)

    ##############################
    text_fr = []
    file = open("../data_out/spacy_lfr.txt", "w", encoding="utf-8")

    for j, k in dict_fr.items():
        print("Text:", j)
        file.write(str(j))
        for m in k:
            print(m + ":", k[m])

    file2 = open("../data_out/spacy_lde.txt", "w", encoding="utf-8")

    for n, o in dict_de.items():
        print("Text:", n)
        file2.write(str(n))
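The comment in the snippet above notes that the German model is applied unconditionally and that the intended flow is language detection first, then the de or fr model. A minimal sketch of that routing, assuming nlp_de carries the language_detector pipe as in the snippet; the function name is an assumption:

def route_by_language(text, nlp_de, nlp_fr):
    doc = nlp_de(text)                    # nlp_de carries the language_detector pipe
    lang = doc._.language['language']
    if lang == 'fr':
        return nlp_fr(text), lang         # reparse with the French model
    return doc, lang                      # otherwise keep the German parse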
def test_custom_language_detector():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(language_detection_function=lambda spacy_object: "from custom function"), name="language_detector", last=True)
    text = "This is a test"
    doc = nlp(text)
    assert doc._.language == "from custom function"
    for i, sent in enumerate(doc.sents):
        assert sent._.language == "from custom function"
def test_tokens():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    text = "English Hello"
    doc = nlp(text)
    languages = []
    for i, token in enumerate(doc):
        languages.append(token._.language["language"])
    assert len(languages) == 2
def test_language_detector():
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    text = "This is English text. Er lebt mit seinen Eltern und seiner Schwester in Berlin. Yo me divierto todos los días en el parque. Je m'appelle Angélica Summer, \
     j'ai 12 ans et je suis canadienne."
    doc = nlp(text)
    doc._.language["language"]
    for i, sent in enumerate(doc.sents):
        sent._.language["language"]
Example #14
 def __init__(self, corpus):
     self.corpus = [{'id': _id, 'doc': doc} for _id, doc in corpus]
     self.nlp = spacy.load('en_core_web_sm')
     self.nlp.add_pipe(LanguageDetector(),
                       name='language_detector',
                       last=True)
     nltk.download('wordnet')
     self.lemmatizer = WordNetLemmatizer()
     self.table = str.maketrans("", "", string.punctuation)
Example #15
    def predict(self, context, model_input):
        if self.nlp is None:
            self.nlp = spacy.load('en_core_web_sm')
            self.nlp.add_pipe(LanguageDetector(),
                              name='language_detector',
                              last=True)

        return model_input[model_input.columns[0]].apply(
            lambda x: self.nlp(x)._.language)
Example #16
    def what_language(df):

        nlp = spacy.load('en')
        nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

        df['language'] = df['article'].apply(
            lambda x: nlp(x)._.language['language'])

        # we can really only analyze English articles
        df = df[df['language'] == 'en']
        return df
Example #17
def determine_language(corpus, spc_obj):
    """Determines the language of the first five lines of the corpus."""
    spc_obj.add_pipe(LanguageDetector(), name='language_detector', last=True)
    doc = ''
    for line in corpus[:5]:
        doc += (line + ' ')
    lang = spc_obj(doc)._.language['language']
    if lang != 'es':
        spc_obj = spacy.load(supported_languages[lang])
    return (spc_obj, lang)
Example #18
def detect_language(texte_string):
    # Load english in spacy
    nlp = spacy.load("en_core_web_sm")
    # Add the language detection
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    # Apply NLP on the string
    string_with_nlp = nlp(texte_string)
    # Target the language NLP feature
    language_detected = string_with_nlp._.language
    return language_detected['language']
 def __init__(self, 
              model_type='',
              stopwords_file=''
             ):
     
     self._nlp=spacy.load(model_type)
     self._nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
     
     # Represented as a dictionary so that lookups are faster.
     self._stopwords=self._load_stopwords(file=stopwords_file)
def main():
    ######## defining the parameters ##############
    supported_languages = ["English", "German",
                           "Spanish", "Portuguese", "French", "Italian"]
    # this is required, otherwise we get weird languages for long and untidy documents
    
    default_language = "English"
    # making English the default, which is used when no language is detected

    useful_characters = string.printable + \
        'äöüÄÖÜéÉèÈáÁàÀóÓòÒúÚùÙíÍìÌñÑãÃõÕêÊâÂîÎôÔûÛ'  # filtering the characters of the texts

    parsable_extensions = ['.csv', '.doc', '.docx', '.eml', '.epub', '.json',
                           '.msg', '.odt', '.ogg', '.pdf', '.pptx', '.rtf', '.xlsx', '.xls']
    """ '.gif', '.jpg', '.mp3', '.tiff', '.wav', '.ps', '.html' """
    # the extensions which we try to parse to text

    doc_maxlength = 2000000  # the default would be 1m, which is the maximum document length in spaCy

    minlength_of_text = 100  # if the text is shorter than this, we ignore it

    POS_blacklist = ["PUNCT", "PART", "SYM", "SPACE",
                     "DET", "CONJ", "CCONJ", "ADP", "INTJ", "X", ""]  # we filter out these token types

    parsers = [titlecaps, token_replacement, url_replacement]  # the parsing functions used

    path = get_path(parsable_extensions)  # determining the directory from which to import documents

    ######## initiating the pipelines ##############
    multilanguage, nlp = decide_language_detection(
        path, supported_languages, default_language)
    # lets the user decide between sentence-wise and document-wise
    # language detection. The sentence-wise option allows us
    # to ignore parts of docs that contain text not of interest, such
    # as metadata in English for a German document

    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    # add the language detector to the spaCy nlp pipeline

    pdf_to_text(path, parsable_extensions)
    # save all non-text documents with parsable extensions to txt files

    doc_list = documents_dataframe(path, minlength_of_text, doc_maxlength,
                                   nlp, multilanguage, default_language, supported_languages, parsers, useful_characters)
    # create a document list with detected language, filename, textname and text

    df_doclist = get_all_text_info(
        doc_list, supported_languages, POS_blacklist, doc_maxlength)
    # use the document list to retrieve various basic information from the texts

    print(df_doclist.shape)
    df_doclist.to_pickle(path + "/df_doclist.pkl")  # saving the data frame to path
    df_doclist = pd.read_pickle("./df_doclist.pkl")  # and opening it again
Example #21
def check_language(input_text):
    """
    Check the language of an input text
    :param input_text:
    :return: the name of the language
    """
    # Add the detector only once; adding a pipe with an existing name raises an error
    if 'language_detector' not in NLP_dect.pipe_names:
        NLP_dect.add_pipe(LanguageDetector(), name='language_detector', last=True)
    language = NLP_dect(input_text)._.language
    print(f"Text language: {language}")

    return language['language']
Example #22
 def languageDistribution(df):
     nlp = spacy.load("en")
     nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
     df['language'] = ''
     language = []
     for index, row in df.iterrows():
         text = row[str(commentTextColumn)]
         doc = nlp(text)
         language.append(str(doc._.language['language']))
     df['language'] = language
     return df
Example #23
def spacy_classifier(texts, lowercase=True, langs=['en', 'fr']):

    nlp = spacy.load(corpus_name)
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    if lowercase:
        detected_langs = [langs[0] if nlp(str(text).lower())._.language['language'] == langs[0] else langs[1]
                          for text in texts]
    else:
        detected_langs = [langs[0] if nlp(str(text))._.language['language'] == langs[0] else langs[1]
                          for text in texts]

    return detected_langs
Example #24
def titles_cleanup(img_entity_pickle, out_pickle=None):
    with open(img_entity_pickle, 'rb') as pf:
        imgs_web_entity = pickle.load(pf)
    title_map = imgs_web_entity['title_map']

    snlp = spacy.load("en_core_web_lg")
    snlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

    all_titles = []
    all_title_idx = []
    all_split_idx = []
    for id, imgs_titles in title_map.items():
        for n, img_titles in imgs_titles.items():
            for t in img_titles:
                all_titles.append(t.lower())
                all_title_idx.append(id)
                all_split_idx.append(n)

    assert len(all_titles) == len(
        all_title_idx), f"{len(all_titles)} != {len(all_title_idx)}"

    pipe = snlp.pipe(all_titles)
    clean_title_map = defaultdict(lambda: defaultdict(list))
    all_clean_title = []
    drop_by_lanu = 0

    for i, doc in enumerate(pipe):
        if doc._.language['language'] != 'en':
            drop_by_lanu += 1
            continue

        id = all_title_idx[i]
        senten = ''
        for token in doc:
            if token.pos_ != 'NUM' and token.pos_ != 'X':
                senten += token.text.lower() + ' '
        for b in noun_chunk_blist:
            senten = re.sub(b, '', senten)
        for b in entity_black_list:
            senten = re.sub(b, '', senten)

        n_split = all_split_idx[i]
        # if len(clean_title_map[id]) < n_split + 1:
        #     clean_title_map[id] += [[] for _ in range(n_split + 1 - len(clean_title_map[id]))]
        clean_title_map[id][n_split].append(senten)
        all_clean_title.append(senten)

    imgs_web_entity['clean_title_map'] = dict(clean_title_map)
    if out_pickle is None:
        out_pickle = img_entity_pickle
    with open(out_pickle, mode='wb') as pf:
        pickle.dump(imgs_web_entity, pf)
Example #25
 def __init__(self, lang: Languages):
     self.lang = lang
     self.nlp = self.load_spacy_model()
     # Add the language detector. It'll be turned off for normal tokenizing
     self.nlp.add_pipe(LanguageDetector(),
                       name='language_detector',
                       last=True)
     # Add all the custom exceptions
     exceptions = tokenizer_exceptions.get(self.lang, {})
     for term, exception in exceptions.items():
         self.nlp.tokenizer.add_special_case(term, exception)
     # Add custom function
     self.fn = custom_functions.get(self.lang, fn)
Example #26
    def __post_init__(self):
        self.nlp = spacy.load(self.language_model)
        self.nlp.add_pipe(LanguageDetector(),
                          name='language_detector',
                          last=True)

        # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)

        # Our linker will look up named entities/concepts in the UMLS graph and normalize the data
        # for us.
        self.linker = UmlsEntityLinker(resolve_abbreviations=True)
        self.nlp.add_pipe(self.linker)
def what_language(row):
    """
     Uses the spaCy NLP library together with the "langdetect" and "spacy-langdetect" libraries
     to determine the language of the Tweet.
    :param row: example in the dataset we are operating on.
    :return: the detected language code, also stored in the new "spaCy_language_detect" column.
    """
    nlp = spacy.load("en")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    document = nlp(row["text_derived"])
    # document level language detection. Think of it like average language of document!
    text_language = document._.language
    row["spaCy_language_detect"] = str(text_language["language"])
    return row["spaCy_language_detect"]
Example #28
    def __init__(self,
                 tokenizer_type: str = 'bert-base-uncased',
                 do_lower_case: bool = True):
        self.nlp = spacy.load('en')
        self.nlp.add_pipe(LanguageDetector(),
                          name='language_detector',
                          last=True)
        self.toke = BertTokenizer.from_pretrained(tokenizer_type,
                                                  do_lower_case=do_lower_case)

        # For splitting by \n\n followed by
        # [... , (...) , {... , int... , ...: , or (R/r)epeat...
        self.header_seed = r'(\n\n(\[.*|\(.*\)|\{.*|[0-9].*|.*[:|: ]\n|.*(R|r)epeat.*))'
        # For cleaning up any missed characters
        self.clean_seed = r'\([^)].*\)|\[.*?\]|\(|\)|\[|\]|:'
def _index_warc(filename, index, counter):
    """
    Index individual WARC file.

    :param filename: WARC file name
    :param index: Elasticsearch index
    :param counter: Spark counter
    """
    try:
        nlp = spacy.load('en_core_web_sm')
        nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        helpers.bulk(util.get_es_client(),
                     _generate_docs(index, filename, nlp, counter))
    except Exception as e:
        logger.error(e)
    def __init__(self, data_dir: str):
        '''Initializes a CORD-19 data preprocessing class
        
        Args:
            data_dir: Raw data directory
        '''
        self.data_dir = data_dir

        # Initialize NLP model
        self.nlp = en_core_sci_lg.load(disable=["tagger", "ner"])
        self.nlp.max_length = 2000000
        self.nlp.add_pipe(LanguageDetector(),
                          name='language_detector',
                          last=True)
        self.nlp_words_to_check = 100
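The nlp_words_to_check attribute suggests that language detection is meant to run on only the first 100 words of each document, which keeps the detector fast on long papers. A hedged sketch of such a helper follows; the method name and truncation logic are assumptions, not the class's actual code:

    def detect_language(self, text: str) -> str:
        '''Detect the document language from the first nlp_words_to_check words only (hypothetical helper).'''
        snippet = " ".join(text.split()[:self.nlp_words_to_check])
        return self.nlp(snippet)._.language['language']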