Example #1
 def find_usage_examples_from_summary(
     self,
     form: Form = None,
 ) -> List[UsageExample]:
     """This tries to find and clean sentences and return the shortest one"""
     if form is None:
         raise ValueError("form was None")
     logger = logging.getLogger(__name__)
     # find sentences
     # order in a list by length
     # pick the shortest one where the form representation appears
     logger.debug("Splitting the sentences using spaCy")
     nlp = Swedish()
     nlp.add_pipe('sentencizer')
     doc = nlp(self.text)
     sentences = set()
     raw_sentences = list(doc.sents)
     logger.info(f"Got {len(raw_sentences)} sentences from spaCy")
     for sentence in raw_sentences:
         #logger.info(sentence.text)
         # This is a very crude test for relevancy, we lower first to improve matching
         cleaned_sentence = sentence.text.lower()
         punctations = [".", ",", "!", "?", "„", "“", "\n"]
         for punctation in punctations:
             if punctation in cleaned_sentence:
                 cleaned_sentence = cleaned_sentence.replace(
                     punctation, " ")
         logger.debug(f"cleaned sentence:{cleaned_sentence}")
         if f" {form.representation.lower()} " in f" {cleaned_sentence} ":
             # Add to the set first to avoid duplicates
             sentences.add(sentence.text.replace("\n", "").strip())
     logger.info(
         f"Found {len(sentences)} sentences which contained {form.representation}"
     )
     examples = []
     count_discarded = 0
     for sentence in sentences:
         sentence_length = len(sentence.split(" "))
         if (sentence_length > config.min_word_count
                 and sentence_length < config.max_word_count):
             examples.append(UsageExample(sentence=sentence, record=self))
         else:
             count_discarded += 1
     if count_discarded > 0:
         logger.info(
             f"{count_discarded} sentences were discarded based on length")
     #print("debug exit")
     #exit(0)
     return examples
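The core spaCy step here is sentence splitting with a blank Swedish pipeline plus the rule-based sentencizer. A minimal standalone sketch of just that step (the sample text is illustrative):

from spacy.lang.sv import Swedish

nlp = Swedish()              # blank pipeline: tokenizer only, no trained components
nlp.add_pipe('sentencizer')  # rule-based sentence boundary detection

text = "Jag heter Nils. Jag bor i Lund."  # illustrative input
doc = nlp(text)
for sentence in doc.sents:
    print(sentence.text)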
Example #2
from spacy.lang.sv import Swedish


def input_fn(file_name):
    """Read a CoNLL-U style file into (text, {'heads': ..., 'deps': ...}) training tuples."""
    train_data = []
    nlp = Swedish()
    with open(file_name, 'r', encoding='utf-8') as f:
        heads = []
        deps = []
        annotations = []
        texts = []
        for line in f:
            if line.startswith("# text = "):
                # The raw sentence follows the "# text = " prefix
                texts.append(line[len("# text = "):].rstrip('\n'))
            elif line.startswith("#"):
                # Skip all other comment lines
                continue
            elif line.strip() != "":
                # Token line: tab-separated CoNLL-U columns
                fields = line.strip().split('\t')
                if fields[6] == "_":
                    # No head annotation (e.g. a multiword token line); skip it
                    continue
                heads.append(int(fields[6]))
                deps.append(fields[7])
            else:
                # A blank line ends the current sentence
                if heads:
                    annotations.append([heads, deps])
                heads = []
                deps = []
        if heads:
            # Flush the last sentence if the file does not end with a blank line
            annotations.append([heads, deps])

        for i in range(len(annotations)):
            # Pair each sentence text with its head indices and dependency labels
            text = texts[i]
            heads, deps = annotations[i][0], annotations[i][1]
            spacy_doc = nlp(text)
            train_format = (str(spacy_doc), {'heads': heads, 'deps': deps})
            train_data.append(train_format)
    return train_data
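input_fn expects CoNLL-U style input, where a "# text =" comment carries the raw sentence and columns 7 and 8 of each token line hold the head index and dependency relation. A sketch of a hypothetical fragment and the training tuple it yields (the file name and the parse are illustrative):

# Hypothetical CoNLL-U fragment (only the columns used here are shown;
# column 7 is the head index, column 8 the dependency relation):
#
#   # text = jag bor i Lund
#   1  jag   ...  2  nsubj
#   2  bor   ...  0  root
#   3  i     ...  4  case
#   4  Lund  ...  2  obl
#   (a blank line ends the sentence)
#
# For that sentence input_fn yields one training tuple:
#   ("jag bor i Lund", {"heads": [2, 0, 4, 2], "deps": ["nsubj", "root", "case", "obl"]})
train_data = input_fn("sv_talbanken-ud-train.conllu")  # illustrative file name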
Example #3
import logging

import numpy as np
import spacy
import stanza
from spacy.lang.sv import Swedish
from spacy_stanza import StanzaLanguage  # spacy-stanza < 1.0 API

logger = logging.getLogger(__name__)


def create_model(vectors_loc=None,
                 lang=None,
                 stz=True,
                 vectors_name='fasttext',
                 max_items=-1):
    # Pick a pipeline: a blank Swedish pipeline (the default), a blank spaCy
    # pipeline for another language, or a Stanza-backed pipeline when stz is True
    if lang is None or (lang == 'sv' and not stz):
        nlp = Swedish()
    elif not stz:
        nlp = spacy.blank(lang)
    else:
        stanza.download(lang)
        snlp = stanza.Pipeline(lang=lang)
        nlp = StanzaLanguage(snlp)

    with open(vectors_loc, 'rb') as file_:
        logger.info("Reading file '{}'".format(vectors_loc))
        header = file_.readline()
        nr_row, nr_dim = header.split()  # the first line is number of tokens and dimensions
        counter = 0
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            if counter % 100 == 0:
                logger.info(counter)
            if counter == max_items:
                break
            counter = counter + 1
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
        nlp.vocab.vectors.name = vectors_name  # give vectors a name
    return nlp
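A hypothetical call to create_model, assuming a fastText .vec file is available locally; the file name and output directory are illustrative:

# Sketch only: load fastText vectors into a blank Swedish pipeline and persist it
nlp = create_model(vectors_loc='cc.sv.300.vec',  # fastText vectors in text format
                   lang='sv',
                   stz=False,        # use the blank Swedish pipeline, not Stanza
                   max_items=50000)  # cap the number of vectors read
nlp.to_disk('sv_fasttext_model')     # reload later with spacy.load('sv_fasttext_model')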
Example #4
from spacy.lang.sv import Swedish


def lemmatization_sv(text):
    """
    Returns the words of the given string in base form, joined into a single string.
    """
    nlp = Swedish()
    tokenized = nlp(text)
    lemmatized = [token.lemma_ for token in tokenized]
    lemmatized_words_string = ' '.join(lemmatized)
    return lemmatized_words_string
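Swedish() is a blank pipeline with no lemmatizer component, so in spaCy 3.x token.lemma_ will generally come back empty here. A minimal sketch of adding a lookup lemmatizer, assuming spaCy 3.x with the spacy-lookups-data package installed:

from spacy.lang.sv import Swedish

nlp = Swedish()
# Table-based lemmatization; the lookup tables come from spacy-lookups-data
nlp.add_pipe('lemmatizer', config={'mode': 'lookup'})
nlp.initialize()  # loads the Swedish lookup tables

doc = nlp('Hundarna springer i parkerna')
print([token.lemma_ for token in doc])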
Example #5
File: test_sv.py  Project: cs394-s20/Aqua
from spacy.lang.sv import Swedish


def sv_nlp():
    return Swedish()
Example #6
data_words_nostops = remove_stopwords(cleaned_list)

wrdCloud = ''
#strr=[]
for row in data_words_nostops:
    #    strr.append(",".join(row))
    wrdCloud += " ".join(row)
texts = ""
for row in data_words_nostops:
    texts += ','
    texts += ','.join(row)

# spacy for lemmatization
from spacy.lang.sv import Swedish
nlp = Swedish()
doc_lemmatized = nlp(texts)
##print('Tags', [(t.text) for t in doc_lemmatized])
ftext = []
for wrd in doc_lemmatized:
    # keep every token except the comma separators added above
    if wrd.text != ',':
        ftext.append(wrd.text)
#    words = " ".join(re.findall("[a-z\såäö]+", line[1]))
#    topic_words.append(words.split())

#RQ2 implementation start here
retweeted = []
for row in tweet_list:
    if row.startswith('RT'):
        retweeted.append(row)
Example #7
# ### How about other languages?

# In[32]:


doc = nlp("jag heter nils")
displacy.render(doc, jupyter=True)


# ### test swedish

# In[33]:


from spacy.lang.sv import Swedish
nlp = Swedish()  # use directly
#nlp = spacy.blank("sv")  # blank instance


# ### basic tokenization enabled
# - but not syntactic parsing

# In[34]:


doc = nlp("jag heter nils")
for token in doc:
    print(token.text, token.pos_, token.dep_)
#displacy.render(doc, jupyter=True)
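The blank Swedish() pipeline above only tokenizes, so pos_ and dep_ come back empty. A sketch of getting tags and a dependency parse with a trained pipeline, assuming a Swedish model such as sv_core_news_sm is available for the installed spaCy version:

# Sketch only: requires 'python -m spacy download sv_core_news_sm' first
import spacy

nlp = spacy.load("sv_core_news_sm")
doc = nlp("jag heter nils")
for token in doc:
    print(token.text, token.pos_, token.dep_)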

Example #8
 def find_usage_examples_from_summary(
     self,
     form: Form = None,
 ) -> List[UsageExample]:
     """This tries to find and clean sentences and return the shortest one"""
     if form is None:
         raise ValueError("form was None")
     logger = logging.getLogger(__name__)
     # find sentences
     # order in a list by length
     # pick the shortest one where the form representation appears
     if self.language_code == WikimediaLanguageCode.ENGLISH:
         logger.info("using the English spaCy pipeline")
         nlp = English()
         nlp.add_pipe('sentencizer')
     elif self.language_code == WikimediaLanguageCode.SWEDISH:
         nlp = Swedish()
         nlp.add_pipe('sentencizer')
     elif (self.language_code == WikimediaLanguageCode.FRENCH
           or self.language_code == WikimediaLanguageCode.GERMAN
           or self.language_code == WikimediaLanguageCode.BOKMÅL
           or self.language_code == WikimediaLanguageCode.DANISH):
         logger.info(
             f"using the {self.language_code.name.title()} spaCy pipeline")
         try:
             nlp = spacy.load(f'{self.language_code.value}_core_news_sm')
         except OSError:
             raise ModuleNotFoundError(
                 f"Please install the spacy model for "
                 f"{self.language_code.name.title()} by running: "
                 f"'python -m spacy download "
                 f"{self.language_code.value}_core_news_sm' "
                 f"in the terminal/cmd/powershell")
     else:
         raise NotImplementedError(
             f"Sentence extraction for {self.language_code.name} "
             f"is not supported yet, feel free to open an issue at "
             f"https://github.com/dpriskorn/LexUtils/issues")
     doc = nlp(self.text)
     sentences = set()
     for sentence in doc.sents:
         # logger.info(sentence.text)
         # This is a very crude test for relevancy, we lower first to improve matching
         cleaned_sentence = sentence.text.lower()
         punctations = [".", ",", "!", "?", "„", "“", "»"]
         for punctation in punctations:
             if punctation in cleaned_sentence:
                 cleaned_sentence = cleaned_sentence.replace(
                     punctation, " ")
         cleaned_sentence = cleaned_sentence.strip()
         logger.debug(f"cleaned sentence:{cleaned_sentence}")
         if f" {form.representation.lower()} " in cleaned_sentence:
             # Add to the set first to avoid duplicates
             sentences.add(sentence.text)
     examples = []
     for sentence in sentences:
         sentence_length = len(sentence.split(" "))
         if (sentence_length > config.min_word_count
                 and sentence_length < config.max_word_count):
             # Clean the sentence so it looks better
             punctations = ["„", "“", "»"]
             for punctation in punctations:
                 if punctation in sentence:
                     sentence = sentence.replace(punctation, " ")
             sentence = sentence.strip()
             examples.append(UsageExample(sentence=sentence, record=self))
     # print("debug exit")
     # exit(0)
     return examples
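The trained-pipeline branch above follows a common spaCy pattern: try spacy.load and, if the package is missing, tell the user which download command to run. A standalone sketch of that pattern (the helper name and the example model are illustrative):

import spacy


def load_or_explain(model_name: str):
    """Load a spaCy pipeline package or explain how to install it."""
    try:
        return spacy.load(model_name)
    except OSError as error:  # spacy.load raises OSError for missing packages
        raise ModuleNotFoundError(
            f"Please install the model by running: "
            f"'python -m spacy download {model_name}' "
            f"in the terminal/cmd/powershell") from error


# nlp = load_or_explain("fr_core_news_sm")  # e.g. the French pipeline used above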