Example #1
def get_character_ngram_distribution(text, n, lowercase=False, stopword=False):
    """
    Get character distribution of text, potentially lowercasing and stopwording first.
    N.B. This method does not include or count whitespace.

    :param text:
    :param lowercase:
    :param stopword:
    :return:
    """
    # Return structure
    character_ngram_distribution = collections.defaultdict(int)

    # Iterate through tokens
    for token in get_token_list(text, lowercase=lowercase, stopword=stopword):
        for char_seq in nltk.ngrams(token, n):
            character_ngram_distribution[char_seq] += 1

    return character_ngram_distribution
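A minimal usage sketch for the function above; it assumes the module imports collections, nltk and LexNLP's get_token_list (from lexnlp.nlp.en.tokens), which the snippet itself relies on.

import collections
import nltk
from lexnlp.nlp.en.tokens import get_token_list  # assumed source of the helper used above

# Count character trigrams over the non-whitespace tokens of a short text
dist = get_character_ngram_distribution("The quick brown fox", n=3, lowercase=True)
for ngram, count in sorted(dist.items(), key=lambda kv: -kv[1])[:5]:
    print(''.join(ngram), count)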
Example #2
def get_definitions_in_sentence(sentence: str,
                                return_sources=False,
                                decode_unicode=True) -> Generator:
    """
        Find possible definitions in natural language in a single sentence.
        :param decode_unicode:
        :param return_sources: returns a tuple with the extracted term and the source sentence
        :param sentence: an input sentence
        :return:
        """

    result = set()
    case1_terms = set()

    if decode_unicode:
        sentence = unidecode.unidecode(sentence)

    # case 1
    for item in TRIGGER_WORDS_PTN_RE.findall(sentence):
        result.update(EXTRACT_PTN_RE.findall(item))
        case1_terms.update(EXTRACT_PTN_RE.findall(item))

    # case 2
    result.update(PAREN_PTN_RE.findall(sentence))

    # case 3
    result.update(NOUN_PTN_RE.findall(sentence))

    # case 4
    result.update(COLON_PTN_RE.findall(sentence))

    # return result
    for term in result:
        if term not in case1_terms and len(
                get_token_list(term)) > MAX_TERM_TOKENS:
            continue
        if return_sources:
            yield (term, sentence)
        else:
            yield term
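A minimal usage sketch, assuming the module-level regexes referenced above (TRIGGER_WORDS_PTN_RE, EXTRACT_PTN_RE, PAREN_PTN_RE, NOUN_PTN_RE, COLON_PTN_RE), MAX_TERM_TOKENS and the unidecode and get_token_list helpers accompany the function, as in LexNLP's lexnlp.extract.en.definitions module.

sentence = '"Term" shall mean the period beginning on the Effective Date.'
for term, source in get_definitions_in_sentence(sentence, return_sources=True):
    print(term, '<-', source)  # each hit is yielded with its source sentence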
Example #3
    def parse(self,
              log: ProcessLogger,
              text,
              text_unit_id,
              _text_unit_lang,
              document_initial_load: bool = False,
              **kwargs) -> ParseResults:
        project_id = kwargs.get('project_id')
        term_stems = dict_data_cache.get_term_config(project_id)
        text_stems = ' %s ' % ' '.join(get_stems(text, lowercase=True))
        text_tokens = get_token_list(text, lowercase=True)
        term_usages = []
        for stemmed_term, data in term_stems.items():
            # stem not found in text
            if stemmed_term not in text_stems:
                continue
            # if stem has 1 variant only
            if data['length'] == 1:
                count = text_stems.count(stemmed_term)
                if count:
                    term_data = list(data['values'][0])
                    term_data.append(count)
                    term_usages.append(term_data)
            # case when e.g. the stem "respons" maps to multiple terms:
            # ["response", "responsive", "responsibility"]
            else:
                for term_data in data['values']:
                    term_data = list(term_data)
                    count = text_tokens.count(term_data[0])
                    if count:
                        term_data.append(count)
                        term_usages.append(term_data)
                        # TODO: "responsibilities"

        return ParseResults({
            TermUsage: [
                TermUsage(text_unit_id=text_unit_id, term_id=pk, count=count)
                for _, pk, count in term_usages
            ]
        })
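The core of this method is its two counting strategies: a stem with a single surface form is counted on the stemmed text, while a stem shared by several terms is counted against the exact token list. A hypothetical, self-contained illustration follows (plain dicts stand in for the data returned by dict_data_cache.get_term_config).

# Hypothetical data shaped like the term config used above (stem -> variants with primary keys)
term_stems = {
    'respons': {'length': 2,
                'values': [('response', 10), ('responsibility', 11)]},
    'leas': {'length': 1, 'values': [('lease', 12)]},
}
text_stems = ' the leas assign all respons '
text_tokens = ['the', 'lease', 'assigns', 'all', 'responsibility']
term_usages = []
for stemmed_term, data in term_stems.items():
    if stemmed_term not in text_stems:
        continue
    if data['length'] == 1:
        count = text_stems.count(stemmed_term)   # single variant: count on the stemmed text
        if count:
            term_usages.append(list(data['values'][0]) + [count])
    else:
        for term, pk in data['values']:
            count = text_tokens.count(term)      # multiple variants: count exact tokens
            if count:
                term_usages.append([term, pk, count])
print(term_usages)  # [['responsibility', 11, 1], ['lease', 12, 1]]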
Example #4
    def train_doc2vec_model(self, data) -> gensim.models.doc2vec.Doc2Vec:
        """
        Train doc2vec model from queryset values

        :param data: training data - iterable set of texts
        :return: Doc2Vec trained model
        """
        doc2vec_data = []
        for index, text in enumerate(data):
            if not text:
                continue
            # Get tokens with LexNLP
            text_tokens = get_token_list(text, stopword=True, lowercase=True)
            # Append gensim object
            doc2vec_data.append(
                gensim.models.doc2vec.TaggedDocument(text_tokens, [index]))

        if not doc2vec_data:
            raise RuntimeError(
                'Empty data set, unable to create Doc2Vec model.')

        # Train model
        try:
            doc2vec_model = gensim.models.doc2vec.Doc2Vec(
                doc2vec_data,
                vector_size=self.vector_size,
                window=self.window,
                dm=self.dm,
                min_count=self.min_count,
                workers=1)
            # finished training a model (=no more updates, only querying), reduce memory usage
            doc2vec_model.delete_temporary_training_data(
                keep_doctags_vectors=True, keep_inference=True)
        except Exception as e:
            raise RuntimeError(
                'Bad data set, unable to create Doc2Vec model.') from e

        return doc2vec_model
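A usage sketch; the class holding this method is not shown, so the constructor below is hypothetical and only stands in for an object carrying the vector_size, window, dm and min_count attributes used above.

trainer = Doc2VecTrainer(vector_size=100, window=5, dm=1, min_count=2)  # hypothetical host class
model = trainer.train_doc2vec_model(["first sample document", "second sample document", ""])
vector = model.infer_vector(get_token_list("a new unseen document", stopword=True, lowercase=True))
print(vector.shape)  # (100,)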
Example #5
def normalize_text(text: str,
                   spaces_on_start_end: bool = True,
                   spaces_after_dots: bool = True,
                   lowercase: bool = True,
                   use_stemmer: bool = False,
                   simple_tokenization: bool = False) -> str:
    """
    Normalizes text for substring search operations - extracts tokens, joins them back with spaces,
    adds missing spaces after dots for abbreviations, e.t.c.
    Overall aim of this method is to weaken substring matching conditions by normalizing both the text
    and the substring being searched by the same way removing obsolete differences between them
    (case, punctuation, ...).
    :param text:
    :param spaces_on_start_end:
    :param spaces_after_dots:
    :param lowercase:
    :param simple_tokenization: don't use nltk, just split text by space characters
    :param use_stemmer: Use stemmer instead of tokenizer. When using stemmer all words will be converted to singular
    number (or to some the most plain form) before matching. When using tokenizer - the words are compared as is.
    Using tokenizer should be enough for searches for entities which exist in a single number in the real world -
    geo entities, courts, .... Stemmer is required for searching for some common objects - table, pen, developer, ...
    :return: "normazlied" string
    """
    if use_stemmer:
        tokens = get_stem_list(text, lowercase=lowercase)
    elif simple_tokenization:
        tokens = reg_space.split(text)
        if lowercase:
            tokens = [t.lower() for t in tokens]
    else:
        tokens = get_token_list(text, lowercase=lowercase)
    res = ' '.join(tokens)
    if spaces_on_start_end:
        res = ' ' + res + ' '
    if spaces_after_dots:
        res = res.replace('.', ' . ').replace('  ', ' ')
    return res
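A usage sketch of the intended weakened substring matching; it assumes reg_space (a compiled whitespace regex), get_stem_list and get_token_list are available in the module, as the code above implies.

text_norm = normalize_text("The LESSEE shall maintain the premises.")
term_norm = normalize_text("lessee")
print(term_norm in text_norm)  # whole-token containment check on the normalized forms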
Example #6
def get_persons(text,
                strict=False,
                return_source=False,
                window=2) -> Generator:
    """
    Get names from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))
        companies = list(get_company_annotations(text))

        # Iterate through chunks
        persons = []
        last_person_pos = None

        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if type(chunk) == nltk.tree.Tree:
                # Check label
                if chunk.label() == 'PERSON':
                    if not strict and last_person_pos is not None and (
                            i - last_person_pos) < window:
                        persons[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        persons.append(" ".join([c[0] for c in chunk]))
                    last_person_pos = i
            elif not strict and last_person_pos is not None and (
                    i - last_person_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    persons[-1] += " " + chunk[0]
                    last_person_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    persons[-1] += (" " if chunk[0].lower() in ["&", "and"]
                                    else "") + chunk[0]
                    last_person_pos = i
                else:
                    last_person_pos = None

        # Cleanup and yield
        for person in persons:
            # Cleanup
            person = person.strip()
            if len(person) <= 2:
                continue

            if PERSONS_STOP_WORDS.search(person):
                continue

            person = strip_unicode_punctuation(person).strip(
                string.punctuation).strip(string.whitespace)

            if contains_companies(person, companies):
                continue

            if person.lower().endswith(" and"):
                person = person[0:-4]
            elif person.endswith(" &"):
                person = person[0:-2]

            if return_source:
                yield person, sentence
            else:
                yield person
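A usage sketch; it assumes the standard NLTK data packages (punkt, averaged_perceptron_tagger, maxent_ne_chunker, words) are downloaded and that the LexNLP helpers used above are importable.

text = "This agreement is made between John Smith and Jane Doe."
for person, source in get_persons(text, return_source=True):
    print(person, '|', source)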
Example #7
def get_noun_phrases(text,
                     strict=False,
                     return_source=False,
                     window=3,
                     valid_punctuation=None) -> Generator:
    """
    Get NNP phrases from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    valid_punctuation = valid_punctuation or VALID_PUNCTUATION
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        nnps = []
        last_nnp_pos = None
        for i, chunk in enumerate(sentence_pos):
            do_join = not strict and last_nnp_pos is not None and (
                i - last_nnp_pos) < window
            # Check label
            if chunk[1] in ["NNP", "NNPS"]:
                if do_join:
                    sep = "" if "(" in valid_punctuation and nnps[-1][
                        -1] == "(" else " "
                    nnps[-1] += sep + chunk[0]
                else:
                    nnps.append(chunk[0])
                last_nnp_pos = i
            elif do_join:
                if chunk[1] in ["CC"] or chunk[0] in valid_punctuation:
                    if chunk[0].lower() in ["or"]:
                        continue
                    nnps[-1] += (" " if chunk[0].lower() in ["&", "and", "("]
                                 else "") + chunk[0]
                    last_nnp_pos = i
                else:
                    last_nnp_pos = None

        # Clean up names and yield
        for nnp in nnps:
            # Cleanup
            nnp = nnp.strip()
            if len(nnp) <= 2:
                continue

            if nnp.lower().endswith(" and"):
                nnp = nnp[0:-4].strip()
            elif nnp.endswith(" &"):
                nnp = nnp[0:-2].strip()

            nnp = strip_unicode_punctuation(nnp).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield nnp, sentence
            else:
                yield nnp
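A usage sketch under the same NLTK and LexNLP import assumptions as above.

text = "The Purchase Agreement was signed by Acme Holdings Limited in New York."
for phrase in get_noun_phrases(text, window=3):
    print(phrase)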
Example #8
def get_geopolitical(text,
                     strict=False,
                     return_source=False,
                     window=2) -> Generator:
    """
    Get GPEs from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        gpes = []
        last_gpe_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if type(chunk) == nltk.tree.Tree:
                # Check label
                if chunk.label() == 'GPE':
                    if not strict and last_gpe_pos is not None and (
                            i - last_gpe_pos) < window:
                        gpes[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        gpes.append(" ".join([c[0] for c in chunk]))
                    last_gpe_pos = i
            elif not strict and last_gpe_pos is not None and (
                    i - last_gpe_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    gpes[-1] += " " + chunk[0]
                    last_gpe_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    gpes[-1] += (" " if chunk[0].lower() in ["&", "and"] else
                                 "") + chunk[0]
                    last_gpe_pos = i
                else:
                    last_gpe_pos = None

        # Clean up names and yield
        for gpe in gpes:
            # Cleanup
            gpe = gpe.strip()
            if len(gpe) <= 2:
                continue

            if gpe.lower().endswith(" and"):
                gpe = gpe[0:-4]
            elif gpe.endswith(" &"):
                gpe = gpe[0:-2]

            gpe = strip_unicode_punctuation(gpe).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield gpe, sentence
            else:
                yield gpe
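A usage sketch under the same NLTK and LexNLP import assumptions as above.

text = "Products may be exported from the United States to Canada or Mexico."
for gpe in get_geopolitical(text):
    print(gpe)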
Example #9
def get_organizations(text,
                      strict=False,
                      return_source=False,
                      window=2) -> Generator:
    """
    Get organizations from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        organizations = []
        last_org_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if type(chunk) == nltk.tree.Tree:
                # Check label
                if chunk.label() in ['ORGANIZATION']:
                    if not strict and last_org_pos is not None and (
                            i - last_org_pos) < window:
                        organizations[-1] += " " + " ".join(
                            [c[0] for c in chunk])
                    else:
                        organizations.append(" ".join([c[0] for c in chunk]))
                    last_org_pos = i
            elif not strict and last_org_pos is not None and (
                    i - last_org_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    organizations[-1] += " " + chunk[0]
                    last_org_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    organizations[-1] += (" " if chunk[0].lower()
                                          in ["&", "and"] else "") + chunk[0]
                    last_org_pos = i
                else:
                    last_org_pos = None

        for org in organizations:
            # Cleanup
            org = org.strip()
            if len(org) <= 2:
                continue

            if org.lower().endswith(" and"):
                org = org[0:-4]
            elif org.endswith(" &"):
                org = org[0:-2]

            org = strip_unicode_punctuation(org).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield org, sentence
            else:
                yield org
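A usage sketch under the same NLTK and LexNLP import assumptions as above.

text = "The software is licensed by Acme Corporation to Initech LLC."
for org, source in get_organizations(text, return_source=True):
    print(org)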
Example #10
    header='infer')

dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_clean/"
second_dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_tokenised/"
directory = os.fsencode(dir)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    celex = filename.split(".txt", 1)[0]
    print(celex)
    f = open(dir + filename, "r", encoding='latin1').read()
    h = f.split("\nTitle: ", 1)
    title = h[1].split("\nText: ", 1)[0]
    print(title)
    text = h[1].split("\nText: ", 1)[1]
    tokens = ln.get_token_list(text, stopword=True)
    tokenstring = " ".join(tokens)

    for i in range(len(df)):
        if df.loc[i, 'CelexID'] == celex:

            if df.loc[i + 3, 'CelexID'] == celex:
                classification = df.loc[i, 'Classes']
                classification2 = df.loc[i + 1, 'Classes']
                classification3 = df.loc[i + 2, 'Classes']
                classification4 = df.loc[i + 3, 'Classes']
                docid = df.loc[i, 'DocID']

                with open(second_dir + str(docid) + '.txt',
                          "w",
                          encoding='latin1') as newfile:

    def process_document(cls, document_text: str):
        return [
            t for t in get_token_list(
                document_text, stopword=True, lowercase=True) if t.isalpha()
        ]
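A usage sketch for the method fragment above; its enclosing class is not shown in the snippet, so the wrapper class below is hypothetical, and get_token_list is assumed to come from lexnlp.nlp.en.tokens.

from lexnlp.nlp.en.tokens import get_token_list  # assumed source of the helper

class DocumentTokenizer:  # hypothetical host class for the fragment above
    @classmethod
    def process_document(cls, document_text: str):
        return [
            t for t in get_token_list(
                document_text, stopword=True, lowercase=True) if t.isalpha()
        ]

print(DocumentTokenizer.process_document("The 2 parties hereby agree to the terms."))
# only alphabetic tokens survive, so the digit "2" is dropped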